Use the current (most recently used) app, the time of day, and the day type (weekday/weekend) to predict the next app.

### 1. Reading data

In [1]:
import pandas as pd

In [2]:
# Load the usage log; later cells filter it per user via the `newid` column.
dat6 = pd.read_csv(filepath_or_buffer='dat6.csv')

### 2. Functions

In [24]:
def LUT(data, *, n_folds=5, max_apps=5):
    """Cross-validate a lookup-table next-app predictor for one user's log.

    Each row of ``data`` is one app launch.  Rows are grouped into sessions
    by ``ids``; only sessions with at least two launches are used.  For a
    launch of app ``c`` in time bucket ``hour`` on day type ``day``, the
    model ranks candidate next apps ``a`` by::

        round(P(next = a | current = c), 2) * round(P(hour, day | a), 2)

    where both probabilities are relative frequencies in the training
    sessions.  Candidates with a zero score are dropped; ties keep the order
    in which the apps first appear in the time-sorted data.

    Args:
        data: DataFrame with columns ``startTime`` (string whose characters
            11-12 are the hour), ``date`` (numeric; presumably a day index
            -- confirm against the data source), ``startTimeMillis``,
            ``ids`` (session id) and ``app_name``.  Assumes ``app_name`` has
            no missing values -- TODO confirm.
        n_folds: number of cross-validation folds (sessions are split).
        max_apps: largest number of suggestions evaluated.

    Returns:
        A list of ``max_apps`` floats; element ``k-1`` is the hit rate
        (fraction of test transitions whose true next app is among the top
        ``k`` suggestions), averaged over the folds and rounded to 2
        decimals.  A fold without any test transition contributes 0.0.

    Raises:
        ValueError: if ``n_folds`` or ``max_apps`` is smaller than 1.
    """
    if n_folds < 1 or max_apps < 1:
        raise ValueError("n_folds and max_apps must both be at least 1")

    frame = _add_time_features(data).sort_values('startTimeMillis')

    # Keep only sessions with >= 2 launches: a single launch has no "next".
    launches = frame.groupby('ids')['startTime'].count()
    session_ids = list(launches.index[launches >= 2])
    sessions = frame.loc[frame.ids.isin(set(session_ids))]

    # Candidate apps come from all usable sessions (train and test alike).
    list_apps = list(sessions.app_name.unique())

    totals = [0.0] * max_apps
    for held_out in _split_folds(session_ids, n_folds):
        in_test = sessions.ids.isin(held_out)
        lookup = _build_lookup(
            _session_walks(sessions.loc[~in_test]), list_apps, max_apps)
        rates = _hit_rates(
            lookup, _session_walks(sessions.loc[in_test]), max_apps)
        for k, rate in enumerate(rates):
            totals[k] += rate

    return [round(total / n_folds, 2) for total in totals]


def _add_time_features(data):
    """Return a copy of ``data`` with ``startHpoint`` and ``weekday`` added."""
    frame = data.copy()
    frame['startH'] = frame.startTime.str[11:13].astype(int)
    frame.loc[frame['startH'] <= 6, 'startHpoint'] = 'EM'  # early morning
    frame.loc[(frame['startH'] > 6) & (frame['startH'] <= 12),
              'startHpoint'] = 'M'  # morning
    frame.loc[(frame['startH'] > 12) & (frame['startH'] <= 18),
              'startHpoint'] = 'A'  # afternoon
    frame.loc[frame['startH'] > 18, 'startHpoint'] = 'E'  # evening
    frame['day'] = (frame['date'] + 2) % 7
    frame.loc[frame['day'] <= 4, 'weekday'] = 1  # weekday
    frame.loc[frame['day'] > 4, 'weekday'] = 0  # weekend
    return frame


def _split_folds(session_ids, n_folds=5, seed=123):
    """Split session ids into ``n_folds`` disjoint test sets.

    The first ``n_folds - 1`` folds each hold ``len(session_ids) // n_folds``
    ids; the last fold takes whatever is left.
    """
    from random import Random

    # Sample from an ordered list: random.sample() rejects sets on
    # Python 3.11+, and set order would make the folds unpredictable.
    remaining = list(session_ids)
    fold_size = len(remaining) // n_folds
    folds = []
    for _ in range(n_folds - 1):
        # A private generator, re-created per fold, keeps the fixed seed
        # without reseeding the process-wide random module.
        held_out = set(Random(seed).sample(remaining, fold_size))
        remaining = [sid for sid in remaining if sid not in held_out]
        folds.append(held_out)
    folds.append(set(remaining))
    return folds


def _session_walks(frame):
    """Return one list of ``(app, hour bucket, weekday)`` steps per session."""
    return [
        list(zip(group['app_name'], group['startHpoint'], group['weekday']))
        for _, group in frame.groupby('ids', sort=False)
    ]


def _build_lookup(train_walks, list_apps, max_apps=5):
    """Map every ``(app, hour bucket, weekday)`` key to its ranked next apps."""
    from collections import Counter

    transitions = Counter()  # (current app, next app) -> count
    outgoing = Counter()     # current app -> transitions leaving it
    visits = Counter()       # app -> number of launches
    contexts = Counter()     # (app, hour bucket, weekday) -> launches
    for walk in train_walks:
        for step in walk:
            visits[step[0]] += 1
            contexts[step] += 1
        for current, following in zip(walk, walk[1:]):
            transitions[(current[0], following[0])] += 1
            outgoing[current[0]] += 1

    lookup = {}
    for current in list_apps:
        leaving = outgoing[current]
        # Apps seen right after `current`, with the rounded P(next | current).
        followers = []
        if leaving:
            for app in list_apps:
                p_next = round(transitions[(current, app)] / leaving, 2)
                if p_next > 0:
                    followers.append((app, p_next))
        for hour in ('EM', 'M', 'A', 'E'):
            for day in (0, 1):
                scored = []
                for app, p_next in followers:
                    # Every follower was launched at least once in training,
                    # so visits[app] is never zero here.
                    p_context = round(
                        contexts[(app, hour, day)] / visits[app], 2)
                    score = p_next * p_context
                    if score > 0:
                        scored.append((app, score))
                # list.sort is stable, so equal scores keep list_apps order.
                scored.sort(key=lambda pair: pair[1], reverse=True)
                lookup[(current, hour, day)] = [
                    app for app, _ in scored[:max_apps]]
    return lookup


def _hit_rates(lookup, test_walks, max_apps=5):
    """Return the top-1 .. top-``max_apps`` hit rates on the test walks."""
    hits = [0] * max_apps
    total = 0
    for walk in test_walks:
        total += len(walk) - 1
        for current, following in zip(walk, walk[1:]):
            ranked = lookup.get(current, ())
            if following[0] in ranked:
                # A hit at rank r counts for every top-k list with k > r.
                for k in range(ranked.index(following[0]), max_apps):
                    hits[k] += 1
    if total == 0:
        # Empty test fold (fewer sessions than folds): nothing to score.
        return [0.0] * max_apps
    return [round(hit / total, 4) for hit in hits]


#### Running with 2009 users

In [None]:
# Evaluate the lookup-table model for each user id 1..2009, store the five
# hit rates (top-1 .. top-5) per user, export them to Excel, and print the
# elapsed wall-clock time in seconds.
import time

columns = ['LUT1', 'LUT2', 'LUT3', 'LUT4', 'LUT5']
Result2009 = pd.DataFrame(columns=columns, index=range(1, 2010))

start = time.time()

for i in range(1, 2010):
    # All log rows that belong to user i.
    data = dat6.loc[dat6.newid == i, :]
    accuracy = LUT(data)
    # Row i-1 holds user i; one column per top-k hit rate.
    for j in range(len(columns)):
        Result2009.iloc[i - 1, j] = accuracy[j]

Result2009.to_excel('LUT_2009.xlsx')
end = time.time()
print(end - start)