### 1. Reading data

In [1]:
import pandas as pd

In [2]:
dat6 = pd.read_csv('dat6.csv')

In [14]:
# an example data of a user (id = 1)
i = 1
data = dat6.loc[dat6.newid == i, :]

### 2. Function calculate accuracy after cross validation

In [11]:
def LU2(data):
    
    from random import sample, seed
    
    # 1. Prepare train and test datasets
    
    data1 = data.sort_values('startTimeMillis')
    data1.date = data1.date.astype(int)
    
    data2 = data1.groupby('ids').count()
    data2.reset_index(level = 0, inplace = True)
    
    ses3 = data2.loc[data2.startTime >= 3,:]
    ses3_set = set(ses3.ids)
    data3s = data1.loc[data1.ids.isin(ses3_set)]
    
    list_apps = list(data3s.app_name.unique())
    
    ### Cross validation k = 5
    n = len(ses3_set)
    k=5
    m = n//k
    
    train = []
    test = []
    left = ses3_set.copy()

    for i in range(5):
        if i < 4:
            seed(123)
            tst = set(sample(left, m))
            trn = ses3_set - tst

            train_data = data3s.loc[data3s.ids.isin(trn)]
            test_data = data3s.loc[data3s.ids.isin(tst)]

            train.append(train_data)
            test.append(test_data)

            left -= tst
        else:
            tst = left
            trn = ses3_set - tst

            train_data = data3s.loc[data3s.ids.isin(trn)]
            test_data = data3s.loc[data3s.ids.isin(tst)]

            train.append(train_data)
            test.append(test_data)
        
    # 2. Functions needed 
    
    # function count the walks from a, b to c in the list_walks
    def count_walks(a, b, c, list_walks):
        r = 0
        for walks in list_walks:
            n = len(walks)
            for i in range(n-2):           
                if (walks[i] == a) & (walks[i+1] == b) & (walks[i+2] == c):
                    r +=1
        return r


    # function predict the next app with the highest counting
    def next_apps(a, b, list_apps, list_walks, napps):

        total_walks = [count_walks(a, b, c, list_walks) for c in list_apps]

        dt = pd.DataFrame(
        {'next_app': list_apps,
         'total_walks': total_walks
         })
        dt = dt.sort_values('total_walks', ascending=False)

        if len(dt) == 0:
            return []   
        elif len(dt) == 1:
            result = [dt.iloc[0,0]]
        elif len(dt) == 2:
            result = list(dt.iloc[:2,0])
        elif len(dt) == 3:
            result = list(dt.iloc[:3,0])
        elif len(dt) == 4:
            result = list(dt.iloc[:4,0])
        else:
            result = list(dt.iloc[:5,0])

        ind = min(napps, len(result))

        return result[:ind]


    # using the re_dict to check the test set
    def hit_rate_result(re_dict, list_walks_test):
        total, hit = 0, 0

        for walks in list_walks_test:

            total += len(walks) - 2

            for i in range(len(walks) -2): 
                if (walks[i],walks[i+1]) in re_dict:
                    if walks[i+2] in re_dict[(walks[i],walks[i+1])]:
                        hit += 1

        return round(hit/total, 4)


    # 3. Prediction Accuracy

    total_hit_rate = [0, 0, 0, 0, 0] # to keep the accuracy of LU1 model to predict number of apps
    
    for i in range(len(train)): 
        
        data_train = train[i]
        ses3_list = list(data_train.ids.unique())

        list_walks = [] # keep the list of walks training
        for s in ses3_list:

            d = data_train.loc[data_train['ids'] == s,]
            list_walks.append(list(d['app_name']))  

            
            # dictionary to keep the result of model training
        re_dict_l = [{(a,b): next_apps(a,b, list_apps, list_walks, napps) for a in list_apps for b in list_apps }
                     for napps in range(1,6)]


        data_test = test[i]
        ses_list_test = list(data_test.ids.unique())

        list_walks_test = [] # keep the list of walks testing
        for s in ses_list_test:

            d = data_test.loc[data_test['ids'] == s,]
            list_walks_test.append(list(d['app_name']))   
                
        for z in range(5):
            total_hit_rate[z] += hit_rate_result(re_dict_l[z], list_walks_test)

            
    return [round(total_hit_rate[i] /len(train), 4) for i in range(5)]

### Test with id = 1

In [15]:
import time
start = time.time() 

accuracy = LU2(data)

end = time.time() 
print(end - start) 

187.91833806037903


In [16]:
accuracy

[0.7519, 0.8823, 0.9471, 0.9634, 0.9681]

### 3. Run and save result

In [None]:
import time
start = time.time() 

columns = [ 'LU2_1', 'LU2_2', 'LU2_3', 'LU2_4', 'LU2_5']
Result = pd.DataFrame(index = range(1,2010), columns = columns)

import time
start = time.time()

for i in range(1,2010):
    data = dat6.loc[dat6.newid == i, :]
    
    accuracy = LU2(data)
     
    for j in range(5):
        Result.iloc[i-1, j] = accuracy[j]

Result.to_excel('LU2_2009.xlsx')

end = time.time() 
print(end - start) 