## 1. Reading data

In [1]:
import pandas as pd

In [2]:
dat6 = pd.read_csv('dat6.csv')

In [9]:
# an example data of a user (id = 2)
i = 2
data = dat6.loc[dat6.newid == i, :]

In [6]:
def MFU(data):
    
    from random import sample, seed
    
    # 1. Prepare train and test datasets
    
    data1 = data.sort_values('startTimeMillis')
    data1.date = data1.date.astype(int)
    
    data2 = data1.groupby('ids').count()
    data2.reset_index(level = 0, inplace = True)
    
    ses2 = data2.loc[data2.startTime > 1,:]
    
    ses2_list = list(ses2.ids.unique())
    
    data2s = data1.loc[data1.ids.isin(ses2_list)]
    
    list_apps = list(data2s.app_name.unique())
    
    ### Cross validation k = 5
    
    n = len(data2s.ids.unique()) # number of sessions
    k = 5
    m = n//k
    set_sessions = set(ses2_list)
    
    train = []
    test = []
    left = set_sessions.copy()
    
    for i in range(5):
        if i < 4:
            seed(123)
            tst = set(sample(left, m))
            trn = set_sessions - tst

            train_data = data2s.loc[data2s.ids.isin(trn)]
            test_data = data2s.loc[data2s.ids.isin(tst)]

            train.append(train_data)
            test.append(test_data)

            left -= tst
        else:
            tst = left
            trn = set_sessions - tst

            train_data = data2s.loc[data2s.ids.isin(trn)]
            test_data = data2s.loc[data2s.ids.isin(tst)]

            train.append(train_data)
            test.append(test_data)

            
    # 2. Functions needed 
    
    # function count the walks from this to that in the list_walks
    def count_walks(this, that, list_walks):
        r = 0
        for walks in list_walks:
            n = len(walks)
            for i in range(n-1):           
                if (walks[i] == this) & (walks[i+1] == that):
                    r +=1
        return r


    # function predict the next app with the highest counting
    def next_apps(data_train, napps):

        dt = data_train.groupby('app_name').count().sort_values('ids', ascending=False)
        dt.reset_index(level = 0, inplace = True)
        
        if len(dt) == 0:
            return []   
        elif len(dt) == 1:
            result = [dt.iloc[0,0]]
        elif len(dt) == 2:
            result = list(dt.iloc[:2,0])
        elif len(dt) == 3:
            result = list(dt.iloc[:3,0])
        elif len(dt) == 4:
            result = list(dt.iloc[:4,0])
        else:
            result = list(dt.iloc[:5,0])

        ind = min(napps, len(result))

        return result[:ind]

    def hit_rate_result(data_test, next_app_mostFrq):
        ses_list_test = list(data_test.ids.unique())

        list_walks_test = [] # keep the list of walks 
        for s in ses_list_test:

            d = data_test.loc[data_test['ids'] == s,]
            list_walks_test.append(list(d['app_name'])) 

        total, hit = 0, 0

        for walks in list_walks_test:      
            total += len(walks) - 1     
            for walk in walks[1:]: 
                if walk in next_app_mostFrq:
                    hit += 1

        return round(hit/total, 4)

    
    # 3. Prediction Accuracy
    
    total_hit_rate = [0, 0, 0, 0, 0] # to keep the accuracy of LU1 model to predict number of apps
    
    for i in range(len(train)): 
        
        data_train = train[i]
        ses2_list = list(data_train.ids.unique())

        list_walks = [] # keep the list of walks training
        for s in ses2_list:

            d = data_train.loc[data_train['ids'] == s,]
            list_walks.append(list(d['app_name']))  

        next_app_mostFrq = [next_apps(data_train, napps) for napps in range(1,6)]

        data_test = test[i]
        
        for z in range(5):
            
            total_hit_rate[z] += hit_rate_result(data_test, next_app_mostFrq[z])
    
    return [round(total_hit_rate[i] /len(train), 4) for i in range(5)]

### Test with id = 2

In [10]:
import time
start = time.time() 

accuracy = MFU(data)

end = time.time() 
print(end - start)

2.6533010005950928


In [11]:
accuracy

[0.3776, 0.5153, 0.6536, 0.6946, 0.7107]

### 3. Run and Save result

In [None]:
import time
start = time.time() 

columns = [ 'MFU1', 'MFU2', 'MFU3', 'MFU4', 'MFU5']
Result = pd.DataFrame(index = range(1,2010), columns = columns)

import time
start = time.time()

for i in range(1,2010):
    data = dat6.loc[dat6.newid == i, :]
    
    accuracy = MFU(data)
     
    for j in range(5):
        Result.iloc[i-1, j] = accuracy[j]

Result.to_excel('MFU_2009.xlsx')

end = time.time() 
print(end - start) 