use last two apps to predict next app in linear model, mu = 0.5

## 1. Reading and exploring data

In [1]:
import pandas as pd
from random import sample, seed

In [2]:
dat6 = pd.read_csv('dat6.csv')

### function calculate accuracy after cross validation

In [3]:
def BN(data):
    
    # prepare train and test datasets
    
    data1 = data.sort_values('startTimeMillis')
    
    data2 = data1.groupby('ids').count()
    data2.reset_index(level = 0, inplace = True)
    
    ses3 = data2.loc[data2.startTime >= 3,:]
    ses3_set = set(ses3.ids)
    data3s = data1.loc[data1.ids.isin(ses3_set)]
    
    list_apps = list(data3s.app_name.unique())
    
    ### Cross validation k = 5
    n = len(ses3_set)
    k=5
    m = n//k
    
    train = []
    test = []
    left = ses3_set.copy()

    for i in range(5):
        if i < 4:
            seed(123)
            tst = set(sample(left, m))
            trn = ses3_set - tst

            train_data = data3s.loc[data3s.ids.isin(trn)]
            test_data = data3s.loc[data3s.ids.isin(tst)]

            train.append(train_data)
            test.append(test_data)

            left -= tst
        else:
            tst = left
            trn = ses3_set - tst

            train_data = data3s.loc[data3s.ids.isin(trn)]
            test_data = data3s.loc[data3s.ids.isin(tst)]

            train.append(train_data)
            test.append(test_data)
        
        
    # list of functions needed
        
    # function count the walks from current app in the list_walks
    def countc(c, a):
        r = 0
        for walks in list_walks:
            n = len(walks)
            for i in range(n-1):           
                if (walks[i] == c) & (walks[i+1] == a):
                    r +=1
        return r

    # function to calculate the conditional probability app a give current app c
    def pc(a, c):
        denominator = sum(countc(c,a) for a in list_apps)
        if denominator == 0:
            return 0.00
        numerator = countc(c,a)
        return round(numerator/denominator, 2)


    # function count the walks from previous app in the list_walks
    def countp(p, a):
        r = 0
        for walks in list_walks:
            n = len(walks)
            for i in range(n-2):           
                if (walks[i] == p) & (walks[i+2] == a):
                    r +=1
        return r

    # function to calculate the conditional probability app a give previous app p
    def pp(a, p):
        denominator = sum(countp(p,a) for a in list_apps)
        if denominator == 0:
            return 0.00
        numerator = countp(p,a)
        return round(numerator/denominator, 2)


    # function calculate score of each app a, mu can be 0 to 1, here default is 0.5
    def score(a,c,p, mu=0.5):
        return mu*pc(a,c) +(1-mu)*pp(a,p)

    def leadto(c,p):
        destination = [(a,score(a,c,p)) for a in list_apps if score(a,c,p)>0]
        if len(destination) == 0:
            return None
        dtdes = pd.DataFrame({'app': [a[0] for a in destination], 'score': [a[1] for a in destination]})
        return dtdes.sort_values('score', ascending = False)

    # function predict the next apps with the highest counting, 1<= napps <=5
    def next_apps(p,c, napps):
        dt = leadto(c,p)
        
        if dt is None:
            return set()   
        if len(dt) == 1:
            result = [dt.iloc[0,0]]
        elif len(dt) == 2:
            result = list(dt.iloc[:2,0])
        elif len(dt) == 3:
            result = list(dt.iloc[:3,0])
        elif len(dt) == 4:
            result = list(dt.iloc[:4,0])
        else:
            result = list(dt.iloc[:5,0])

        ind = min(napps, len(result))
    
        return result[:ind]


    # using the re_dict to check the test set
    def hit_rate_result(re_dict, list_walks_test):
        total, hit = 0, 0

        for walks in list_walks_test:

            total += len(walks) - 2

            for i in range(len(walks) -2): 
                if (walks[i],walks[i+1]) in re_dict:
                    if walks[i+2] in re_dict[(walks[i],walks[i+1])]:
                        hit += 1

        return round(hit/total, 4)
    
        
    total_hit_rate = [0, 0, 0, 0, 0] # to keep the accuracy of BN model to predict number of apps
    
    for i in range(len(train)): 
        
        data_train = train[i]
        ses3_list = list(data_train.ids.unique())

        list_walks = [] # keep the list of walks training
        for s in ses3_list:

            d = data_train.loc[data_train['ids'] == s,]
            list_walks.append(list(d['app_name']))  

    
        # dictionary to keep the result of model training
        re_dict_l = [{(p,c): next_apps(p,c, napps) for c in list_apps for p in list_apps } for napps in range(1,6)]


        data_test = test[i]
        ses_list_test = list(data_test.ids.unique())

        list_walks_test = [] # keep the list of walks testing
        for s in ses_list_test:

            d = data_test.loc[data_test['ids'] == s,]
            list_walks_test.append(list(d['app_name'])) 
        
        for i in range(5):

            total_hit_rate[i] += hit_rate_result(re_dict_l[i], list_walks_test)

    return [round(total_hit_rate[i] /len(train), 4) for i in range(5)]

#### 100 sample

In [4]:
from random import sample, seed
seed(123)
sampl = sample(range(1,2010),100)
sampl.sort()
columns = [ 'BN1', 'BN2', 'BN3', 'BN4', 'BN5']
Result = pd.DataFrame(index = sampl, columns = columns)

import time
start = time.time()

for k, i in enumerate(sampl):
    data = dat6.loc[dat6.newid == i, :]
    
    accuracy = BN(data)
   
    
    for j in range(5):
        Result.iloc[k, j] = accuracy[j]

Result.to_excel('BN_100sample.xlsx')
end = time.time()
print(end - start)


637199.7911667824
