In [1]:
import datetime
import matplotlib
import numpy as np
import pandas as pd
import pyreadr

import os

In [2]:
# Column names shared by the cells below. SESSION_KEY and ITEM_KEY are used by
# extract_ground_truth to index `test_data`.
# NOTE(review): USER_KEY and TIME_KEY are not referenced in the visible cells --
# presumably they name the user and timestamp columns of the same frame; confirm.
USER_KEY = 'userID'
ITEM_KEY = 'appID'
TIME_KEY = 'timestamp'
SESSION_KEY = 'sessionID'

In [5]:
# Directory holding the evaluation output of the dataset variant under study.
# The other variants are kept here so they can be switched in by hand:
# path_results = '../results/all/'
# path_results = '../results/drop_on/'
path_results = '../results/drop_on-drop_off/'

# Test split of the preprocessed events; the file chosen here has to belong to
# the same dataset variant as path_results above.
path_preprocessed = '../../data/preprocessed/'
test_data = pd.read_hdf(path_preprocessed + 'events-1-drop_on-drop_off.hdf', key='test')

In [6]:
# Load every metrics file in path_results (all files without 'Saver' in the name)
# and stack them into one frame with one row per model.
results_files = [f for f in os.listdir(path_results) if 'Saver' not in f]
frames = []
for file in results_files:
    df = pd.read_csv(path_results + file, sep=';')
    # TBD: extract window number from file name, create new column
    df = df.drop(['Metrics', 'Saver@50: '], axis=1)
    # drop 'Unnamed: 24' column containing only NaNs
    df = df.drop(df.filter(regex='Unnamed').columns, axis=1)
    # remove colon and whitespace from all column names (the trailing two characters)
    df = df.rename(columns=lambda x: str(x)[:-2])
    # the model name is whatever sits between the second and the last '_' of the file name
    df.insert(0, 'model', "_".join(file.split('_')[2:-1]))
    frames.append(df)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each
# iteration; concatenate once instead. The per-file index is kept, as before.
results = pd.concat(frames) if frames else pd.DataFrame()

In [7]:
results

Unnamed: 0,model,MAP@1,MAP@5,MAP@10,MAP@20,Precision@1,Precision@5,Precision@10,Precision@20,Recall@1,...,HitRate@1,HitRate@5,HitRate@10,HitRate@20,MRR@1,MRR@5,MRR@10,MRR@20,Coverage@20,Popularity@20
0,gru4rec,0.0,0.054402,0.036659,0.023861,0.286325,0.163533,0.103704,0.065812,0.087556,...,0.192308,0.490028,0.579772,0.683761,0.192308,0.301401,0.313027,0.320133,0.853503,0.033256
0,hgru4rec,0.0,0.046794,0.031464,0.020265,0.448718,0.150712,0.097009,0.057336,0.150209,...,0.290598,0.413105,0.5,0.554131,0.290598,0.335708,0.346614,0.350346,0.871019,0.021235
0,sr_B,0.0,0.12109,0.081158,0.044985,0.618234,0.353561,0.216382,0.116453,0.193634,...,0.378917,0.74359,0.839031,0.896011,0.378917,0.516809,0.529859,0.534167,0.484076,0.134468
0,sr,0.0,0.11014,0.074542,0.044245,0.559829,0.325356,0.203561,0.116453,0.176299,...,0.350427,0.702279,0.799145,0.896011,0.350427,0.479463,0.49321,0.499802,0.484076,0.134468
0,stan,0.0,0.121739,0.080387,0.045405,0.660969,0.357835,0.21396,0.117806,0.175622,...,0.253561,0.74359,0.836182,0.887464,0.253561,0.452493,0.465163,0.468906,0.458599,0.137246
0,vsknn_EB,0.0,0.111404,0.075869,0.044452,0.65812,0.335328,0.208974,0.118519,0.186177,...,0.25641,0.695157,0.786325,0.856125,0.25641,0.432526,0.444706,0.449593,0.294586,0.140458
0,vsknn,0.0,0.068514,0.045129,0.02634,0.437322,0.243875,0.14416,0.082835,0.101896,...,0.160969,0.424501,0.48433,0.538462,0.160969,0.266453,0.274543,0.278398,0.550955,0.041062
0,vstan,0.0,0.11923,0.073747,0.042088,0.696581,0.34359,0.196866,0.108191,0.203976,...,0.280627,0.723647,0.801994,0.851852,0.280627,0.461728,0.472535,0.47602,0.570064,0.104548


### extracting predictions

In [8]:
# Build 1-based integer ids for apps and users from the raw data, plus the
# inverse lookups (id -> original name). Ids follow the sorted order of the names.
data_sa = pd.read_csv('../../data/data_raw.csv')

app_mapping = {
    name: number
    for number, name in enumerate(sorted(set(data_sa['app_name'])), start=1)
}
app_mapping_reverse = {number: name for name, number in app_mapping.items()}
user_mapping = {
    name: number
    for number, name in enumerate(sorted(set(data_sa['userId'])), start=1)
}
user_mapping_reverse = {number: name for name, number in user_mapping.items()}

In [16]:
data_sa.head(50)

Unnamed: 0,userId,timestamp,date,activity,category,sequence_number,app_name,sessionID
0,01ADD53B,1511423000.0,2017-11-23 07:44:19.952000000,SCREEN,ON_LOCKED,1.0,ON_LOCKED,1
1,01ADD53B,1511423000.0,2017-11-23 07:44:20.289999872,SCREEN,OFF_LOCKED,1.0,OFF_LOCKED,1
2,01ADD53B,1511424000.0,2017-11-23 08:02:01.137000192,SCREEN,ON_LOCKED,2.0,ON_LOCKED,2
3,01ADD53B,1511424000.0,2017-11-23 08:02:13.938999808,SCREEN,OFF_LOCKED,2.0,OFF_LOCKED,2
4,01ADD53B,1511424000.0,2017-11-23 08:02:51.140999936,SCREEN,ON_LOCKED,3.0,ON_LOCKED,3
5,01ADD53B,1511424000.0,2017-11-23 08:02:53.560000000,SMS,INBOX,3.0,INBOX,3
6,01ADD53B,1511424000.0,2017-11-23 08:04:01.191000064,SCREEN,ON_UNLOCKED,3.0,ON_UNLOCKED,3
7,01ADD53B,1511424000.0,2017-11-23 08:05:35.924999936,SCREEN,OFF_LOCKED,3.0,OFF_LOCKED,3
8,01ADD53B,1511426000.0,2017-11-23 08:28:01.295000064,SCREEN,ON_LOCKED,4.0,ON_LOCKED,4
9,01ADD53B,1511426000.0,2017-11-23 08:28:04.204999936,APPS,Settings,4.0,Settings_2,4


In [15]:
user_mapping_reverse[1]

'01ADD53B'

In [9]:
# extract ground truth from test data (test_data) for a single item (position) in a single session (sessionID)
def extract_ground_truth(sessionID, position, test_data):
    """Return the item that follows `position` inside session `sessionID`.

    Parameters
    ----------
    sessionID : value compared against the SESSION_KEY column of `test_data`.
    position : int
        Zero-based offset into the session's rows (in the row order of
        `test_data`); the ground truth is the item of the row at `position + 1`.
    test_data : pandas.DataFrame
        Must contain the SESSION_KEY and ITEM_KEY columns.

    Returns
    -------
    The ITEM_KEY value of the row at `position + 1` of the session.

    Raises
    ------
    IndexError
        If the session has no row at `position + 1`.
    """
    relevant_df = test_data[test_data[SESSION_KEY] == sessionID]
    # Select purely by position. Going through the index label (as a
    # label-based lookup does) yields a Series instead of a single value
    # whenever the frame's index contains duplicate labels.
    return relevant_df[ITEM_KEY].iloc[position + 1]

In [10]:
# Gather the per-model recommendation dumps (files with 'Saver' in the name) into
# one wide frame: shared session / position / ground-truth columns first, then a
# 'recs-', 'recs_names-' and 'scores-' column for every model.
predictions_files = [f for f in os.listdir(path_results) if 'Saver' in f]
predictions = pd.DataFrame()
for file in predictions_files:
    model = "_".join(file.split('_')[2:-1])
    df = pd.read_csv(path_results + file, sep=';')

    # The shared columns are filled exactly once, from whichever file is read first.
    if 'sessionID' not in predictions.columns:
        predictions['sessionID'] = df['SessionId']
        predictions['position'] = df['Position']
        predictions['ground_truth'] = predictions.apply(
            lambda row: extract_ground_truth(row['sessionID'], row['position'], test_data),
            axis=1,
        )
        predictions['ground_truth_name'] = predictions['ground_truth'].apply(
            lambda item: app_mapping_reverse[item]
        )

    recs_col = 'recs-' + model
    # comma-separated id string -> list of int app ids
    predictions[recs_col] = df['Recommendations'].apply(
        lambda raw: [int(token) for token in raw.split(',')]
    )
    # the same lists translated back to app names
    predictions['recs_names-' + model] = predictions[recs_col].apply(
        lambda ids: [app_mapping_reverse[app_id] for app_id in ids]
    )
    # scores are split but left as strings
    predictions['scores-' + model] = df['Scores'].apply(lambda raw: raw.split(','))

In [11]:
def print_predictions(predictions, sessionID, num_recs, models, positions=None):
    """Collect the top recommendations of several models for one session.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns 'sessionID', 'position' and
        'ground_truth_name', plus one 'recs_names-<model>' column (a list of
        names per row) for every entry of `models`.
    sessionID : value matched against the 'sessionID' column.
    num_recs : int
        Number of leading recommendations to keep per model.
    models : iterable of str
        Model names; each becomes a column of the per-position table.
    positions : iterable, optional
        Positions of the session to report. Defaults to every position that
        `predictions` holds for `sessionID`, so the function no longer relies
        on a global `positions` variable being defined by another cell.

    Returns
    -------
    dict
        Maps '<sessionID>_<position>' to a tuple
        (sessionID, position, ground-truth name as string, DataFrame with one
        column per model and `num_recs` rows).

    Raises
    ------
    IndexError
        If a requested position has no row for the session, or a model has
        fewer than `num_recs` recommendations.
    """
    if positions is None:
        positions = predictions.position[predictions.sessionID == sessionID]

    predictions_dict = {}
    for pos in positions:
        row = predictions[(predictions.sessionID == sessionID) & (predictions.position == pos)]
        ground_truth = row.ground_truth_name.to_string(index=False)
        df = pd.DataFrame()
        for model in models:
            # recommendation list of the (first) matching row
            recs = row['recs_names-' + model].tolist()[0]
            df[model] = [recs[i] for i in range(num_recs)]
        name = str(sessionID) + '_' + str(pos)
        predictions_dict[name] = (sessionID, pos, ground_truth, df)
    return predictions_dict

In [13]:
# Parameters for inspecting one session: the session of the row labelled 0,
# its positions, the number of recommendations to show and the models to compare.
num_recs = 5
models = results['model'].tolist()
sessionID = predictions['sessionID'][0]
positions = predictions['position'][predictions['sessionID'] == sessionID]

In [14]:
print_predictions(predictions, sessionID, num_recs, models)

{'2036_0': (2036,
  0,
  'Messaging_1',
         gru4rec        hgru4rec                sr_B                 sr  \
  0  Launcher_32    System_App_2         Messaging_1        Messaging_1   
  1   Launcher_4    System_App_6          Launcher_1         Launcher_1   
  2  Messaging_8  Organisation_3  Internet_Browser_1        Launcher_29   
  3  Messaging_1     Messaging_8   Social_Networks_2        Launcher_22   
  4   Settings_7    Messaging_12         Launcher_29  Social_Networks_1   
  
                  stan            vsknn_EB               vsknn             vstan  
  0        Messaging_1         Messaging_1          Launcher_4       Messaging_1  
  1         Launcher_1          Launcher_1         Launcher_40    Orientation_45  
  2        Launcher_29  Internet_Browser_1   Social_Networks_1  Calendar_Apps_25  
  3        Launcher_22         Launcher_36         Launcher_14  News_Magazines_9  
  4  Social_Networks_3   Social_Networks_1  Internet_Browser_1        Gallery_25  )}

In [213]:
# Disabled check: fraction of rows whose first 'recs-sr_B' entry equals the ground truth.
# NOTE(review): the stored output below presumably comes from a run when this line was active -- confirm or clear it.
# predictions.apply(lambda x: x['recs-sr_B'][0] == x['ground_truth'], axis=1).sum()/len(predictions)

0.2569832402234637