In [22]:
import datetime
import matplotlib
import numpy as np
import pandas as pd
import pyreadr

import os

In [23]:
# Column-name constants for the event data.
# Only ITEM_KEY and SESSION_KEY are referenced by the cells shown in this notebook.
USER_KEY = "userID"
TIME_KEY = "timestamp"
ITEM_KEY = "usID"  # column the ground-truth item is read from
SESSION_KEY = "sentenceID"  # column used to select the rows of one session

In [69]:
# Directory whose files (metric tables and 'Saver' prediction dumps) are analysed below.
# Alternative experiments, kept for quick switching:
# path_results = '../results/drop_on/untuned/'
# path_results = '../results/drop_on-drop_off/untuned/'
path_results = '../results/sequence-level/single/drop_seq_onoff_all/untuned/'

# Test split of the event data -- make sure this file belongs to the same experiment as path_results.
test_data_file = '../../data/sequence-level/single/' + 'events-seq_drop_onoff_all.hdf'
test_data = pd.read_hdf(test_data_file, key='test')

In [70]:
# Metric files are all files in path_results whose name does not contain 'Saver'.
# sorted() makes the model order independent of the file system's listing order.
results_files = sorted(f for f in os.listdir(path_results) if 'Saver' not in f)

result_frames = []
for file in results_files:
    # TBD: extract window number from file name, create new column
    df = pd.read_csv(os.path.join(path_results, file), sep=';')
    df = df.drop(columns=['Metrics', 'Saver@50: '])
    # drop 'Unnamed: 24' column containing only NaNs
    df = df.drop(columns=df.filter(regex='Unnamed').columns)
    # remove the trailing colon and whitespace from the column names; names without that
    # suffix are left untouched instead of losing their last two characters
    df = df.rename(columns=lambda x: str(x)[:-2] if str(x).endswith(': ') else str(x))
    # the model name is the part of the file name between the second and the last '_'
    df.insert(0, 'model', "_".join(file.split('_')[2:-1]))
    result_frames.append(df)

# DataFrame.append was removed in pandas 2.0: collect the per-file frames and concatenate once.
# The per-file row index is kept (no ignore_index), as the original append did.
results = pd.concat(result_frames) if result_frames else pd.DataFrame()

In [71]:
results

Unnamed: 0,model,MAP@1,MAP@5,MAP@10,MAP@20,Precision@1,Precision@5,Precision@10,Precision@20,Recall@1,...,HitRate@1,HitRate@5,HitRate@10,HitRate@20,MRR@1,MRR@5,MRR@10,MRR@20,Coverage@20,Popularity@20
0,ar,0.0,0.04251,0.048198,0.040999,0.673187,0.492854,0.380998,0.256228,0.049665,...,0.199468,0.442848,0.55662,0.640985,0.199468,0.290044,0.305256,0.311093,0.209408,0.254571
0,gru4rec_Reminder,0.0,0.030177,0.026259,0.019471,0.604391,0.34004,0.236327,0.150559,0.049141,...,0.242315,0.44684,0.514305,0.579242,0.242315,0.318858,0.328182,0.332805,0.645524,0.085802
0,gru4rec,0.0,0.016078,0.012596,0.009472,0.437126,0.197977,0.134185,0.08976,0.034864,...,0.180971,0.317764,0.385496,0.451364,0.180971,0.230659,0.239787,0.24444,0.957815,0.040294
0,hgru4rec,0.0,0.008647,0.006637,0.004828,0.288224,0.116327,0.078164,0.052402,0.023474,...,0.118563,0.193746,0.24684,0.304458,0.118563,0.145509,0.152504,0.156411,0.834901,0.018805
0,shan,0.0,0.033481,0.038984,0.034781,0.506853,0.426241,0.338736,0.231317,0.035195,...,0.084498,0.316168,0.443779,0.554624,0.084498,0.168973,0.185997,0.193778,0.021548,0.219715
0,sknn,0.0,0.041143,0.048763,0.042232,0.607319,0.48322,0.383606,0.258596,0.041398,...,0.151564,0.415037,0.534664,0.626347,0.151564,0.248337,0.264442,0.270941,0.23915,0.220649
0,sr_BR,0.0,0.043797,0.043158,0.036842,0.675449,0.47827,0.350752,0.241351,0.052668,...,0.253626,0.484897,0.55662,0.63686,0.253626,0.342191,0.351794,0.357297,0.467678,0.177414
0,sr,0.0,0.041139,0.044458,0.037711,0.647372,0.464857,0.356846,0.240852,0.05194,...,0.254824,0.495941,0.582701,0.664138,0.254824,0.344624,0.356168,0.361811,0.648558,0.230743
0,stan_ER,0.0,0.045085,0.047039,0.039358,0.659747,0.499774,0.373666,0.251457,0.04869,...,0.200798,0.452428,0.549568,0.627279,0.200798,0.293626,0.306638,0.312042,0.438847,0.180978
0,stan,0.0,0.03937,0.045066,0.041294,0.632069,0.46177,0.369182,0.257332,0.045875,...,0.190951,0.440585,0.553826,0.642582,0.190951,0.28222,0.297514,0.303878,0.384825,0.234746


### extracting predictions

In [72]:
# Build the id <-> name lookup tables for the values of the 'category_list' column:
# every distinct value gets a 1-based integer id, assigned in sorted order.
data = pd.read_csv('../../data/sequence-level/data_seq.csv')

categories = sorted(set(data['category_list']))
mapping = {category: idx for idx, category in enumerate(categories, start=1)}
mapping_reverse = {idx: category for category, idx in mapping.items()}

In [73]:
def extract_ground_truth(ID, position, test_data):
    """Return the item that follows `position` within session `ID` of `test_data`.

    Parameters
    ----------
    ID
        Value compared against the SESSION_KEY column to select the session's rows.
    position
        Zero-based position inside the session; the ground truth is the row at
        `position + 1`. Cast to int so float-typed positions are accepted.
    test_data : pandas.DataFrame
        Frame containing the SESSION_KEY and ITEM_KEY columns.
        NOTE(review): presumably the rows of a session are stored in event order -- confirm.

    Returns
    -------
    The ITEM_KEY value of the selected row.

    Raises
    ------
    IndexError
        If the session has no row at `position + 1`.
    """
    session_items = test_data.loc[test_data[SESSION_KEY] == ID, ITEM_KEY]
    # Select by position directly: looking the row up again by its index label would
    # return several rows whenever the frame's index contains duplicate labels.
    return session_items.iloc[int(position) + 1]

In [74]:
# Per-item prediction dumps are the files in path_results whose name contains 'Saver'.
# sorted() keeps the column order, and the file that seeds the shared columns, deterministic
# instead of depending on the file system's listing order.
predictions_files = sorted(f for f in os.listdir(path_results) if 'Saver' in f)
predictions = pd.DataFrame()
for file in predictions_files:
    # the model name is the part of the file name between the second and the last '_'
    model = "_".join(file.split('_')[2:-1])
    df = pd.read_csv(os.path.join(path_results, file), sep=';')
    # The first file read provides the shared session / position / ground-truth columns.
    # NOTE(review): the columns of later files are aligned on the row index, which presumably
    # relies on every file listing the same (session, position) pairs in the same order -- confirm.
    if 'sessionID' not in predictions.columns:
        predictions['sessionID'] = df['SessionId']
    if 'position' not in predictions.columns:
        predictions['position'] = df['Position']
    if 'ground_truth' not in predictions.columns:
        predictions['ground_truth'] = predictions.apply(lambda x: extract_ground_truth(x['sessionID'], x['position'], test_data), axis=1)
        predictions['ground_truth_name'] = predictions['ground_truth'].apply(lambda x: mapping_reverse[x])
    # recommended item ids (ints), their names, and the matching scores for this model
    predictions['recs-' + model] = df['Recommendations'].apply(lambda x: [int(i) for i in x.split(',')])
    predictions['recs_names-' + model] = predictions['recs-' + model].apply(lambda x: [mapping_reverse[i] for i in x])
    # NOTE(review): scores are kept as the raw strings from the file, not converted to float
    predictions['scores-' + model] = df['Scores'].apply(lambda x: x.split(','))

In [75]:
def print_predictions(predictions, sessionID, num_recs, models):
    """Collect the top recommendations of every model for each position of one session.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns 'sessionID', 'position' and 'ground_truth_name',
        plus one 'recs_names-<model>' column (a list per row) for each entry of `models`.
    sessionID
        Session to report on.
    num_recs : int
        Number of leading recommendations to keep per model.
    models : iterable of str
        Model names selecting the 'recs_names-<model>' columns.

    Returns
    -------
    dict
        Maps '<sessionID>_<position>' to a tuple
        (sessionID, position, ground-truth name, DataFrame with one column per model).
    """
    # Take the positions from the requested session itself rather than from a notebook-level
    # `positions` variable, which may be undefined or belong to another session.
    session_rows = predictions[predictions.sessionID == sessionID]
    predictions_dict = {}
    for pos in session_rows.position:
        row = session_rows[session_rows.position == pos]
        # .iloc[0] yields the stored value; Series.to_string() abbreviates long strings with '...'
        ground_truth = row.ground_truth_name.iloc[0]
        top_recs = pd.DataFrame(
            {model: row['recs_names-' + model].iloc[0][:num_recs] for model in models}
        )
        name = str(sessionID) + '_' + str(pos)
        predictions_dict[name] = (sessionID, pos, ground_truth, top_recs)
    return predictions_dict

In [76]:
# Example inputs for inspecting the recommendations of a single session.
sessionID = predictions['sessionID'].unique()[0]  # first session id found in the predictions
num_recs = 5  # recommendations to show per model
positions = predictions.loc[predictions['sessionID'] == sessionID, 'position']  # positions of that session
models = results['model'].tolist()  # model names taken from the metrics table

In [77]:
# Show, for the selected session, the first num_recs recommendations of each model next to the ground truth.
print_predictions(predictions, sessionID, num_recs, models)

{'27_0': (27,
  0,
  'ON,Launcher,Orientation,Launcher,Financial,Laun...',
                                                    ar  \
  0  ON,Internet_Browser,ON,Launcher,Internet_Brows...   
  1  ON,Launcher,Messaging,Launcher,TVVideo_Apps,La...   
  2  ON,Launcher,Social_Networks,Launcher,Settings,...   
  3  ON,Launcher,Orientation,Launcher,Financial,Lau...   
  4  ON,Launcher,ON,Messaging,Timer_Clocks,Messagin...   
  
                                      gru4rec_Reminder  \
  0  ON,Internet_Browser,ON,Launcher,Internet_Brows...   
  1  ON,Launcher,Messaging,Launcher,TVVideo_Apps,La...   
  2  ON,Music_Audio_Radio,PLAY,Launcher,Messaging,L...   
  3  ON,Launcher,ON,Messaging,Launcher,Social_Netwo...   
  4  ON,Music_Audio_Radio,PAUSE,PLAY,PAUSE,PLAY,Lau...   
  
                                               gru4rec  \
  0  ON,Music_Audio_Radio,PLAY,Launcher,Messaging,L...   
  1  ON,Launcher,ON,Messaging,Launcher,Social_Netwo...   
  2  ON,Financial,ON,Financial,Launcher,Messaging

In [78]:
# predictions.apply(lambda x: x['recs-sr_B'][0] == x['ground_truth'], axis=1).sum()/len(predictions)

### performance by position

We expect the performance of the neural algorithms to improve as the position within a sentence increases. Since this is a sequence-level analysis, there are plenty of long sentences to check this on.

In [79]:
# Names of the per-model recommendation columns, i.e. those prefixed with 'recs-'.
algorithms = list(filter(lambda column: column.startswith('recs-'), predictions.columns))

In [80]:
def calc_hr_k(ground_truth, rec_list, k):
    """Tell whether `ground_truth` is among the first `k` entries of `rec_list`.

    This is the hit indicator for a single algorithm and a single item.
    """
    top_k = rec_list[:k]
    return ground_truth in top_k

In [81]:
# Number of prediction rows available at each position.
pos_counts = predictions.position.value_counts()

In [174]:
k = 10  # length of the recommendation list considered by the hit-rate cells below

In [175]:
# Hit rate@k by position: one row per position holding the number of test items at that
# position and, per algorithm, the share of items whose ground truth is in the top-k list.
records = []
for pos in pos_counts.index:
    pred_pos = predictions[predictions['position'] == pos]
    record = {'position': pos, 'count': pos_counts[pos]}
    for algo in algorithms:
        # strip only the leading 'recs-' so that hyphens inside a model name are preserved
        algo_name = algo.split('-', 1)[1]
        hits = [calc_hr_k(truth, recs, k) for truth, recs in zip(pred_pos['ground_truth'], pred_pos[algo])]
        record[algo_name] = sum(hits) / len(hits) if hits else float('nan')
    records.append(record)

# DataFrame.append was removed in pandas 2.0; build the frame once from the collected rows
# instead of re-copying it on every iteration.
perf_by_pos = pd.DataFrame(records)

In [176]:
perf_by_pos.head(20)

Unnamed: 0,position,count,ar,gru4rec_Reminder,gru4rec,hgru4rec,shan,sknn,sr_BR,sr,stan_ER,stan,vsknn_EBR,vsknn,vstan_EBR,vstan
0,0,277,0.490975,0.418773,0.314079,0.223827,0.440433,0.519856,0.534296,0.509025,0.501805,0.480144,0.364621,0.129964,0.292419,0.487365
1,1,262,0.461832,0.370229,0.251908,0.183206,0.408397,0.450382,0.469466,0.469466,0.477099,0.473282,0.293893,0.137405,0.282443,0.480916
2,2,249,0.502008,0.437751,0.317269,0.188755,0.433735,0.53012,0.51004,0.53012,0.522088,0.538153,0.281124,0.176707,0.289157,0.554217
3,3,241,0.514523,0.448133,0.33195,0.19917,0.40249,0.502075,0.526971,0.510373,0.510373,0.506224,0.253112,0.149378,0.26971,0.510373
4,4,232,0.538793,0.413793,0.323276,0.206897,0.413793,0.491379,0.50431,0.551724,0.512931,0.50431,0.262931,0.150862,0.232759,0.525862
5,5,226,0.526549,0.438053,0.340708,0.185841,0.438053,0.539823,0.530973,0.522124,0.530973,0.530973,0.243363,0.154867,0.221239,0.535398
6,6,220,0.5,0.490909,0.322727,0.2,0.413636,0.513636,0.513636,0.545455,0.486364,0.504545,0.254545,0.154545,0.190909,0.518182
7,7,214,0.495327,0.476636,0.383178,0.280374,0.350467,0.481308,0.518692,0.556075,0.462617,0.5,0.228972,0.149533,0.224299,0.495327
8,8,209,0.502392,0.397129,0.315789,0.167464,0.411483,0.497608,0.497608,0.526316,0.507177,0.483254,0.263158,0.148325,0.229665,0.497608
9,9,203,0.541872,0.517241,0.384236,0.270936,0.389163,0.522167,0.53202,0.55665,0.536946,0.541872,0.280788,0.182266,0.275862,0.541872


In [181]:
# Hit rate@k for positions <= cutoff vs. positions > cutoff (one row per side).
cutoff = 5
position_masks = {
    '<=': predictions['position'] <= cutoff,
    '>': predictions['position'] > cutoff,
}
records = []
for pos, mask in position_masks.items():
    pred_pos = predictions[mask]
    record = {'position': pos}
    for algo in algorithms:
        # strip only the leading 'recs-' so that hyphens inside a model name are preserved
        algo_name = algo.split('-', 1)[1]
        hits = [calc_hr_k(truth, recs, k) for truth, recs in zip(pred_pos['ground_truth'], pred_pos[algo])]
        # a side of the cutoff without any rows yields NaN instead of a division by zero
        record[algo_name] = sum(hits) / len(hits) if hits else float('nan')
    records.append(record)

# DataFrame.append was removed in pandas 2.0; build the frame once from the collected rows.
perf_by_pos = pd.DataFrame(records)

In [182]:
perf_by_pos

Unnamed: 0,position,ar,gru4rec_Reminder,gru4rec,hgru4rec,shan,sknn,sr_BR,sr,stan_ER,stan,vsknn_EBR,vsknn,vstan_EBR,vstan
0,<=,0.504371,0.420309,0.312038,0.198386,0.422999,0.505044,0.512441,0.514459,0.508406,0.504371,0.28581,0.149294,0.266308,0.514459
1,>,0.569509,0.537492,0.403616,0.258792,0.448905,0.541971,0.567518,0.599536,0.559721,0.566025,0.260783,0.151792,0.229098,0.576808
