### setup

In [1]:
import datetime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import pandas as pd
import pyreadr
import pickle
import re
import os

In [2]:
os.chdir('C:\\Users\\Simon\\Desktop\\MA\\session-rec')

In [3]:
# datatypes = ['app-level', 'seq-level']
datatype = 'sequence-level'
windows = [1,2,3,4,5]
model_index = [0, 3, 8, 9, 10, 4, 1, 2, 5, 6, 7]
model_name = 'Algorithm'

In [4]:
USER_KEY = 'userID'
TIME_KEY = 'timestamp'
if datatype == 'app-level':    
    ITEM_KEY = 'appID'
    SESSION_KEY = 'sessionID'
else:
    ITEM_KEY = 'usID'
    SESSION_KEY = 'sentenceID'

### helper functions

In [5]:
# for multiple windows (incl. min20)
# get average performance across all windows for a given algorithm
def get_av_perf(files, key):
    res = pd.DataFrame()
    for file in files:
        window = file.strip('.csv').split('_')[-1]
        df = pd.read_csv(folder_res + file, sep = ';')
        df.drop(['Metrics', 'Saver@50: '], axis=1, inplace=True)
        df.drop(df.filter(regex='Unnamed'), axis=1, inplace=True) # drop 'Unnamed: 24' column containing only NaNs
        df.rename(columns = lambda x : str(x)[:-2], inplace=True) # remove colon and whitespace from all column names
        df.insert(0, model_name, key)
        df.insert(1, 'window', window)
        res = res.append(df)
    res = res.groupby(model_name).mean().reset_index(level=0)
    return(res)

In [6]:
# for single window
# get performance for a given algorithm
def get_perf(file, key):
    df = pd.read_csv(folder_res + file, sep = ';')
    df.drop(['Metrics', 'Saver@50: '], axis=1, inplace=True)
    df.drop(df.filter(regex='Unnamed'), axis=1, inplace=True) # drop 'Unnamed: 24' column containing only NaNs
    df.rename(columns = lambda x : str(x)[:-2], inplace=True) # remove colon and whitespace from all column names
    df.insert(0, model_name, key)
    return(df)

In [7]:
# extract ground truth from test data (test_data) for a single item (position) in a single session (sessionId)
def extract_ground_truth(ID, position, test_data):
    relevant_df = test_data[test_data[SESSION_KEY]==ID]
    index = relevant_df.index[position+1]
    ground_truth = relevant_df[ITEM_KEY][index]
    return ground_truth

In [8]:
# generate a df containing the ground truth as well as predictions for all available algorithms
def generate_predictions(predictions_files, test_data, mapping_id2name, multiple=True):
    predictions = pd.DataFrame()
    for file in predictions_files:
        if multiple:
            model = "_".join(file.split('_')[2:-2])
        else:
            model = "_".join(file.split('_')[2:-1])
        df = pd.read_csv(folder_res + file, sep = ';')
        if 'sessionID' not in predictions.columns:
            predictions['sessionID'] = df['SessionId']
        if 'position' not in predictions.columns:
            predictions['position'] = df['Position']
        if 'ground_truth' not in predictions.columns:
            predictions['ground_truth'] = predictions.apply(lambda x: extract_ground_truth(x['sessionID'], x['position'], test_data), axis=1)
            predictions['ground_truth_name'] = predictions['ground_truth'].apply(lambda x: mapping_reverse[x])
        predictions['recs-' + model] = df['Recommendations'].apply(lambda x: [int(i) for i in x.split(',')])
        predictions['recs_names-' + model] = predictions['recs-' + model].apply(lambda x: [mapping_reverse[i] for i in x])
        predictions['scores-' + model] = df['Scores'].apply(lambda x: x.split(','))
    return predictions

In [9]:
# helper function outputting whether ground truth is in recommendation list of length k for a single algorithm and item
def calc_hr_k(ground_truth, rec_list, k):
    return ground_truth in rec_list[:k]

In [10]:
# helper function for calculating the MRR
def calc_mrr_k(ground_truth, rec_list, k):
    if ground_truth not in rec_list[:k]:
        return 0
    else:
        score = rec_list.index(ground_truth) + 1
        return 1/score

In [11]:
def capitalize_names(df):
    name_dict = {
        'ar': 'AR',
        'ct-pre': 'CT',
        'ctpre': 'CT',
        'gru4rec': 'GRU4Rec',
        'gru4rec_Reminder': 'GRU4Rec_R',
        'hgru4rec': 'HGRU4Rec',
        'shan': 'SHAN',
        'sknn': 'SKNN',
        'sr': 'SR',
        'sr_BR': 'SR_BR',
        'stan': 'STAN',
        'vsknn': 'VSKNN',
        'vsknn_EBR': 'VSKNN_EBR',
        'vstan': 'VSTAN',
        'vstan_EBR': 'VSTAN_EBR'
    }
    df[model_name] = df[model_name].apply(lambda x: name_dict[x])
    return df

### multiple windows

##### overall

In [23]:
folder_res = 'results/testing/' + str(datatype) + '/multiple/'
algos = set([f.split('_window')[0].split('test_single_')[1] for f in os.listdir(folder_res)])
results_seq_multiple = pd.DataFrame()
for key in algos:
    files = [f for f in os.listdir(folder_res) 
             if ('Saver' not in f) 
             and (f.startswith('test_single_' + str(key) + '_window'))
             and ('min20' not in f)]
    res = get_av_perf(files, key)
    results_seq_multiple = results_seq_multiple.append(res)
results_seq_multiple = capitalize_names(results_seq_multiple)
results_seq_multiple = results_seq_multiple.round(4)
results_seq_multiple = results_seq_multiple.sort_values(model_name)
results_seq_multiple['model_index'] = model_index
results_seq_multiple = results_seq_multiple.set_index('model_index').sort_index()
results_seq_multiple.index.name = None

In [24]:
results_seq_multiple

Unnamed: 0,Algorithm,HitRate@1,HitRate@5,HitRate@10,HitRate@20,MRR@5,MRR@10,MRR@20,Coverage@20,Popularity@20
0,AR,0.5295,0.6875,0.7541,0.8126,0.5891,0.598,0.6021,0.4658,0.0733
1,SR,0.5305,0.7042,0.7653,0.8187,0.5981,0.6063,0.6101,0.8772,0.0724
2,SR_BR,0.5307,0.6669,0.7271,0.7931,0.583,0.5911,0.5956,0.6703,0.0625
3,CT,0.5307,0.7161,0.7705,0.8238,0.604,0.6113,0.615,0.9467,0.073
4,SKNN,0.529,0.6917,0.7564,0.8149,0.5884,0.597,0.6011,0.1807,0.0727
5,STAN,0.5277,0.6888,0.7391,0.7685,0.5879,0.5948,0.5969,0.5901,0.0666
6,VSTAN,0.5274,0.6831,0.7284,0.7415,0.5844,0.5907,0.5917,0.324,0.0578
7,VSTAN_EBR,0.5301,0.6686,0.7225,0.7864,0.583,0.5901,0.5946,0.6563,0.062
8,GRU4Rec,0.5291,0.5298,0.5306,0.5317,0.5293,0.5294,0.5295,0.0818,0.0502
9,GRU4Rec_R,0.4982,0.6738,0.7381,0.7996,0.5666,0.5753,0.5796,0.5622,0.0661


In [25]:
with open('../MA/tables/results_seq_multiple.tex','w') as tf:
    tf.write(results_seq_multiple.to_latex(index=False))

In [26]:
with open('../MA/results/sequence-level/results_seq_multiple.pickle', 'wb') as handle:
    pickle.dump(results_seq_multiple, handle)

### single window

##### overall

In [27]:
folder_res = 'results/testing/' + str(datatype) + '/single/'
algos = set([f.split('test_single_')[1].split('_single')[0] for f in os.listdir(folder_res) if f.startswith('test_single')])
results_seq_single = pd.DataFrame()
for key in algos:
    file = [f for f in os.listdir(folder_res) 
             if ('Saver' not in f) 
             and (f.startswith('test_single_' + str(key) + '_single'))
             and ('min20' not in f)
             and ('embedding' not in f)][0] # list is of length 1 actually
    res = get_perf(file, key)
    results_seq_single = results_seq_single.append(res)
results_seq_single = capitalize_names(results_seq_single)
results_seq_single = results_seq_single.round(4)
results_seq_single = results_seq_single.sort_values(model_name)
results_seq_single['model_index'] = model_index
results_seq_single = results_seq_single.set_index('model_index').sort_index()
results_seq_single.index.name = None

In [28]:
results_seq_single

Unnamed: 0,Algorithm,HitRate@1,HitRate@5,HitRate@10,HitRate@20,MRR@5,MRR@10,MRR@20,Coverage@20,Popularity@20
0,AR,0.5189,0.6781,0.742,0.7988,0.5796,0.5882,0.5921,0.2237,0.0733
1,SR,0.5188,0.6933,0.754,0.8058,0.5878,0.5959,0.5995,0.6626,0.0724
2,SR_BR,0.5202,0.6533,0.7055,0.7725,0.5713,0.5783,0.5829,0.5253,0.0618
3,CT,0.5203,0.7083,0.763,0.8124,0.5952,0.6026,0.6061,0.8867,0.0725
4,SKNN,0.5188,0.6761,0.7388,0.7907,0.5741,0.5825,0.5861,0.099,0.0722
5,STAN,0.5143,0.6613,0.7121,0.7348,0.568,0.575,0.5767,0.4185,0.0648
6,VSTAN,0.5143,0.6558,0.7025,0.7131,0.5666,0.573,0.5739,0.1842,0.0567
7,VSTAN_EBR,0.5177,0.6465,0.695,0.7601,0.568,0.5744,0.5789,0.5428,0.0614
8,GRU4Rec,0.5183,0.519,0.5193,0.5197,0.5186,0.5187,0.5187,0.0432,0.0502
9,GRU4Rec_R,0.4918,0.6603,0.7198,0.782,0.5549,0.5628,0.5671,0.4723,0.0636


In [29]:
with open('../MA/tables/results_seq_single.tex','w') as tf:
    tf.write(results_seq_single.to_latex(index=False))

In [30]:
with open('../MA/results/sequence-level/results_seq_single.pickle', 'wb') as handle:
    pickle.dump(results_seq_single, handle)

### removing on and off (unspecific tuning)

##### multiple windows

In [31]:
folder_res = 'results/testing_onoff_unspecific_tuning/' + str(datatype) + '/multiple/'
algos = set([f.split('_window')[0].split('test_single_')[1] for f in os.listdir(folder_res)])
results_seq_multiple_droponoff = pd.DataFrame()
for key in algos:
    files = [f for f in os.listdir(folder_res) 
             if ('Saver' not in f) 
             and (f.startswith('test_single_' + str(key) + '_window'))
             and ('min20' not in f)]
    res = get_av_perf(files, key)
    results_seq_multiple_droponoff = results_seq_multiple_droponoff.append(res)
results_seq_multiple_droponoff = capitalize_names(results_seq_multiple_droponoff)
results_seq_multiple_droponoff = results_seq_multiple_droponoff.round(4)
results_seq_multiple_droponoff = results_seq_multiple_droponoff.sort_values(model_name)
results_seq_multiple_droponoff['model_index'] = model_index
results_seq_multiple_droponoff = results_seq_multiple_droponoff.set_index('model_index').sort_index()
results_seq_multiple_droponoff.index.name = None

In [32]:
results_seq_multiple_droponoff

Unnamed: 0,Algorithm,HitRate@1,HitRate@5,HitRate@10,HitRate@20,MRR@5,MRR@10,MRR@20,Coverage@20,Popularity@20
0,AR,0.1993,0.4685,0.5804,0.6734,0.2995,0.3145,0.3209,0.4906,0.2551
1,SR,0.2496,0.5017,0.6013,0.6874,0.3442,0.3578,0.3638,0.8554,0.2437
2,SR_BR,0.2367,0.4914,0.5785,0.6664,0.3342,0.346,0.352,0.706,0.1845
3,CT,0.2539,0.5006,0.5961,0.6838,0.3462,0.3591,0.3652,0.9128,0.2527
4,SKNN,0.1638,0.4348,0.5543,0.6541,0.2628,0.2789,0.2859,0.2571,0.2552
5,STAN,0.2148,0.4749,0.5833,0.6565,0.3098,0.3245,0.3297,0.834,0.2175
6,VSTAN,0.1985,0.4391,0.5061,0.5311,0.2876,0.297,0.2988,0.5103,0.1165
7,VSTAN_EBR,0.1941,0.442,0.5331,0.6295,0.2858,0.2982,0.3048,0.6804,0.1699
8,GRU4Rec,0.137,0.19,0.1956,0.2024,0.1603,0.161,0.1615,0.3598,0.0877
9,GRU4Rec_R,0.1843,0.4258,0.5273,0.6366,0.2738,0.2874,0.295,0.6627,0.1649


In [33]:
with open('../MA/tables/results_seq_multiple_droponoff.tex','w') as tf:
    tf.write(results_seq_multiple_droponoff.to_latex(index=False))

In [34]:
with open('../MA/results/sequence-level/results_seq_multiple_droponoff.pickle', 'wb') as handle:
    pickle.dump(results_seq_multiple_droponoff, handle)

In [35]:
# # performance drop vis-à-vis results_seq_multiple
# 1 - (results_seq_multiple_droponoff['HitRate@1'] / results_seq_multiple['HitRate@1']).mean()

### performance by position

##### create mapping dicts

In [13]:
folder_res = 'results/testing_onoff_unspecific_tuning/' + str(datatype) + '/multiple/'
folder_data = 'data/testing_onoff/' + str(datatype) + '/multiple/'
data = pd.read_csv('../data/sequence-level/data_seq.csv') # create app and user mappings
mapping = dict([(y,x+1) for x,y in enumerate(sorted(set(data['category_list'])))])
mapping_reverse = dict((v,k) for k,v in mapping.items())

##### individual positions

In [14]:
k = 1 # HR@k

In [15]:
results_seq_multiple_pos = pd.DataFrame()

for window in windows:
    test_data = pd.read_hdf(str(folder_data) + 'window_' + str(window) + '.hdf', 'test') 
    predictions_files = [f for f in os.listdir(folder_res) if ('min20' not in f) 
                         and f.endswith('window_' + str(window) + '-Saver@50.csv')]
    predictions = generate_predictions(predictions_files, test_data, mapping_reverse)
    algorithms = [i for i in predictions.columns if i.startswith('recs-')]

    perf_by_pos = pd.DataFrame()
    positions = range(1,11)
    for pos in positions:
        pred_pos = predictions[predictions['position']==pos-1]
        df = pd.DataFrame()
        df['position'] = ['position = ' + str(pos)]
        df['window'] = [window]
        for algo in algorithms:
            algo_name = ''.join(algo.split('-')[1:])
            value = pred_pos.apply(lambda x: calc_hr_k(x['ground_truth'], x[algo], k), axis=1).sum()/len(pred_pos)
            df[algo_name] = [value]
        perf_by_pos = perf_by_pos.append(df).reset_index(drop=True)
    results_seq_multiple_pos = results_seq_multiple_pos.append(perf_by_pos)

results_seq_multiple_pos = results_seq_multiple_pos.groupby('position').mean() # average across positions
results_seq_multiple_pos.drop(['window'], axis=1, inplace=True)
results_seq_multiple_pos = results_seq_multiple_pos.transpose() # transpose to have algorithms as rows, positions as columns
columns_reordered = results_seq_multiple_pos.columns.tolist()
columns_reordered.sort(key=lambda x: int(re.search(r'\d+$',x).group()))
results_seq_multiple_pos = results_seq_multiple_pos[columns_reordered]
results_seq_multiple_pos.reset_index(inplace=True) # convert index to column named "index"
results_seq_multiple_pos.rename(columns={'index': model_name}, inplace=True) # rename column "index" to "model"
results_seq_multiple_pos.rename_axis(None, axis=1, inplace=True) # unname new index
results_seq_multiple_pos = capitalize_names(results_seq_multiple_pos) # adjust model names
results_seq_multiple_pos = results_seq_multiple_pos.round(4)
results_seq_multiple_pos = results_seq_multiple_pos.sort_values(model_name)
results_seq_multiple_pos['model_index'] = model_index
results_seq_multiple_pos = results_seq_multiple_pos.set_index('model_index').sort_index()
results_seq_multiple_pos.index.name = None

In [16]:
results_seq_multiple_pos

Unnamed: 0,Algorithm,position = 1,position = 2,position = 3,position = 4,position = 5,position = 6,position = 7,position = 8,position = 9,position = 10
0,AR,0.135,0.1612,0.1662,0.1629,0.1763,0.1881,0.1991,0.1944,0.1756,0.16
1,SR,0.2126,0.2261,0.2306,0.2253,0.2145,0.2309,0.2359,0.2492,0.2171,0.2321
2,SR_BR,0.2017,0.2044,0.2061,0.2146,0.2153,0.2274,0.2256,0.219,0.2145,0.2062
3,CT,0.217,0.239,0.2423,0.2191,0.2266,0.2241,0.2329,0.2509,0.2337,0.2324
4,SKNN,0.1595,0.2004,0.1807,0.1496,0.1722,0.1697,0.1679,0.1491,0.151,0.1403
5,STAN,0.1583,0.1911,0.1863,0.1783,0.1915,0.207,0.2156,0.2144,0.212,0.1915
6,VSTAN,0.1583,0.1955,0.1869,0.1684,0.1835,0.1923,0.2031,0.2031,0.1906,0.1694
7,VSTAN_EBR,0.1413,0.1398,0.1649,0.162,0.193,0.1905,0.201,0.1649,0.1901,0.1642
8,GRU4Rec,0.0963,0.113,0.1168,0.1077,0.1072,0.1117,0.1227,0.1159,0.1247,0.1128
9,GRU4Rec_R,0.143,0.1372,0.1277,0.1273,0.1539,0.1642,0.1747,0.1621,0.1627,0.1482


In [60]:
with open('../MA/tables/results_seq_multiple_pos_HR@' + str(k) + '.tex','w') as tf:
    tf.write(results_seq_multiple_pos.to_latex(index=False))

In [61]:
with open('../MA/results/sequence-level/results_seq_multiple_pos_HR@' + str(k) + '.pickle', 'wb') as handle:
    pickle.dump(results_seq_multiple_pos, handle)

##### cutoffs

In [17]:
cutoffs = [2, 5, 10]
k = 1 # HR@k

In [18]:
results_seq_multiple_cutoff = pd.DataFrame()

for cutoff in cutoffs:
    for window in windows:
        test_data = pd.read_hdf(str(folder_data) + 'window_' + str(window) + '.hdf', 'test') 
        predictions_files = [f for f in os.listdir(folder_res) if ('min20' not in f) 
                             and f.endswith('window_' + str(window) + '-Saver@50.csv')]
        predictions = generate_predictions(predictions_files, test_data, mapping_reverse)
        algorithms = [i for i in predictions.columns if i.startswith('recs-')]

        # for  single cutoff and single window, create 'performance-by-position' df containing two rows and |algorithms| columns
        perf_by_pos = pd.DataFrame()
        positions = ['position <= ' + str(cutoff), 'position > ' + str(cutoff)]
        for pos in positions:
            if pos==('position <= ' + str(cutoff)):
                pred_pos = predictions[predictions['position']<=cutoff-1] # -1 b/c the first position has index 0
            else:
                pred_pos = predictions[predictions['position']>cutoff-1]
            df = pd.DataFrame()
            df['position'] = [pos]
            df['window'] = [window]
            for algo in algorithms:
                algo_name = ''.join(algo.split('-')[1:])
                value = pred_pos.apply(lambda x: calc_hr_k(x['ground_truth'], x[algo], k), axis=1).sum()/len(pred_pos)
                df[algo_name] = [value]
            perf_by_pos = perf_by_pos.append(df).reset_index(drop=True)
        results_seq_multiple_cutoff = results_seq_multiple_cutoff.append(perf_by_pos)

results_seq_multiple_cutoff = results_seq_multiple_cutoff.groupby('position').mean() # average across positions (e.g., "<= 2", "> 10")
results_seq_multiple_cutoff.drop(['window'], axis=1, inplace=True)
results_seq_multiple_cutoff = results_seq_multiple_cutoff.transpose() # transpose to have algorithms as rows, positions as columns
columns_reordered = results_seq_multiple_cutoff.columns.tolist()
columns_reordered.sort(key=lambda x: int(re.search(r'\d+$',x).group()))
results_seq_multiple_cutoff = results_seq_multiple_cutoff[columns_reordered]
results_seq_multiple_cutoff.reset_index(inplace=True) # convert index to column named "index"
results_seq_multiple_cutoff.rename(columns={'index': model_name}, inplace=True) # rename column "index" to "model"
results_seq_multiple_cutoff.rename_axis(None, axis=1, inplace=True) # unname new index
results_seq_multiple_cutoff = capitalize_names(results_seq_multiple_cutoff) # adjust model names
results_seq_multiple_cutoff = results_seq_multiple_cutoff.round(4)
results_seq_multiple_cutoff = results_seq_multiple_cutoff.sort_values(model_name)
results_seq_multiple_cutoff['model_index'] = model_index
results_seq_multiple_cutoff = results_seq_multiple_cutoff.set_index('model_index').sort_index()
results_seq_multiple_cutoff.index.name = None

In [19]:
results_seq_multiple_cutoff

Unnamed: 0,Algorithm,position <= 2,position > 2,position <= 5,position > 5,position <= 10,position > 10
0,AR,0.1478,0.2024,0.1598,0.2057,0.1711,0.2096
1,SR,0.2193,0.2515,0.2218,0.2543,0.227,0.2581
2,SR_BR,0.203,0.2387,0.2083,0.2413,0.2132,0.2454
3,CT,0.2278,0.2556,0.2288,0.2583,0.2314,0.2622
4,SKNN,0.1798,0.1629,0.1725,0.1625,0.1645,0.1635
5,STAN,0.1746,0.2174,0.1808,0.2206,0.1936,0.2226
6,VSTAN,0.1768,0.1999,0.1783,0.2019,0.1846,0.2037
7,VSTAN_EBR,0.1406,0.1972,0.1595,0.1991,0.1704,0.2019
8,GRU4Rec,0.1045,0.1389,0.1081,0.1416,0.1127,0.1462
9,GRU4Rec_R,0.1402,0.187,0.1377,0.1916,0.1494,0.1963


In [20]:
with open('../MA/tables/results_seq_multiple_cutoff_HR@' + str(k) + '.tex','w') as tf:
    tf.write(results_seq_multiple_cutoff.to_latex(index=False))

In [21]:
with open('../MA/results/sequence-level/results_seq_multiple_cutoff_HR@' + str(k) + '.pickle', 'wb') as handle:
    pickle.dump(results_seq_multiple_cutoff, handle)

### impact of ONOFF-removal

##### original data: which percentage of all top 1 predictions are ONOFF tokens (here: sum across all 5 windows)

In [47]:
folder_res = 'results/testing/' + str(datatype) + '/multiple/'
folder_data = folder_res.replace('results', 'data')
data = pd.read_csv('../data/sequence-level/data_seq.csv') # create app and user mappings
mapping = dict([(y,x+1) for x,y in enumerate(sorted(set(data['category_list'])))])
mapping_reverse = dict((v,k) for k,v in mapping.items())

In [48]:
k = 1 # HR@k
ONOFF = mapping['ON,OFF']

In [49]:
results_seq_multiple_onoff_perc = pd.DataFrame()

for window in windows:
    test_data = pd.read_hdf(str(folder_data) + 'window_' + str(window) + '.hdf', 'test') 
    predictions_files = [f for f in os.listdir(folder_res) if ('min20' not in f) 
                         and f.endswith('window_' + str(window) + '-Saver@50.csv')]
    predictions = generate_predictions(predictions_files, test_data, mapping_reverse)
    algorithms = [i for i in predictions.columns if i.startswith('recs-')]

    df = pd.DataFrame()
    df['window'] = [window]
    df['num_preds'] = [len(predictions)]
    for algo in algorithms:
        algo_name = ''.join(algo.split('-')[1:])
        value = predictions.apply(lambda x: calc_hr_k(ONOFF, x[algo], k), axis=1).sum()  
        df[algo_name] = [value]
    
    results_seq_multiple_onoff_perc = results_seq_multiple_onoff_perc.append(df)

results_seq_multiple_onoff_perc = pd.DataFrame(results_seq_multiple_onoff_perc.sum())
results_seq_multiple_onoff_perc = results_seq_multiple_onoff_perc.transpose()
num_preds = results_seq_multiple_onoff_perc['num_preds'][0]
results_seq_multiple_onoff_perc.drop(['window', 'num_preds'], axis=1, inplace=True)
results_seq_multiple_onoff_perc = results_seq_multiple_onoff_perc.div(num_preds)

In [50]:
results_seq_multiple_onoff_perc

Unnamed: 0,ar,ctpre,gru4rec_Reminder,gru4rec,hgru4rec,sknn,sr_BR,sr,stan,vstan_EBR,vstan
0,0.996973,0.995814,0.892723,1.0,0.958193,0.994621,0.994171,0.992995,0.972031,0.980956,0.972221


In [51]:
with open('../MA/tables/results_seq_multiple_onoff_perc' + str(k) + '.tex','w') as tf:
    tf.write(results_seq_multiple_onoff_perc.to_latex(index=False))

In [52]:
with open('../MA/results/sequence-level/results_seq_multiple_onoff_perc' + str(k) + '.pickle', 'wb') as handle:
    pickle.dump(results_seq_multiple_onoff_perc, handle)

##### original data: performance when excluding ONOFF from test sequences

In [53]:
results_seq_multiple_non_onoff_perf = pd.DataFrame()

for window in windows:
    test_data = pd.read_hdf(str(folder_data) + 'window_' + str(window) + '.hdf', 'test') 
    predictions_files = [f for f in os.listdir(folder_res) if ('min20' not in f) 
                         and f.endswith('window_' + str(window) + '-Saver@50.csv')]
    predictions = generate_predictions(predictions_files, test_data, mapping_reverse)
    predictions = predictions[predictions['ground_truth'] != ONOFF]
    algorithms = [i for i in predictions.columns if i.startswith('recs-')]

    df = pd.DataFrame()
    df['window'] = [window]
    for algo in algorithms:
        algo_name = ''.join(algo.split('-')[1:])
        value = predictions.apply(lambda x: calc_hr_k(x['ground_truth'], x[algo], k), axis=1).sum()/len(predictions)
        df[algo_name] = [value]
    
    results_seq_multiple_non_onoff_perf = results_seq_multiple_non_onoff_perf.append(df)

results_seq_multiple_non_onoff_perf = pd.DataFrame(results_seq_multiple_non_onoff_perf.mean())
results_seq_multiple_non_onoff_perf = results_seq_multiple_non_onoff_perf.transpose()
results_seq_multiple_non_onoff_perf.drop(['window'], axis=1, inplace=True)
results_seq_multiple_non_onoff_perf = results_seq_multiple_non_onoff_perf.div(num_preds)

In [54]:
results_seq_multiple_non_onoff_perf

Unnamed: 0,ar,ctpre,gru4rec_Reminder,gru4rec,hgru4rec,sknn,sr_BR,sr,stan,vstan_EBR,vstan
0,2.61912e-08,8.339163e-08,4.126154e-07,0.0,1.514179e-08,5.23321e-08,9.105607e-08,1.100328e-07,2.378267e-07,1.540669e-07,2.280082e-07


In [55]:
with open('../MA/tables/results_seq_multiple_non_onoff_perf' + str(k) + '.tex','w') as tf:
    tf.write(results_seq_multiple_non_onoff_perf.to_latex(index=False))

In [56]:
with open('../MA/results/sequence-level/results_seq_multiple_non_onoff_perf' + str(k) + '.pickle', 'wb') as handle:
    pickle.dump(results_seq_multiple_non_onoff_perf, handle)