# Libraries

In [15]:
import os, json
import pandas as pd
from lib import modelling_mf
import numpy as np

# Set up

In [2]:
algorithm = 'DMF'
batch_size = 1024

user_col = "user" # the name of the column that includes the users
item_col = "item" # the name of the column that includes the items
predict_col="rating" # the name of the column that includes the interaction

# Analysis

## A. Fairbook

In [3]:
# These change for every dataser
data = 'fairbook'
ratings = pd.read_csv("data/"+data+"/"+data+"_events.csv")
all_items=set(ratings.item.unique())

In [16]:
final_recs_files = {}
for fold in range(1,6):


    # Set locations of recommendations and best iteration files
    recs_location = 'results/'+data+str(fold)+'/recs/'
    best_iter_location = 'results/'+data+str(fold)+'/performance/'
    
    # Find all json files in the folder (they contain the best iterations).
    bestmodelparams_files = [best_iter_location + pos_json for pos_json in os.listdir(best_iter_location) if pos_json.endswith('.json')]

    # For every json find the mlp value and the equivalent best iteration and learning rate (which was set after hyperparameter tuning).
    best_iters = {}
    for file in bestmodelparams_files:
        with open(file) as f:
            d = json.load(f)
        mlp = d[1]['configuration']['item_mlp']
        best_iteration = d[1]['configuration']['best_iteration']
        lr = d[1]['configuration']['lr'] # this was found in a previous step, not during training
        best_iters[mlp] = (lr, best_iteration)
        
    # For every mlp value, find the file that contains the recommendations.
    mlp_values = best_iters.keys()
    
    for mlp in mlp_values:
        lr = str(best_iters[mlp][0]).replace('.','$')
        bi = str(best_iters[mlp][1])
        mlp = mlp.replace(',','-').replace('(','').replace(')','')
        recs_file = recs_location + algorithm+'_seed=42_e=25_bs='+str(batch_size)+'_lr='+lr+'_umlp='+mlp+'_imlp='+mlp+'_negratio=5_reg=0$001_sim=cosine_it='+bi+'.tsv'
        final_recs_files[fold,mlp] = recs_file
    mlp_values = np.unique([x[1] for x in final_recs_files.keys()]) # change the format   

In [22]:
for mlp in mlp_values:
    print(mlp)
    metrics = []
    for fold in range(1,6):
        # Train and test files of that fold
        train_file = 'data/'+data+'_fold_'+str(fold)+'_train.csv'
        test_file = 'data/'+data+'_fold_'+str(fold)+'_test.csv'
    
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        test_users = test_df.user.unique()

        # Recommendations
        recs_file = final_recs_files[fold,mlp]
        recs_df = pd.read_csv(recs_file, sep='\t', header=None)
        recs_df.columns = ['user','item','rating']

        # Test recommendations
        test_recs = recs_df[recs_df.user.isin(test_users)].reset_index(drop=True)
        test_recs_grouped = test_recs.groupby([user_col])[item_col].apply(list)

        # Calculate all metrics
        pop_bias= modelling_mf.calculate_pop_bias_per_item(all_items, item_col, user_col, predict_col, train_df, recs=test_recs)
        GAP_vs_GAP = modelling_mf.calculate_ave_pop_per_user(test_users, item_col, user_col, pop_bias, train_df, test_recs_grouped)
        pop_corr = modelling_mf.calculate_pop_correlation(pop_bias)
        precision, recall, ndcg = modelling_mf.calculate_topn_metrics(test_recs,test_df)
        AggDiv = modelling_mf.evaluate_item_coverage(pop_bias["recommendation"].values)
        ARP, ave_PL, ACLT = modelling_mf.calculate_all_pb_metrics(pop_bias, test_users, item_col, user_col, train_df, test_recs_grouped, test_recs)
        metrics_dict = {"pop_corr":pop_corr, "RMSE":0, 'NDCG':ndcg,"ARP":ARP, "ave_PL": ave_PL, "ACLT": ACLT, "AggDiv": AggDiv}
        metrics.append(metrics_dict) # per combination of mlp-bs, and per fold
        pop_biases = [pop_bias]
        break
    break

64-32
nr of longtail 7104
5.584905660377358


In [24]:
metrics_dict

{'pop_corr': (0.002419312870633217, 0.8405165908437839),
 'RMSE': 0,
 'NDCG': 0.001639807446294823,
 'ARP': 0.002071079667867551,
 'ave_PL': -42.53959701238517,
 'ACLT': 5.584905660377358,
 'AggDiv': 0.009247218610027452}

In [9]:
for mlp in mlp_values:
    metrics = []
    print(mlp)
    for fold in range(1,6):
        
        recs_file = final_recs_files[]
        train_file = data_location+data+'_fold_'+str(fold)+'_train.csv'
        test_file = data_location+data+'_fold_'+str(fold)+'_test.csv'
        # print(recs_file)
        recs_df = pd.read_csv(recs_file, sep='\t', header=None)
        recs_df.columns = ['user','item','rating']
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        test_users = test_df.user.unique()
        test_recs = recs_df[recs_df.user.isin(test_users)].reset_index(drop=True)
        test_recs_grouped = test_recs.groupby([user_col])[item_col].apply(list)
        
        pop_bias= modelling_mf.calculate_pop_bias_per_item(all_items, item_col, user_col, predict_col, train_df, recs=test_recs)
        GAP_vs_GAP = modelling_mf.calculate_ave_pop_per_user(test_users, item_col, user_col, pop_bias, train_df, test_recs_grouped)
        pop_corr = modelling_mf.calculate_pop_correlation(pop_bias)
        precision, recall, ndcg = modelling_mf.calculate_topn_metrics(test_recs,test_df)
        AggDiv = modelling_mf.evaluate_item_coverage(pop_bias["recommendation"].values)
        ARP, ave_PL, ACLT = modelling_mf.calculate_all_pb_metrics(pop_bias, test_users, item_col, user_col, train_df, test_recs_grouped, test_recs)
        metrics_dict = {"pop_corr":pop_corr, "RMSE":0, 'NDCG':ndcg,"ARP":ARP, "ave_PL": ave_PL, "ACLT": ACLT, "AggDiv": AggDiv}
        metrics.append(metrics_dict) # per combination of mlp-bs, and per fold
        pop_biases = [pop_bias]
        
        modelling_mf.plot_results(pop_biases.copy(), 
                 GAP_vs_GAP.copy(), algorithm,
                 0, 
                 precision, 
                 recall,
                 ndcg,
                 0,
                 0,
                 cv=False, 
                 n=10, 
                 args='fold'+str(fold), data_strategy=data, save_plot=False)
        
    full_metrics_dict[mlp] = metrics
    print(full_metrics_dict)

64-64


NameError: name 'data_location' is not defined