# Libraries

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt
import random as rd
import pandas as pd
import numpy as np
import pickle as pkl
from tqdm import tqdm
import os
import time
from collections import defaultdict
from scipy import stats
from itertools import permutations, product
from numpy.linalg import norm
import seaborn as sns
sns.set(style="darkgrid")
pd.set_option("display.precision", 4)

# Summary

#### Evaluation strategies
We have identified three evaluation strategies:
1. For every user in the **train set** rank **all items**. | Book paper, *eva_one*
2. For every user in the **test set** rank **only the items this user has rated in the test set**. | Music paper, *eva_two*
3. For every user in the **test set** rank **only the items this user has NOT rated in the train set**. | Movie(?) paper, *eva_three*

#### Algorithms

In [None]:
# Recommender algorithm names, kept in the same order as the model list.
algo_names = [
    "UserKNN",
    "ItemKNN",
    "UserKNN with means",
    "BPR",
    "MF",
    "PMF",
    "NMF",
    "WMF",
    "HPF",
    "NeuMF",
    "VAECF",
]

# Import results

### Additions

In [None]:
# The three aspects that identify a result file: dataset key, evaluation
# strategy and popularity notion.
data = ["movies", "music", "books"]
evaluation = ["eva_" + suffix for suffix in ("one", "two", "three")]
pop_notion = ["pop_" + suffix for suffix in ("one", "two", "three")]

In [None]:
# All three aspect lists; their Cartesian product is used for the GAP result files.
aspects = [data, evaluation, pop_notion]

In [None]:
# Dataset and evaluation only; the item-distribution CSV names do not include a popularity notion.
some_aspects = [data, evaluation]

In [None]:
# Directory prefix (with trailing slash) that result file names are appended to.
results_location = "data/results/"

In [None]:
# pkl.load(open(results_location+"df_item_dist_cornac_"+data[2]+"_"+evaluation[0]+".csv","rb"))

In [None]:
# Item-distribution table for data[2] under evaluation[0], kept for the ad-hoc checks below.
c = pd.read_csv(f"{results_location}df_item_dist_cornac_{data[2]}_{evaluation[0]}.csv", index_col=0)

In [None]:
# Item-distribution table for data[2] under evaluation[1], kept for the ad-hoc checks below.
c2 = pd.read_csv(f"{results_location}df_item_dist_cornac_{data[2]}_{evaluation[1]}.csv", index_col=0)

In [None]:
# Pearson correlation between the "count" column and the HPF column of c2.
stats.pearsonr(c2["count"], c2["HPF"])

In [None]:
# Pearson correlation between the "count" column and the NMF column of c2.
stats.pearsonr(c2["count"], c2["NMF"])

In [None]:
# Pearson correlation between the "count" column and the NMF column of c.
stats.pearsonr(c["count"], c["NMF"])

### Define aspect combinations

In [None]:
# Every (dataset, evaluation, popularity notion) triple, minus the "pop_three"
# triples for movies and books: "pop_three" is only kept for the remaining
# dataset (music).
all_combinations = [
    triple
    for triple in product(*aspects)
    if not ("pop_three" in triple and ("movies" in triple or "books" in triple))
]
# Every (dataset, evaluation) pair.
some_combinations = list(product(*some_aspects))

In [None]:
# Report, per (dataset, evaluation) pair, whether its item-distribution CSV
# exists, and how many exist in total.
sum_exists = 0
for combi in some_combinations:
    csv_path = f"{results_location}df_item_dist_cornac_{combi[0]}_{combi[1]}.csv"
    is_present = os.path.exists(csv_path)
    print(combi, is_present)
    sum_exists += is_present
print(sum_exists, "out of", len(some_combinations))

In [None]:
# Report, per aspect combination, whether its "low" GAP pickle exists, and how
# many exist in total.
sum_exists = 0
for combi in all_combinations:
    pickle_path = f"{results_location}low_gap_vals_{combi[0]}_{combi[1]}_{combi[2]}.pickle"
    is_present = os.path.exists(pickle_path)
    sum_exists += is_present
    print(combi, "|", is_present)
print(sum_exists, "out of", len(all_combinations))

# First RQ

## Calculate correlations

### Books

In [None]:
# Correlation table for books: one row per algorithm, one column per evaluation
# strategy. Allocated as float zeros directly, instead of filling an empty
# object-dtype frame with integer 0 and relying on pandas to downcast it and
# later upcast again when float correlations are written in.
books_corr = pd.DataFrame(0.0, index=algo_names, columns=evaluation)

In [None]:
# Fill books_corr: for every evaluation strategy, store the Pearson correlation
# between the "count" column and each algorithm's column of the books table.
for combi in some_combinations:
    if "books" not in combi:
        continue
    eva_str = combi[1]
    item_dist = pd.read_csv(f"{results_location}df_item_dist_cornac_{combi[0]}_{eva_str}.csv", index_col=0)
    popularity = item_dist["count"]
    for algorithm in algo_names:
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        books_corr.loc[algorithm, eva_str] = stats.pearsonr(popularity, item_dist[algorithm])[0]

In [None]:
# Display the books correlation table (rows: algorithms, columns: evaluation strategies).
books_corr

In [None]:
# Export the books correlation table as a LaTeX tabular.
with open("latex_tables/book_correlations.tex", "w") as tex_file:
    latex_source = books_corr.to_latex()
    tex_file.write(latex_source)

### Movies

In [None]:
# Correlation table for movies: one row per algorithm, one column per
# evaluation strategy. Allocated as float zeros directly, instead of filling an
# empty object-dtype frame with integer 0 and relying on pandas to downcast it
# and later upcast again when float correlations are written in.
movies_corr = pd.DataFrame(0.0, index=algo_names, columns=evaluation)

In [None]:
# Fill movies_corr: for every evaluation strategy, store the Pearson
# correlation between the "count" column and each algorithm's column of the
# movies table.
for combi in some_combinations:
    if "movies" not in combi:
        continue
    eva_str = combi[1]
    item_dist = pd.read_csv(f"{results_location}df_item_dist_cornac_{combi[0]}_{eva_str}.csv", index_col=0)
    popularity = item_dist["count"]
    for algorithm in algo_names:
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        movies_corr.loc[algorithm, eva_str] = stats.pearsonr(popularity, item_dist[algorithm])[0]

In [None]:
# Display the movies correlation table (rows: algorithms, columns: evaluation strategies).
movies_corr

In [None]:
# Export the movies correlation table as a LaTeX tabular.
with open("latex_tables/movie_correlations.tex", "w") as tex_file:
    latex_source = movies_corr.to_latex()
    tex_file.write(latex_source)

### Music

In [None]:
# Correlation table for music: one row per algorithm, one column per evaluation
# strategy. Allocated as float zeros directly, instead of filling an empty
# object-dtype frame with integer 0 and relying on pandas to downcast it and
# later upcast again when float correlations are written in.
music_corr = pd.DataFrame(0.0, index=algo_names, columns=evaluation)

In [None]:
# Fill music_corr: for every evaluation strategy, store the Pearson correlation
# between the "count" column and each algorithm's column of the music table.
for combi in some_combinations:
    if "music" not in combi:
        continue
    eva_str = combi[1]
    item_dist = pd.read_csv(f"{results_location}df_item_dist_cornac_{combi[0]}_{eva_str}.csv", index_col=0)
    popularity = item_dist["count"]
    for algorithm in algo_names:
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        music_corr.loc[algorithm, eva_str] = stats.pearsonr(popularity, item_dist[algorithm])[0]

In [None]:
# Display the music correlation table (rows: algorithms, columns: evaluation strategies).
music_corr

In [None]:
# Export the music correlation table as a LaTeX tabular.
with open("latex_tables/music_correlations.tex", "w") as tex_file:
    latex_source = music_corr.to_latex()
    tex_file.write(latex_source)

## Calculate item coverage

### Books

In [None]:
# Item-coverage table for books: one row per algorithm, one column per
# evaluation strategy. Allocated as float zeros directly, instead of filling an
# empty object-dtype frame with integer 0 and relying on pandas to downcast it
# and later upcast again when float fractions are written in.
books_cov = pd.DataFrame(0.0, index=algo_names, columns=evaluation)

In [None]:
# Fill books_cov: for every evaluation strategy, store the fraction of rows in
# the books table whose value in the algorithm's column is greater than zero.
for combi in some_combinations:
    if "books" not in combi:
        continue
    eva_str = combi[1]
    item_dist = pd.read_csv(f"{results_location}df_item_dist_cornac_{combi[0]}_{eva_str}.csv", index_col=0)
    n_rows = len(item_dist)
    for algorithm in algo_names:
        n_positive = int((item_dist[algorithm] > 0).sum())
        books_cov.loc[algorithm, eva_str] = n_positive / n_rows

In [None]:
# Display the books coverage table (rows: algorithms, columns: evaluation strategies).
books_cov

In [None]:
# Export the books coverage table as a LaTeX tabular.
with open("latex_tables/book_coverage.tex", "w") as tex_file:
    latex_source = books_cov.to_latex()
    tex_file.write(latex_source)

### Movies

In [None]:
# Item-coverage table for movies: one row per algorithm, one column per
# evaluation strategy. Allocated as float zeros directly, instead of filling an
# empty object-dtype frame with integer 0 and relying on pandas to downcast it
# and later upcast again when float fractions are written in.
movies_cov = pd.DataFrame(0.0, index=algo_names, columns=evaluation)

In [None]:
# Fill movies_cov: for every evaluation strategy, store the fraction of rows in
# the movies table whose value in the algorithm's column is greater than zero.
for combi in some_combinations:
    if "movies" not in combi:
        continue
    eva_str = combi[1]
    item_dist = pd.read_csv(f"{results_location}df_item_dist_cornac_{combi[0]}_{eva_str}.csv", index_col=0)
    n_rows = len(item_dist)
    for algorithm in algo_names:
        n_positive = int((item_dist[algorithm] > 0).sum())
        movies_cov.loc[algorithm, eva_str] = n_positive / n_rows

In [None]:
# Display the movies coverage table (rows: algorithms, columns: evaluation strategies).
movies_cov

In [None]:
# Export the movies coverage table as a LaTeX tabular.
with open("latex_tables/movie_coverage.tex", "w") as tex_file:
    latex_source = movies_cov.to_latex()
    tex_file.write(latex_source)

### Music

In [None]:
# Item-coverage table for music: one row per algorithm, one column per
# evaluation strategy. Allocated as float zeros directly, instead of filling an
# empty object-dtype frame with integer 0 and relying on pandas to downcast it
# and later upcast again when float fractions are written in.
music_cov = pd.DataFrame(0.0, index=algo_names, columns=evaluation)

In [None]:
# Fill music_cov: for every evaluation strategy, store the fraction of rows in
# the music table whose value in the algorithm's column is greater than zero.
for combi in some_combinations:
    if "music" not in combi:
        continue
    eva_str = combi[1]
    item_dist = pd.read_csv(f"{results_location}df_item_dist_cornac_{combi[0]}_{eva_str}.csv", index_col=0)
    n_rows = len(item_dist)
    for algorithm in algo_names:
        n_positive = int((item_dist[algorithm] > 0).sum())
        music_cov.loc[algorithm, eva_str] = n_positive / n_rows

In [None]:
# Display the music coverage table (rows: algorithms, columns: evaluation strategies).
music_cov

In [None]:
# Export the music coverage table as a LaTeX tabular.
with open("latex_tables/music_coverage.tex", "w") as tex_file:
    latex_source = music_cov.to_latex()
    tex_file.write(latex_source)

## Combine tables

In [None]:
# Relabel the correlation tables' columns with a "_corr" suffix so they remain
# distinguishable once placed next to the coverage columns.
corr_column_names = ["eva_one_corr", "eva_two_corr", "eva_three_corr"]
for corr_table in (movies_corr, music_corr, books_corr):
    corr_table.columns = corr_column_names

In [None]:
# Relabel the coverage tables' columns with a "_cov" suffix so they remain
# distinguishable once placed next to the correlation columns.
cov_column_names = ["eva_one_cov", "eva_two_cov", "eva_three_cov"]
for cov_table in (movies_cov, music_cov, books_cov):
    cov_table.columns = cov_column_names

In [None]:
# Put correlation and coverage side by side per dataset, ordering the columns
# so each evaluation strategy's correlation is followed by its coverage.
combined_column_order = [
    "eva_one_corr", "eva_one_cov",
    "eva_two_corr", "eva_two_cov",
    "eva_three_corr", "eva_three_cov",
]
combined_movies = pd.concat([movies_corr, movies_cov], axis=1)[combined_column_order]
combined_music = pd.concat([music_corr, music_cov], axis=1)[combined_column_order]
combined_books = pd.concat([books_corr, books_cov], axis=1)[combined_column_order]

In [None]:
def _with_mean_row(table):
    """Return *table* with a final row labelled "mean" holding each column's mean.

    ``DataFrame.append`` was removed in pandas 2.0, so the row is attached with
    ``pd.concat``. An already present "mean" row is dropped first, which keeps
    the cell safe to re-run without stacking (and averaging in) old mean rows.
    """
    values = table.drop(index="mean", errors="ignore")
    # Series of column means -> one-row frame whose index label is "mean".
    mean_row = values.mean().rename("mean").to_frame().T
    return pd.concat([values, mean_row])


combined_movies = _with_mean_row(combined_movies)
combined_music = _with_mean_row(combined_music)
combined_books = _with_mean_row(combined_books)

In [None]:
# Export the three combined correlation/coverage tables as LaTeX tabulars.
for tex_path, combined_table in (
    ("latex_tables/movies_combined_update.tex", combined_movies),
    ("latex_tables/music_combined_update.tex", combined_music),
    ("latex_tables/books_combined_update.tex", combined_books),
):
    with open(tex_path, "w") as tex_file:
        tex_file.write(combined_table.to_latex())

In [None]:
# Display the combined correlation/coverage table for movies.
combined_movies

## Plot per algorithm

In [None]:
# Display labels used in plot titles: dataset key -> dataset name, and
# evaluation key -> evaluation strategy label.
data_dict = {
    "movies": "MovieLens1M",
    "music": "LastFM",
    "books": "Book-Crossing",
}
eva_dict = {
    "eva_one": "Mod.TrainItems",
    "eva_two": "UserTest",
    "eva_three": "TrainItems",
}

In [None]:
import matplotlib

# Set the tick-label size to 20 on both axes for all plots created afterwards.
for tick_group in ("xtick", "ytick"):
    matplotlib.rc(tick_group, labelsize=20)

In [None]:
# Number of algorithms; the plotting loop below saves one figure per algorithm.
len(algo_names)

In [None]:
# One 3x3 scatter grid per algorithm: each panel plots the "count" column
# against the algorithm's column for one (dataset, evaluation) pair.

# Load every item-distribution table once instead of re-reading the same CSVs
# for each algorithm.
item_dist_tables = [
    pd.read_csv(results_location + "df_item_dist_cornac_" + combi[0] + "_" + combi[1] + ".csv", index_col=0)
    for combi in some_combinations
]

for algo in algo_names:
    fig, ax = plt.subplots(nrows=3, ncols=3)
    fig.set_figheight(20)
    fig.set_figwidth(20)

    # Shared axis captions for the whole grid.
    fig.text(0.5, 0.04, 'Item popularity in profile', ha='center', fontsize='30')
    fig.text(0.04, 0.5, 'Recommendation frequency', va='center', rotation='vertical', fontsize='30')

    for panel, combi, item_dist in zip(ax.flatten(), some_combinations, item_dist_tables):
        # Named `dataset_key` rather than `data`, so this loop does not
        # overwrite the module-level `data` list that other cells index into.
        dataset_key, eva_str = combi[0], combi[1]
        panel.plot(item_dist["count"], item_dist[algo], "o")
        panel.set_title(data_dict[dataset_key] + ", " + eva_dict[eva_str], fontsize='25')

    fig.savefig("graphs/" + algo + "_results.png")
    plt.show()
    # Release the figure so open figures do not accumulate across algorithms.
    plt.close(fig)

# Second RQ

In [None]:
# Build the path of the "low" GAP pickle for the first aspect combination.
# NOTE(review): `path` is not read by any later cell visible here - this looks
# like a leftover sanity check; confirm before removing.
combi = all_combinations[0]
path = results_location+"low_gap_vals_"+combi[0]+"_"+combi[1]+"_"+combi[2]+".pickle"


In [None]:
# Column labels for the GAP tables: every popularity notion combined with each
# of the low/med/high levels, e.g. "pop_one_low".
pop_gaps = [
    notion + "_" + level
    for notion in pop_notion
    for level in ("low", "med", "high")
]

In [None]:
# Display the generated GAP column labels.
pop_gaps

### Music

In [None]:
# One GAP table per evaluation strategy for music: rows are algorithms, columns
# are the popularity-notion/level labels. Allocated as float zeros directly,
# instead of filling an empty object-dtype frame with integer 0 and relying on
# pandas to downcast it and later upcast again when float values are written in.
music_GAPs_eva1 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps)
music_GAPs_eva2 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps)
music_GAPs_eva3 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps)

In [None]:
# Fill the three music GAP tables from the pickled low/med/high value
# sequences stored under results_location.
# NOTE(review): the first two entries of every pickled sequence are skipped,
# exactly as before; presumably they are not per-algorithm values - confirm.
# The pickles are assumed to be trusted output of this project's own pipeline.
music_gap_tables = {"eva_one": music_GAPs_eva1, "eva_two": music_GAPs_eva2}

for combi in all_combinations:
    if "music" not in combi:
        continue
    eva_str, pop_n = combi[1], combi[2]
    # Any evaluation key other than eva_one / eva_two goes to the eva3 table.
    gap_table = music_gap_tables.get(eva_str, music_GAPs_eva3)

    for level in ("low", "med", "high"):
        path = results_location + level + "_gap_vals_music_" + eva_str + "_" + pop_n + ".pickle"
        # Context manager so the file handle is closed as soon as loading ends.
        with open(path, "rb") as pickle_file:
            gap_vals = list(pkl.load(pickle_file)[2:])
        # Keep the strictness of the former one-row DataFrame construction:
        # a value count that does not match the algorithms is an error.
        if len(gap_vals) != len(algo_names):
            raise ValueError(
                path + ": expected " + str(len(algo_names))
                + " values after the first two entries, got " + str(len(gap_vals))
            )
        for algorithm, gap_value in zip(algo_names, gap_vals):
            gap_table.at[algorithm, pop_n + "_" + level] = gap_value

In [None]:
# Display the music eva1 GAP table rounded to one decimal.
np.round(music_GAPs_eva1,1)

In [None]:
# Prefix each music GAP table's column labels with its evaluation key so the
# labels stay unique when the three tables are concatenated.
# Note: running this cell again prefixes the labels a second time.
for gap_table, eva_prefix in (
    (music_GAPs_eva1, "eva_one_"),
    (music_GAPs_eva2, "eva_two_"),
    (music_GAPs_eva3, "eva_three_"),
):
    gap_table.columns = [eva_prefix + label for label in gap_table.columns]

In [None]:
# Place the three music GAP tables side by side (columns concatenated, rows aligned on the index).
combined_music_GAPs = pd.concat([music_GAPs_eva1, music_GAPs_eva2, music_GAPs_eva3], axis=1)

In [None]:
# Round the combined music GAP table to one decimal before export.
combined_music_GAPs = np.round(combined_music_GAPs,1)

In [None]:
# Export the combined music GAP table as a LaTeX tabular.
with open("latex_tables/music_GAPs_combined.tex", "w") as tex_file:
    latex_source = combined_music_GAPs.to_latex()
    tex_file.write(latex_source)

In [None]:
# Export each music GAP table, rounded to one decimal, as a LaTeX tabular.
for tex_path, gap_table in (
    ("latex_tables/music_GAPs_eva1.tex", music_GAPs_eva1),
    ("latex_tables/music_GAPs_eva2.tex", music_GAPs_eva2),
    ("latex_tables/music_GAPs_eva3.tex", music_GAPs_eva3),
):
    with open(tex_path, "w") as tex_file:
        tex_file.write(np.round(gap_table, 1).to_latex())

### Books

In [None]:
# One GAP table per evaluation strategy for books: rows are algorithms, columns
# are all GAP labels except the last three (pop_gaps[:-3]). Allocated as float
# zeros directly, instead of filling an empty object-dtype frame with integer 0
# and relying on pandas to downcast it and later upcast again.
books_GAPs_eva1 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps[:-3])
books_GAPs_eva2 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps[:-3])
books_GAPs_eva3 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps[:-3])

# Fill the tables from the pickled low/med/high value sequences.
# NOTE(review): the first two entries of every pickled sequence are skipped,
# exactly as before; presumably they are not per-algorithm values - confirm.
# The pickles are assumed to be trusted output of this project's own pipeline.
books_gap_tables = {"eva_one": books_GAPs_eva1, "eva_two": books_GAPs_eva2}

for combi in all_combinations:
    if "books" not in combi:
        continue
    eva_str, pop_n = combi[1], combi[2]
    # Any evaluation key other than eva_one / eva_two goes to the eva3 table.
    gap_table = books_gap_tables.get(eva_str, books_GAPs_eva3)

    for level in ("low", "med", "high"):
        path = results_location + level + "_gap_vals_books_" + eva_str + "_" + pop_n + ".pickle"
        # Context manager so the file handle is closed as soon as loading ends.
        with open(path, "rb") as pickle_file:
            gap_vals = list(pkl.load(pickle_file)[2:])
        # Keep the strictness of the former one-row DataFrame construction:
        # a value count that does not match the algorithms is an error.
        if len(gap_vals) != len(algo_names):
            raise ValueError(
                path + ": expected " + str(len(algo_names))
                + " values after the first two entries, got " + str(len(gap_vals))
            )
        for algorithm, gap_value in zip(algo_names, gap_vals):
            gap_table.at[algorithm, pop_n + "_" + level] = gap_value

In [None]:
# Display the books eva1 GAP table as filled by the loading loop.
books_GAPs_eva1

In [None]:
# Prefix each books GAP table's column labels with its evaluation key.
# Note: running this cell again prefixes the labels a second time.
for gap_table, eva_prefix in (
    (books_GAPs_eva1, "eva_one_"),
    (books_GAPs_eva2, "eva_two_"),
    (books_GAPs_eva3, "eva_three_"),
):
    gap_table.columns = [eva_prefix + label for label in gap_table.columns]

In [None]:
# Display the books eva1 GAP table again, now with the evaluation-prefixed column labels.
books_GAPs_eva1

In [None]:
# Export each books GAP table, rounded to one decimal, as a LaTeX tabular.
for tex_path, gap_table in (
    ("latex_tables/books_GAPs_eva1.tex", books_GAPs_eva1),
    ("latex_tables/books_GAPs_eva2.tex", books_GAPs_eva2),
    ("latex_tables/books_GAPs_eva3.tex", books_GAPs_eva3),
):
    with open(tex_path, "w") as tex_file:
        tex_file.write(np.round(gap_table, 1).to_latex())

### Movies

In [None]:
# One GAP table per evaluation strategy for movies: rows are algorithms,
# columns are all GAP labels except the last three (pop_gaps[:-3]). Allocated
# as float zeros directly, instead of filling an empty object-dtype frame with
# integer 0 and relying on pandas to downcast it and later upcast again.
movies_GAPs_eva1 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps[:-3])
movies_GAPs_eva2 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps[:-3])
movies_GAPs_eva3 = pd.DataFrame(0.0, index=algo_names, columns=pop_gaps[:-3])

# Fill the tables from the pickled low/med/high value sequences.
# NOTE(review): the first two entries of every pickled sequence are skipped,
# exactly as before; presumably they are not per-algorithm values - confirm.
# The pickles are assumed to be trusted output of this project's own pipeline.
movies_gap_tables = {"eva_one": movies_GAPs_eva1, "eva_two": movies_GAPs_eva2}

for combi in all_combinations:
    if "movies" not in combi:
        continue
    eva_str, pop_n = combi[1], combi[2]
    # Any evaluation key other than eva_one / eva_two goes to the eva3 table.
    gap_table = movies_gap_tables.get(eva_str, movies_GAPs_eva3)

    for level in ("low", "med", "high"):
        path = results_location + level + "_gap_vals_movies_" + eva_str + "_" + pop_n + ".pickle"
        # Context manager so the file handle is closed as soon as loading ends.
        with open(path, "rb") as pickle_file:
            gap_vals = list(pkl.load(pickle_file)[2:])
        # Keep the strictness of the former one-row DataFrame construction:
        # a value count that does not match the algorithms is an error.
        if len(gap_vals) != len(algo_names):
            raise ValueError(
                path + ": expected " + str(len(algo_names))
                + " values after the first two entries, got " + str(len(gap_vals))
            )
        for algorithm, gap_value in zip(algo_names, gap_vals):
            gap_table.at[algorithm, pop_n + "_" + level] = gap_value

In [None]:
# Prefix each movies GAP table's column labels with its evaluation key.
# Note: running this cell again prefixes the labels a second time.
for gap_table, eva_prefix in (
    (movies_GAPs_eva1, "eva_one_"),
    (movies_GAPs_eva2, "eva_two_"),
    (movies_GAPs_eva3, "eva_three_"),
):
    gap_table.columns = [eva_prefix + label for label in gap_table.columns]

In [None]:
# Display the movies eva3 GAP table with its evaluation-prefixed column labels.
movies_GAPs_eva3

In [None]:
# Export each movies GAP table, rounded to one decimal, as a LaTeX tabular.
for tex_path, gap_table in (
    ("latex_tables/movies_GAPs_eva1.tex", movies_GAPs_eva1),
    ("latex_tables/movies_GAPs_eva2.tex", movies_GAPs_eva2),
    ("latex_tables/movies_GAPs_eva3.tex", movies_GAPs_eva3),
):
    with open(tex_path, "w") as tex_file:
        tex_file.write(np.round(gap_table, 1).to_latex())