# Libraries

In [119]:
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu

In [120]:
# Dataset identifier; it is embedded in every result-file name read below
# and in the combined-metrics file name written later.
data_strategy = "epinion"  # change depending on which dataset we want to examine

# Analysis

## LKPY

In [121]:
# LKPY UserKNN configurations, keyed by algorithm name.
# Generated with min_sim as the outer loop and min_nbrs as the inner loop, so
# the list order is (1, 0), (2, 0), (1, -1), (2, -1) as (min_nbrs, min_sim).
# min_nbrs values 5 and 10 are deliberately not part of the grid (disabled).
algo_versions = {
    "UserKNN": [
        {"min_nbrs": nbrs, "min_sim": sim}
        for sim in (0, -1)
        for nbrs in (1, 2)
    ],
}

In [122]:
# Pick the algorithm to analyse and fetch its list of configurations.
algo_name = "UserKNN"
versions = algo_versions[algo_name]

In [123]:
# Directory (with trailing slash) holding this algorithm's pickled results.
file_location = f"experimental_results/{algo_name}/"

In [124]:
# Load one pickled metrics object per configuration, in the order of `versions`.
# File name pattern: <file_location><data_strategy>_<str(args)>.pkl
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
results = []
for args in versions:
    # `with` guarantees the handle is closed, even if unpickling raises.
    with open(file_location + data_strategy + "_" + str(args) + ".pkl", "rb") as file:
        result = pkl.load(file)
    results.append(result)

In [125]:
# String form of every configuration dict (the same text used in the file names).
stringed_versions = list(map(str, versions))

In [126]:
# Recover the two parameter values, as strings, from each stringified config,
# e.g. "{'min_nbrs': 1, 'min_sim': 0}" -> "1" and "0".
min_nbrs = []
min_sim = []

for version_str in stringed_versions:
    fields = version_str.split(", ")

    # The value is the last space-separated token of each "key: value" field.
    min_nbrs.append(fields[0].rsplit(" ", 1)[-1])
    # The second field still carries the closing brace; keep what precedes it.
    min_sim.append(fields[1].rsplit(" ", 1)[-1].partition("}")[0])

In [127]:
# Value(s) for the third index level ("OverCommon") of the LKPY results.
over_common = ["False"]

In [128]:
# Build one index row per loaded configuration by pairing the i-th min_sim
# with the i-th min_nbrs (crossed with every over_common value).
# A Cartesian from_product(...).drop_duplicates() only lines up with `results`
# when `versions` is a complete grid listed in product order; any other
# ordering would silently attach the wrong labels to the rows.
index = pd.MultiIndex.from_tuples(
    [
        (sim, nbrs, flag)
        for sim, nbrs in zip(min_sim, min_nbrs)
        for flag in over_common
    ],
    names=["MinimumSimilarity", "MinimumNeighbours", "OverCommon"],
)

In [129]:
# Tabulate the loaded LKPY results, one row per entry of `results`, labeled by `index`.
results_lkpy = pd.DataFrame(results, index=index)

In [130]:
# Display the LKPY metrics table.
results_lkpy

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pop_corr,RMSE,NDCG,ARP,ave_PL,ACLT,AggDiv
MinimumSimilarity,MinimumNeighbours,OverCommon,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,False,"(0.04319430702198934, 2.4227248200698596e-122)",1.148,0.000248,0.000645,65.526848,1.488664,0.039699
0,2,False,"(0.15316607258667803, 0.0)",1.108,0.000502,0.001322,165.929066,0.0,0.047412
-1,1,False,"(0.02309613734937969, 2.9662866087745307e-36)",1.212,3.6e-05,0.000462,36.538499,3.24351,0.032338
-1,2,False,"(0.17296228008058673, 0.0)",1.168,0.000261,0.001141,176.223702,0.0,0.054999


In [131]:
# Load the detailed per-item results for every LKPY configuration, in the
# order of `versions`.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
lkpy_detailed_results = []
for args in versions:
    # `with` guarantees the handle is closed, even if unpickling raises.
    with open(file_location + 'correct_detailed_per_item_'+data_strategy + "_" + str(args) + ".pkl", "rb") as file:
        result = pkl.load(file)
    lkpy_detailed_results.append(result)

## Cornac

In [132]:
# Cornac UserKNN is evaluated with a single configuration.
# NOTE: this rebinds `algo_versions` and `algo_name` used for the LKPY
# section, so the LKPY cells must not be re-run after this one without
# re-running their own setup first.
algo_versions = {
    "CornacUserKNN": [{"center": True}],
}
algo_name = "CornacUserKNN"

In [133]:
# Configurations of the currently selected algorithm.
versions = algo_versions[algo_name]

In [134]:
# Directory (with trailing slash) holding this algorithm's pickled results.
file_location = f"experimental_results/{algo_name}/"

In [135]:
# Load one pickled metrics object per configuration, in the order of `versions`.
# File name pattern: <file_location><data_strategy>_<str(args)>.pkl
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
results = []
for args in versions:
    # `with` guarantees the handle is closed, even if unpickling raises.
    with open(file_location + data_strategy + "_" + str(args) + ".pkl", "rb") as file:
        result = pkl.load(file)
    results.append(result)

In [136]:
# String form of every configuration dict (the same text used in the file names).
stringed_versions = list(map(str, versions))

In [137]:
# Each config string has a single "key: value" pair, e.g. "{'center': True}".
# Its value text is stored in the MinimumNeighbours slot, and the
# MinimumSimilarity slot is fixed to "-1" for every configuration.
min_nbrs = []
min_sim = []
for version_str in stringed_versions:
    # Text after the last ": ", up to the closing brace.
    value_token = version_str.split(": ")[-1].partition("}")[0]
    min_nbrs.append(value_token)
    min_sim.append("-1")

In [138]:
# Value(s) for the third index level ("OverCommon") of the Cornac results.
over_common = ["True"]

In [139]:
# Build one index row per loaded configuration by pairing the i-th min_sim
# with the i-th min_nbrs (crossed with every over_common value).
# A Cartesian from_product(...).drop_duplicates() only lines up with `results`
# when the parameters form a complete grid listed in product order; pairing
# row-wise keeps labels and results aligned for any configuration list.
index = pd.MultiIndex.from_tuples(
    [
        (sim, nbrs, flag)
        for sim, nbrs in zip(min_sim, min_nbrs)
        for flag in over_common
    ],
    names=["MinimumSimilarity", "MinimumNeighbours", "OverCommon"],
)

In [140]:
# Tabulate the loaded Cornac results, one row per entry of `results`, labeled by `index`.
results_cornac = pd.DataFrame(results, index=index)

In [141]:
# Relabel index level 1 ("MinimumNeighbours"): the parsed text "True" becomes "1".
# NOTE(review): presumably so the Cornac row shares the "1" label used by the
# LKPY rows when both tables are merged - confirm.
neighbour_labels = results_cornac.index.levels[1].str.replace("True", "1")
results_cornac.index = results_cornac.index.set_levels(neighbour_labels, level=1)

In [142]:
# Display the Cornac metrics table.
results_cornac

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pop_corr,RMSE,NDCG,ARP,ave_PL,ACLT,AggDiv
MinimumSimilarity,MinimumNeighbours,OverCommon,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,1,True,"(0.019650605461640824, 1.0504277226394118e-26)",1.153543,2.8e-05,0.000412,20.7889,3.699153,0.036284


In [143]:
# Load the detailed per-item results for every Cornac configuration, in the
# order of `versions`.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
cornac_detailed_results = []
for args in versions:
    # `with` guarantees the handle is closed, even if unpickling raises.
    with open(file_location + 'correct_detailed_per_item_'+data_strategy + "_" + str(args) + ".pkl", "rb") as file:
        result = pkl.load(file)
    cornac_detailed_results.append(result)

In [144]:
# Inspect the configuration list currently bound to `versions` (the Cornac one).
versions

[{'center': True}]

# Merge

In [145]:
# Stack the LKPY and Cornac tables, then sort and re-index them by
# (MinimumSimilarity, OverCommon, MinimumNeighbours).
sort_keys = ["MinimumSimilarity", "OverCommon", "MinimumNeighbours"]
combined_metrics = pd.concat([results_lkpy, results_cornac]).reset_index()
user_knn_metrics = combined_metrics.sort_values(sort_keys).set_index(sort_keys)

In [146]:
# Column order used for reporting; selecting with this list reorders the columns.
metrics_order = ["pop_corr", "ARP", "ave_PL", "ACLT", "AggDiv", "RMSE", "NDCG"]
user_knn_metrics = user_knn_metrics[metrics_order]

In [147]:
# Rename the metric columns to their reporting names, then reindex index
# level 2 (MinimumNeighbours) against the labels "1", "2", "5", "10".
user_knn_metrics = user_knn_metrics.rename(
    columns={"pop_corr": "PopCorr", "ave_PL": "PL", "ACLT": "APLT", "NDCG": "NDCG@10"}
).reindex(["1", "2", "5", "10"], level=2)

In [148]:
# Each PopCorr cell is a pair; element 0 becomes the reported PopCorr value and
# element 1 (presumably the p-value - confirm) is compared with 0.005 to fill
# the boolean 'Significance' column.
# NOTE: re-running this cell fails, because PopCorr no longer holds pairs afterwards.
pop_corr_pairs = user_knn_metrics.PopCorr
corr_values = pop_corr_pairs.apply(lambda pair: pair[0])
is_significant = pop_corr_pairs.apply(lambda pair: bool(pair[1] < 0.005))
user_knn_metrics['Significance'] = is_significant
user_knn_metrics['PopCorr'] = corr_values

In [149]:
# Display the merged metrics table.
user_knn_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PopCorr,ARP,PL,APLT,AggDiv,RMSE,NDCG@10,Significance
MinimumSimilarity,OverCommon,MinimumNeighbours,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-1,False,1,0.023096,0.000462,36.538499,3.24351,0.032338,1.212,3.6e-05,True
-1,False,2,0.172962,0.001141,176.223702,0.0,0.054999,1.168,0.000261,True
-1,True,1,0.019651,0.000412,20.7889,3.699153,0.036284,1.153543,2.8e-05,True
0,False,1,0.043194,0.000645,65.526848,1.488664,0.039699,1.148,0.000248,True
0,False,2,0.153166,0.001322,165.929066,0.0,0.047412,1.108,0.000502,True


In [150]:
# Emit the metrics as a LaTeX table, rounded to 3 decimals and without the
# APLT and Significance columns.
print(user_knn_metrics.drop(['APLT','Significance'], axis=1).round(3).to_latex())

\begin{tabular}{lllrrrrrr}
\toprule
  &       &   &  PopCorr &    ARP &       PL &  AggDiv &   RMSE &  NDCG@10 \\
MinimumSimilarity & OverCommon & MinimumNeighbours &          &        &          &         &        &          \\
\midrule
-1 & False & 1 &    0.023 &  0.000 &   36.538 &   0.032 &  1.212 &    0.000 \\
  &       & 2 &    0.173 &  0.001 &  176.224 &   0.055 &  1.168 &    0.000 \\
  & True & 1 &    0.020 &  0.000 &   20.789 &   0.036 &  1.154 &    0.000 \\
0 & False & 1 &    0.043 &  0.001 &   65.527 &   0.040 &  1.148 &    0.000 \\
  &       & 2 &    0.153 &  0.001 &  165.929 &   0.047 &  1.108 &    0.001 \\
\bottomrule
\end{tabular}



In [151]:
import pickle

# Persist the merged metrics (rounded to 3 decimals) for this dataset.
output_path = f"metrics_combined/{data_strategy}_all_user_knn.pkl"
with open(output_path, "wb") as out_file:
    rounded_metrics = user_knn_metrics.round(3)
    pickle.dump(rounded_metrics, out_file)

# Significance tests

## 1. Average Recommendation Popularity
Compared on the per-item `recommendation` values of each configuration.

In [152]:
# Detailed per-item results of all configurations: the Cornac entries first,
# followed by the LKPY entries. Positions in this list are the indices that
# the significance tests print.
# NOTE(review): despite the `mf_` prefix, these are the UserKNN results loaded
# earlier in this notebook - confirm the naming.
mf_results = cornac_detailed_results+lkpy_detailed_results

In [158]:
def highest_average(df_list, column_name = 'recommendation'):
    """Find the entry of `df_list` whose `column_name` values have the largest mean.

    Prints each mean as it is computed, then the winning mean and its position.

    Parameters
    ----------
    df_list : sequence
        Objects supporting ``obj[column_name].values`` (e.g. pandas DataFrames).
    column_name : str
        Column whose values are averaged.

    Returns
    -------
    tuple
        ``(highest_mean, index_of_highest)``. For an empty `df_list` this is
        ``(-inf, -1)``.
    """
    # Start from -inf so that any real mean wins. The former `-10^6` was a
    # bitwise XOR (== -16), so lists whose means were all <= -16 never
    # produced a winner and returned index -1.
    best_mean = float('-inf')
    best_i = -1
    for i, df in enumerate(df_list):
        mean = np.mean(df[column_name].values)
        print('mean', mean)
        # Strict comparison: on ties the earliest entry is kept.
        if mean > best_mean:
            best_mean = mean
            best_i = i
    print(best_mean, best_i)
    return best_mean, best_i

In [159]:
def mannwhitneyu_test(df_list, alt = 'greater', column_name = 'recommendation'):
    """Mann-Whitney U test of the highest-mean entry against every other entry.

    The entry of `df_list` with the largest mean of `column_name` (as chosen
    by `highest_average`) is used as the first sample; each remaining entry is
    used in turn as the second sample. The indices of the remaining entries
    are printed.

    Parameters
    ----------
    df_list : sequence
        Objects supporting ``obj[column_name].values``.
    alt : str
        Passed to ``scipy.stats.mannwhitneyu`` as ``alternative``.
    column_name : str
        Column whose values are compared.

    Returns
    -------
    list
        One p-value per comparison, in the order of the printed indices.
    """
    _, best_idx = highest_average(df_list, column_name)
    positions = list(range(len(df_list)))
    # Everything before and after the best entry, keeping the original order.
    to_test_inds = positions[:best_idx] + positions[best_idx + 1:]
    print(to_test_inds)
    reference = df_list[best_idx]
    return [
        mannwhitneyu(
            reference[column_name].values,
            df_list[other_idx][column_name].values,
            alternative = alt,
        )[1]
        for other_idx in to_test_inds
    ]

In [160]:
# Run the test on the default 'recommendation' column with the default
# one-sided 'greater' alternative; the cell output is the list of p-values.
mannwhitneyu_test(mf_results) 

mean 0.000412353394733847
mean 0.0006446839513334677
mean 0.0013219377475777593
mean 0.0004624228343330949
mean 0.00114118929231133
0.0013219377475777593 2
[0, 1, 3, 4]


[0.0, 0.0, 0.0, 5.27329056084729e-285]

## 2. Popularity Lift

In [161]:
# Add a 'popularity_lift' column to every detailed result, in place:
# the percentage change of 'recommendation' relative to 'profile'.
for df in mf_results:
    rec_values = df['recommendation']
    profile_values = df['profile']
    df['popularity_lift'] = (rec_values - profile_values) / profile_values * 100

In [162]:
# Same test as for 'recommendation', now on the 'popularity_lift' column.
mannwhitneyu_test(mf_results, column_name = 'popularity_lift') 

mean 20.78888848955298
mean 65.52538953014442
mean 165.9233399385714
mean 36.53551366046039
mean 176.2273062118962
176.2273062118962 4
[0, 1, 2, 3]


[0.0, 1.1990665840226018e-152, 1.0, 0.0]