# Libraries

In [38]:
import pickle as pkl
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

In [2]:
# Algorithm whose result files are loaded in the LKPY section.
algo_name = "UserKNN"
# Data scenarios; each name is part of the result file names that are read.
data_strategies = [
    "uniformly_random",
    "popularity_good",
    "popularity_bad",
    "popularity_good_for_bp_ur",
    "popularity_bad_for_bp_ur",
]

# Analysis

## LKPY

In [3]:
# Settings evaluated per algorithm. str() of each dict is embedded in the
# result file names, so the key order and values here must match those files.
algo_versions = {
    "UserKNN": [
        {"min_nbrs": 1, "min_sim": 0},
        {"min_nbrs": 2, "min_sim": 0},
        {"min_nbrs": 1, "min_sim": -1},
        {"min_nbrs": 2, "min_sim": -1},
    ],
}

In [4]:
# Settings list for the currently selected algorithm.
versions = algo_versions[algo_name]

In [5]:
# Directory prefix under which this algorithm's pickled results are read.
file_location = "experimental_results/" + algo_name + "/"

In [6]:
# Load one pickled result per (data strategy, settings) pair. The nesting order
# (strategy outer, settings inner) is the row order of `results`.
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by trusted runs.
results = []
for data_strategy in data_strategies:
    for args in versions:
        result_path = file_location + data_strategy + "_" + str(args) + ".pkl"
        # Context manager guarantees the file handle is closed after loading.
        with open(result_path, "rb") as file:
            result = pkl.load(file)
        results.append(result)

In [7]:
# String form of every settings dict (e.g. "{'min_nbrs': 1, 'min_sim': 0}").
stringed_versions = [str(args) for args in versions]

In [8]:
# Parameter values (kept as strings) recovered from the stringified settings,
# one entry per version. Both lists must exist before the loop appends to them.
min_sim = []
min_nbrs = []

# Each string looks like "{'min_nbrs': 1, 'min_sim': 0}": split it on ", " and
# keep the token after the last space of each half (stripping the closing "}").
for s in stringed_versions:
    parts = s.split(", ")

    min_nbrs.append(parts[0].split(" ")[-1])
    min_sim.append(parts[1].split(" ")[-1].split("}")[0])

In [9]:
# Single label used for the "OverCommon" index level of the LKPY results.
over_common = ["False"]

In [10]:
# Cartesian product of the level values; repeated entries in min_sim/min_nbrs
# produce duplicate tuples, which are removed afterwards.
index_levels = [data_strategies, min_sim, min_nbrs, over_common]
index_names = ["DataStrategy", "MinimumSimilarity", "MinimumNeighbours", "OverCommon"]
index = pd.MultiIndex.from_product(index_levels, names=index_names)
index = index.drop_duplicates()

In [11]:
# One row per loaded result, labelled positionally by the index built from the
# strategy/settings values (row order must match the load order).
results_lkpy = pd.DataFrame(results, index=index)

In [12]:
# Display the LKPY metrics table.
results_lkpy

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pop_corr,RMSE,NDCG,ARP,ave_PL,ACLT,AggDiv
DataStrategy,MinimumSimilarity,MinimumNeighbours,OverCommon,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
uniformly_random,0,1,False,"(0.10110466641885223, 3.420017408632659e-17)",3.624,0.001672,0.002723,-12.827119,2.745972,0.340124
uniformly_random,0,2,False,"(0.6149052740925817, 0.0)",3.464,0.00482,0.005088,65.440196,1.071823,0.595145
uniformly_random,-1,1,False,"(0.018078800576386483, 0.13261414434946503)",3.502,0.000794,0.002059,-32.284513,3.762652,0.400231
uniformly_random,-1,2,False,"(0.41822601518091973, 3.097865376849624e-291)",3.352,0.003087,0.003528,21.251888,1.935323,0.680826
popularity_good,0,1,False,"(0.5519601545626697, 0.0)",1.04,0.023409,0.027255,632.300469,0.03901,0.080046
popularity_good,0,2,False,"(0.5620146001718291, 0.0)",1.026,0.024699,0.027052,591.965761,0.033348,0.172518
popularity_good,-1,1,False,"(0.5955040638109448, 0.0)",1.188,0.01902,0.020677,426.621401,0.59665,0.196648
popularity_good,-1,2,False,"(0.6143667114262298, 0.0)",1.19,0.020719,0.022179,447.618276,0.196923,0.339257
popularity_bad,0,1,False,"(0.0247094973664384, 0.0398221911175794)",1.044,0.001087,0.00213,-35.765099,3.014387,0.511776
popularity_bad,0,2,False,"(0.16076461069841927, 2.7024931290302406e-41)",1.034,0.004129,0.002811,-13.099813,1.781204,0.688195


In [13]:
# Map each data strategy to the list of its detailed per-item results, one
# entry per settings dict in `versions` (same order).
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by trusted runs.
lkpy_dict_detailed = {}
for data_strategy in data_strategies:
    lkpy_detailed_results = []
    for args in versions:
        detailed_path = file_location + 'correct_detailed_per_item_'+data_strategy + "_" + str(args) + ".pkl"
        # Context manager guarantees the file handle is closed after loading.
        with open(detailed_path, "rb") as file:
            result = pkl.load(file)
        lkpy_detailed_results.append(result)
    lkpy_dict_detailed[data_strategy] = lkpy_detailed_results

## Cornac

In [15]:
# Switch to the Cornac run: this rebinds `algo_versions` and `algo_name`,
# replacing the values used for the LKPY section.
algo_versions = {
    "CornacUserKNN": [{"center": True}],
}
algo_name = "CornacUserKNN"

In [16]:
# Settings list for the Cornac algorithm (rebinds `versions`).
versions = algo_versions[algo_name]

In [17]:
# Directory prefix for the Cornac result files (rebinds `file_location`).
file_location = "experimental_results/" + algo_name + "/"

In [18]:
# Load one pickled result per (data strategy, settings) pair for Cornac.
# The nesting order (strategy outer, settings inner) is the row order.
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by trusted runs.
results = []
for data_strategy in data_strategies:
    for args in versions:
        result_path = file_location + data_strategy + "_" + str(args) + ".pkl"
        # Context manager guarantees the file handle is closed after loading.
        with open(result_path, "rb") as file:
            result = pkl.load(file)
        results.append(result)

In [19]:
# String form of every Cornac settings dict (e.g. "{'center': True}").
stringed_versions = [str(args) for args in versions]

In [20]:
# Build the index-level values for the Cornac rows.
# `min_nbrs` receives the text after the last ": " of each settings string with
# the closing "}" stripped -- for "{'center': True}" that is "True", i.e. the
# `center` value rather than a neighbour count.
min_nbrs = [text.split(": ")[-1].split("}")[0] for text in stringed_versions]
# Every Cornac version is labelled with a minimum similarity of "-1".
min_sim = ["-1" for _ in stringed_versions]

In [21]:
# Single label used for the "OverCommon" index level of the Cornac results.
over_common = ["True"]

In [22]:
# Cartesian product of the level values for the Cornac rows, with any
# duplicate tuples removed.
index_levels = [data_strategies, min_sim, min_nbrs, over_common]
index_names = ["DataStrategy", "MinimumSimilarity", "MinimumNeighbours", "OverCommon"]
index = pd.MultiIndex.from_product(index_levels, names=index_names)
index = index.drop_duplicates()

In [23]:
# One row per loaded Cornac result, labelled positionally by the index above.
results_cornac = pd.DataFrame(results, index=index)

In [24]:
# Relabel index level 2 ("MinimumNeighbours"): take the labels of level 1
# ("MinimumSimilarity"), replace "-1" with "1", and install the result as the
# labels of level 2.
# NOTE(review): reading from level 1 while writing level 2 looks unintentional;
# it only yields "1" because level 1 holds the single label "-1" -- confirm.
results_cornac.index = results_cornac.index.set_levels(
    results_cornac.index.levels[1].str.replace("-1", "1"), level=2
)

In [25]:
# Map each data strategy to the list of its detailed per-item Cornac results,
# one entry per settings dict in `versions` (same order).
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by trusted runs.
cornac_dict_detailed = {}
for data_strategy in data_strategies:
    cornac_detailed_results = []
    for args in versions:
        detailed_path = file_location + 'correct_detailed_per_item_'+data_strategy + "_" + str(args) + ".pkl"
        # Context manager guarantees the file handle is closed after loading.
        with open(detailed_path, "rb") as file:
            result = pkl.load(file)
        cornac_detailed_results.append(result)
    cornac_dict_detailed[data_strategy] = cornac_detailed_results

# Merge

In [26]:
# Stack the LKPY and Cornac tables, sort the rows by the (string) index values,
# re-index with OverCommon ahead of MinimumNeighbours, and finally restore the
# scenario order given by `data_strategies`.
index_order = ["DataStrategy", "MinimumSimilarity", "OverCommon", "MinimumNeighbours"]
combined_metrics = pd.concat([results_lkpy, results_cornac]).reset_index()
combined_metrics = combined_metrics.sort_values(index_order).set_index(index_order)
user_knn_metrics = combined_metrics.reindex(data_strategies, level=0)

In [27]:
# Column order for the report table.
metrics_order = ["pop_corr", "ARP", "ave_PL", "ACLT", "AggDiv", "RMSE", "NDCG"]
user_knn_metrics = user_knn_metrics[metrics_order]

In [28]:
# Rename metric columns to their report names.
user_knn_metrics = user_knn_metrics.rename(
    columns={"pop_corr": "PopCorr", "ave_PL": "PL", "ACLT": "APLT", "NDCG": "NDCG@10"}
)

In [29]:
# Replace the data-strategy index labels with numbered scenario names.
user_knn_metrics = user_knn_metrics.rename(
    index={
        "uniformly_random": "Scenario 1",
        "popularity_good": "Scenario 2",
        "popularity_bad": "Scenario 3",
        "popularity_good_for_bp_ur": "Scenario 4",
        "popularity_bad_for_bp_ur": "Scenario 5",
    }
)

In [30]:
# Order index level 3 (MinimumNeighbours) as "1", "2", "5", "10"; the labels are
# strings, so this fixes the order explicitly instead of relying on sorting.
user_knn_metrics = user_knn_metrics.reindex(["1", "2", "5", "10"], level=3)

In [31]:
# PopCorr cells are pairs -- presumably (correlation, p-value); confirm against
# the code that produced the pickles. Keep element 0 as PopCorr and flag rows
# whose element 1 is below 0.005 in a new Significance column.
pop_corr_pairs = user_knn_metrics.PopCorr
user_knn_metrics['Significance'] = pop_corr_pairs.apply(lambda pair: bool(pair[1] < 0.005))
user_knn_metrics['PopCorr'] = pop_corr_pairs.apply(lambda pair: pair[0])

In [32]:
# Display the combined LKPY + Cornac metrics table.
user_knn_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PopCorr,ARP,PL,APLT,AggDiv,RMSE,NDCG@10,Significance
DataStrategy,MinimumSimilarity,OverCommon,MinimumNeighbours,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Scenario 1,-1,False,1,0.018079,0.002059,-32.284513,3.762652,0.400231,3.502,0.000794,False
Scenario 1,-1,False,2,0.418226,0.003528,21.251888,1.935323,0.680826,3.352,0.003087,True
Scenario 1,-1,True,1,0.003513,0.001959,-35.745549,3.923726,0.400376,3.337384,0.000798,False
Scenario 1,0,False,1,0.101105,0.002723,-12.827119,2.745972,0.340124,3.624,0.001672,True
Scenario 1,0,False,2,0.614905,0.005088,65.440196,1.071823,0.595145,3.464,0.00482,True
Scenario 2,-1,False,1,0.595504,0.020677,426.621401,0.59665,0.196648,1.188,0.01902,True
Scenario 2,-1,False,2,0.614367,0.022179,447.618276,0.196923,0.339257,1.19,0.020719,True
Scenario 2,-1,True,1,0.604307,0.015046,305.197289,1.600833,0.238405,1.149931,0.012895,True
Scenario 2,0,False,1,0.55196,0.027255,632.300469,0.03901,0.080046,1.04,0.023409,True
Scenario 2,0,False,2,0.562015,0.027052,591.965761,0.033348,0.172518,1.026,0.024699,True


In [33]:
# Emit the table as LaTeX, without the APLT and Significance columns and with
# values rounded to three decimals.
print(user_knn_metrics.drop(['APLT','Significance'], axis=1).round(3).to_latex())

\begin{tabular}{llllrrrrrr}
\toprule
           &   &       &   &  PopCorr &    ARP &       PL &  AggDiv &   RMSE &  NDCG@10 \\
DataStrategy & MinimumSimilarity & OverCommon & MinimumNeighbours &          &        &          &         &        &          \\
\midrule
Scenario 1 & -1 & False & 1 &    0.018 &  0.002 &  -32.285 &   0.400 &  3.502 &    0.001 \\
           &   &       & 2 &    0.418 &  0.004 &   21.252 &   0.681 &  3.352 &    0.003 \\
           &   & True & 1 &    0.004 &  0.002 &  -35.746 &   0.400 &  3.337 &    0.001 \\
           & 0 & False & 1 &    0.101 &  0.003 &  -12.827 &   0.340 &  3.624 &    0.002 \\
           &   &       & 2 &    0.615 &  0.005 &   65.440 &   0.595 &  3.464 &    0.005 \\
Scenario 2 & -1 & False & 1 &    0.596 &  0.021 &  426.621 &   0.197 &  1.188 &    0.019 \\
           &   &       & 2 &    0.614 &  0.022 &  447.618 &   0.339 &  1.190 &    0.021 \\
           &   & True & 1 &    0.604 &  0.015 &  305.197 &   0.238 &  1.150 &    0.013 \\
     

In [34]:
import pickle

# Build the frame to persist (APLT removed, values rounded to three decimals)
# before opening the output file, so that a failure here cannot leave a
# truncated pickle behind.
metrics_to_save = user_knn_metrics.drop("APLT", axis=1).round(3)

with open("metrics_combined/all_user_knn.pkl", "wb") as f:
    pickle.dump(metrics_to_save, f)

# Significance tests

## 1. Average Recommendation Popularity
Mann–Whitney U tests on the per-item `recommendation` values.

In [40]:
def highest_average(df_list, column_name = 'recommendation'):
    """Find the entry of `df_list` whose `column_name` column has the largest mean.

    Parameters
    ----------
    df_list : sequence
        Objects supporting ``df[column_name].values`` (e.g. DataFrames).
    column_name : str
        Name of the column to average.

    Returns
    -------
    tuple
        ``(best_mean, best_index)``: the largest mean and its position in
        `df_list`. The first entry wins ties. For an empty `df_list` the
        result is ``(-inf, -1)``.

    The pair is also printed before it is returned.
    """
    # Start from -inf so that lists whose means are all strongly negative are
    # handled correctly (any finite sentinel could beat every real mean).
    best_mean = float("-inf")
    best_index = -1
    for i, df in enumerate(df_list):
        mean = np.mean(df[column_name].values)
        if mean > best_mean:
            best_mean = mean
            best_index = i
    print(best_mean, best_index)
    return best_mean, best_index

In [41]:
def mannwhitneyu_test(df_list, alt = 'greater', column_name = 'recommendation'):
    """Compare the highest-mean entry of `df_list` against every other entry.

    The reference entry is the one picked by `highest_average`. Its
    `column_name` values are passed as the first sample to
    `scipy.stats.mannwhitneyu`, each remaining entry as the second sample.

    Parameters
    ----------
    df_list : sequence
        Objects supporting ``df[column_name].values`` (e.g. DataFrames).
    alt : str
        Forwarded as the ``alternative`` argument of `mannwhitneyu`.
    column_name : str
        Column whose values are compared.

    Returns
    -------
    list
        One p-value per non-reference entry, in `df_list` order. The values
        are returned as computed; no multiple-comparison correction is applied.

    The indices being tested are printed as a side effect.
    """
    _, hi = highest_average(df_list, column_name)

    # Every position except the reference one, preserving order.
    positions = list(range(len(df_list)))
    to_test_inds = positions[:hi] + positions[hi+1:]
    print(to_test_inds)

    # The reference sample is the same for every comparison.
    reference = df_list[hi][column_name].values
    return [
        mannwhitneyu(reference, df_list[ind][column_name].values, alternative = alt)[1]
        for ind in to_test_inds
    ]

In [43]:
# For each scenario, pool the Cornac and LKPY detailed results (Cornac first)
# and test the highest-mean 'recommendation' distribution against the others.
for data_strategy in data_strategies:
    detailed_frames = cornac_dict_detailed[data_strategy] + lkpy_dict_detailed[data_strategy]
    print(data_strategy)
    pvalues = mannwhitneyu_test(detailed_frames)
    print(pvalues)

uniformly_random
0.005088297816220186 2
[0, 1, 3, 4]
[0.0, 0.0, 0.0, 0.0]
popularity_good
0.02725447453965516 1
[0, 2, 3, 4]
[0.0, 1.0, 0.0, 0.0]
popularity_bad
0.008210152178837216 4
[0, 1, 2, 3]
[1.216560222449918e-85, 0.0, 1.3385517982792135e-307, 4.687712667542103e-27]
popularity_good_for_bp_ur
0.013732259968235063 2
[0, 1, 3, 4]
[0.0, 0.0, 0.0, 0.0]
popularity_bad_for_bp_ur
0.0046237003449702396 2
[0, 1, 3, 4]
[0.0, 0.0, 0.0, 3.604033785181491e-49]


## 2. Popularity Lift

In [44]:
# For each scenario, add a 'popularity_lift' column (percentage change of
# 'recommendation' relative to 'profile') to every detailed frame in place,
# then test the highest-mean lift distribution against the others.
for data_strategy in data_strategies:
    detailed_frames = cornac_dict_detailed[data_strategy] + lkpy_dict_detailed[data_strategy]
    for frame in detailed_frames:
        difference = frame['recommendation'] - frame['profile']
        frame['popularity_lift'] = difference / frame['profile'] * 100
    pvalues = mannwhitneyu_test(detailed_frames, column_name = 'popularity_lift')
    print(pvalues)

65.43830590265048 2
[0, 1, 3, 4]
[0.0, 2.188941089036555e-258, 0.0, 7.72400099276303e-108]
632.2824421667686 1
[0, 2, 3, 4]
[0.0, 0.015732623224987266, 2.9669910892081154e-238, 2.7443075046547103e-154]
192.12799583548045 4
[0, 1, 2, 3]
[1.240145514463707e-36, 0.0, 4.0901184951709577e-150, 2.279407800573683e-12]
297.0477047416728 2
[0, 1, 3, 4]
[0.0, 8.09724058167196e-264, 0.0, 0.0]
57.96919856927836 4
[0, 1, 2, 3]
[1.3016235627164633e-199, 8.480723094484795e-77, 1.0, 1.0303011344870575e-153]
