In [1]:
import pandas as pd
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel


Read the recommendation performance after 5 runs

In [17]:
nrms_random = pd.read_csv ("results/Random-NRMS.csv", sep = ";")
nrms_greedy = pd.read_csv ("results/Greedy-NRMS.csv", sep = ";")

lstur_random = pd.read_csv ("results/Random-lstur.csv", sep = ";")
lstur_greedy = pd.read_csv ("results/Greedy-lstur.csv", sep = ";")

NPA_random = pd.read_csv ("results/Random-NPA.csv", sep = ";")
NPA_greedy = pd.read_csv ("results/Greedy-NPA.csv", sep = ";")

In [30]:
# Convert values to numeric (same as before)
def convert_to_numeric(df):
    df['ndcg@5'] = df['ndcg@5'].str.replace(',', '.').astype(float)
    df['ndcg@10'] = df['ndcg@10'].str.replace(',', '.').astype(float)
    df['MRR'] = df['MRR'].str.replace(',', '.').astype(float)
    return df

# Calculate mean, std, and perform paired t-tests
def calculate_mean_std_and_ttests(df):
    # Group by steps and pre-processing, then calculate mean and std
    grouped_df = df.groupby(['steps', 'Pre-processing']).agg({
        'ndcg@5': ['mean', 'std'],
        'ndcg@10': ['mean', 'std'],
        'MRR': ['mean', 'std']
    }).reset_index()
    
    # Flatten the MultiIndex columns
    grouped_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in grouped_df.columns.values]

    # Initialize lists to store t-test results
    ttest_results = {
        'steps': [],
        'Pre-processing': [],
        'ndcg@5_p-value': [],
        'ndcg@10_p-value': [],
        'MRR_p-value': []
    }

    # Perform paired t-tests comparing Pre-processing = 0 with other Pre-processing levels
    for step in df['steps'].unique():
        df_step = df[df['steps'] == step]
        baseline = df_step[df_step['Pre-processing'] == 0]
        
        for pre in [1, 2, 5, 10]:
            comparison = df_step[df_step['Pre-processing'] == pre]
            
            if not baseline.empty and not comparison.empty:
                ttest_results['steps'].append(step)
                ttest_results['Pre-processing'].append(pre)
                
                # Perform t-tests for ndcg@5, ndcg@10, and MRR
                ttest_results['ndcg@5_p-value'].append(ttest_rel(baseline['ndcg@5'], comparison['ndcg@5'])[1])
                ttest_results['ndcg@10_p-value'].append(ttest_rel(baseline['ndcg@10'], comparison['ndcg@10'])[1])
                ttest_results['MRR_p-value'].append(ttest_rel(baseline['MRR'], comparison['MRR'])[1])
    
    ttest_df = pd.DataFrame(ttest_results)
    
    return grouped_df, ttest_df

# Load and convert your dataframes
nrms_random = convert_to_numeric(pd.read_csv("results/Random-NRMS.csv", sep=";"))
nrms_greedy = convert_to_numeric(pd.read_csv("results/Greedy-NRMS.csv", sep=";"))

lstur_random = convert_to_numeric(pd.read_csv("results/Random-lstur.csv", sep=";"))
lstur_greedy = convert_to_numeric(pd.read_csv("results/Greedy-lstur.csv", sep=";"))

NPA_random = convert_to_numeric(pd.read_csv("results/Random-NPA.csv", sep=";"))
NPA_greedy = convert_to_numeric(pd.read_csv("results/Greedy-NPA.csv", sep=";"))

# Apply the function to each dataframe
nrms_random_stats, nrms_random_ttests = calculate_mean_std_and_ttests(nrms_random)
nrms_greedy_stats, nrms_greedy_ttests = calculate_mean_std_and_ttests(nrms_greedy)

lstur_random_stats, lstur_random_ttests = calculate_mean_std_and_ttests(lstur_random)
lstur_greedy_stats, lstur_greedy_ttests = calculate_mean_std_and_ttests(lstur_greedy)

NPA_random_stats, NPA_random_ttests = calculate_mean_std_and_ttests(NPA_random)
NPA_greedy_stats, NPA_greedy_ttests = calculate_mean_std_and_ttests(NPA_greedy)

# Save the results to CSV files
nrms_random_stats.to_csv("results/NRMS_Random_MeanStd.csv", index=False)
nrms_greedy_stats.to_csv("results/NRMS_Greedy_MeanStd.csv", index=False)

lstur_random_stats.to_csv("results/LSTUR_Random_MeanStd.csv", index=False)
lstur_greedy_stats.to_csv("results/LSTUR_Greedy_MeanStd.csv", index=False)

NPA_random_stats.to_csv("results/NPA_Random_MeanStd.csv", index=False)
NPA_greedy_stats.to_csv("results/NPA_Greedy_MeanStd.csv", index=False)

# Save the t-test results to CSV files
nrms_random_ttests.to_csv("results/NRMS_Random_TTests.csv", index=False)
nrms_greedy_ttests.to_csv("results/NRMS_Greedy_TTests.csv", index=False)

lstur_random_ttests.to_csv("results/LSTUR_Random_TTests.csv", index=False)
lstur_greedy_ttests.to_csv("results/LSTUR_Greedy_TTests.csv", index=False)

NPA_random_ttests.to_csv("results/NPA_Random_TTests.csv", index=False)
NPA_greedy_ttests.to_csv("results/NPA_Greedy_TTests.csv", index=False)

lstur_greedy_ttests

Unnamed: 0,steps,Pre-processing,ndcg@5_p-value,ndcg@10_p-value,MRR_p-value
0,1-Step,1,0.605497,0.377036,0.596476
1,1-Step,2,0.931042,0.354286,0.925295
2,1-Step,5,1.0,0.323092,1.0
3,1-Step,10,0.624612,0.381648,0.790428
4,2-Step,1,0.011292,0.51763,0.029867
5,2-Step,2,0.038837,0.647525,0.048949
6,2-Step,5,0.015378,0.662842,0.032501
7,2-Step,10,0.053647,0.610385,0.102577


In [31]:
lstur_greedy_stats

Unnamed: 0,steps,Pre-processing,ndcg@5_mean,ndcg@5_std,ndcg@10_mean,ndcg@10_std,MRR_mean,MRR_std
0,1-Step,0,0.3248,0.00507,0.3664,0.043535,0.2968,0.005263
1,1-Step,1,0.323,0.003674,0.3852,0.002775,0.2952,0.003114
2,1-Step,2,0.3244,0.005857,0.3872,0.004087,0.2964,0.004722
3,1-Step,5,0.3248,0.003114,0.3872,0.002683,0.2968,0.00228
4,1-Step,10,0.3236,0.005771,0.3864,0.004722,0.2962,0.00502
5,2-Step,0,0.3248,0.00507,0.3664,0.043535,0.2968,0.005263
6,2-Step,1,0.3158,0.002168,0.38,0.001,0.29,0.000707
7,2-Step,2,0.3128,0.004438,0.3764,0.002966,0.2862,0.003834
8,2-Step,5,0.3126,0.001817,0.3754,0.001342,0.2874,0.001517
9,2-Step,10,0.3138,0.004604,0.3766,0.003578,0.2884,0.004506


In [67]:
algo= ["NRMS", "NPA", "LSTUR"][1]

data= pd.read_csv (f"MIND/mind_version1/MIND_Demo_Version1/results/calibration/00_res_submissionRecSys/calibration_scores_{algo}.csv")



In [68]:
df = data[data['k'] == 10]
df


Unnamed: 0,k,version_name,p,algo,impr_index,kl_divergence,js_divergence,coverage,gini_index
0,10,original,0,NPA,1,3.653149,0.520516,1314,1.000000
1,10,original,0,NPA,2,2.460909,0.578069,1314,0.999999
2,10,original,0,NPA,3,5.648978,0.689366,1314,0.999999
3,10,original,0,NPA,4,1.052701,0.336639,1314,1.000000
4,10,original,0,NPA,5,0.000000,0.000000,1314,1.000000
...,...,...,...,...,...,...,...,...,...
67837,10,Obf,10,NPA,7534,5.967344,0.761802,1287,0.999999
67838,10,Obf,10,NPA,7535,2.187354,0.509268,1287,1.000000
67839,10,Obf,10,NPA,7536,5.170677,0.745608,1287,0.999999
67840,10,Obf,10,NPA,7537,6.438674,0.854167,1287,0.999999


In [69]:
baseline_group = 0

# Define the version names to compare (Add and Obf)
version_names = ['Add', 'Obf']
comparison_groups = [1, 2, 5, 10]

# Loop over the version names (Add and Obf)
for version in version_names:
    print(f"\nPerforming t-tests for version: {version}\n")
    
    for comparison_group in comparison_groups:
        # Select data for the comparison and baseline groups
        comparison_data = df[(df['p'] == comparison_group) & (df['version_name'] == version)]
        baseline_data = df[(df['p'] == baseline_group) & (df['version_name'] == 'original')]
        
        # Match the number of observations by sampling from the larger dataset
        min_len = min(len(comparison_data), len(baseline_data))
        comparison_data = comparison_data.head(7538)  # n=min_len, random_state=42
        baseline_data = baseline_data.head(7538)  # n=min_len, random_state=42

        # Perform paired t-test for each metric
        for metric in ['kl_divergence', 'js_divergence', 'coverage']:
            t_statistic, p_value = ttest_rel(comparison_data[metric], baseline_data[metric])
            
            # Print results
            print(f"Comparison Group (p = {comparison_group}, version = {version}) vs Baseline Group (p = {baseline_group}, version = original):")
            print(f"{metric.capitalize()}: t-statistic = {t_statistic}, p-value = {p_value}")
            print()


Performing t-tests for version: Add

Comparison Group (p = 1, version = Add) vs Baseline Group (p = 0, version = original):
Kl_divergence: t-statistic = -0.19299152961963176, p-value = 0.8469708018083155

Comparison Group (p = 1, version = Add) vs Baseline Group (p = 0, version = original):
Js_divergence: t-statistic = -1.0031923667707094, p-value = 0.31580026086700647

Comparison Group (p = 1, version = Add) vs Baseline Group (p = 0, version = original):
Coverage: t-statistic = inf, p-value = 0.0

Comparison Group (p = 2, version = Add) vs Baseline Group (p = 0, version = original):
Kl_divergence: t-statistic = -2.8644133981874895, p-value = 0.004189368095526626

Comparison Group (p = 2, version = Add) vs Baseline Group (p = 0, version = original):
Js_divergence: t-statistic = -1.5800640707158002, p-value = 0.11413415712975085

Comparison Group (p = 2, version = Add) vs Baseline Group (p = 0, version = original):
Coverage: t-statistic = -inf, p-value = 0.0

Comparison Group (p = 5, v

  res = hypotest_fun_out(*samples, **kwds)
