In [6]:
import pandas as pd
from scipy.stats import ttest_rel
import os
import numpy as np

# File paths for topk = 5
files_topk_5 = {
    "original": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.0_original.csv",
    "0.1_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.1_Add.csv",
    "0.01_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.01_Add.csv",
    "0.1_Obf": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.1_Obf.csv",
    "0.02_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.02_Add.csv",
    "0.02_Obf": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.02_Obf.csv",
    "0.05_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.05_Add.csv",
    "0.05_Obf": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-5_0.05_Obf.csv"
}

# File paths for topk = 10
files_topk_10 = {
    "original": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.0_original.csv",
    "0.1_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.1_Add.csv",
    "0.01_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.01_Add.csv",
    "0.1_Obf": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.1_Obf.csv",
    "0.02_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.02_Add.csv",
    "0.02_Obf": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.02_Obf.csv",
    "0.05_Add": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.05_Add.csv",
    "0.05_Obf": "/RecSys_News/goodbook/results/Random/version_RecSys/book_calibration_results-10_0.05_Obf.csv"
}


In [None]:
def read_csv(filename):
    if os.path.exists(filename):
        return pd.read_csv(filename)
    else:
        print(f"Warning: No such file or directory: '{filename}'")
        return None

def preprocess_values(df, metric):
    """
    Preprocess the divergence values, replacing 'inf' with 0.
    """
    def parse_value(value):
        try:
            value_dict = eval(value)
            return value_dict.get('BPR', np.nan)  # Assume 'BPR' key for simplification
        except:
            return np.nan

    df[metric] = df[metric].apply(parse_value)
    df[metric] = pd.to_numeric(df[metric], errors='coerce')  # Convert to numeric, forcing errors to NaN
    df[metric] = df[metric].replace([np.inf, -np.inf], 0)  # Replace inf values with 0
    return df

In [30]:


def perform_paired_ttest(df1, df2, metric, algorithm):
    # Ensure that both dataframes have the same users
    common = pd.merge(df1[['user', 'Algorithm', metric]], df2[['user', 'Algorithm', metric]], 
                      on=['user', 'Algorithm'], suffixes=('_1', '_2'))
    
    if common.empty:
        print(f"No common data found for t-test in {metric} for algorithm {algorithm}.")
        return float('nan')

    # Filter by algorithm
    common = common[common['Algorithm'].isin([algorithm])]
    
    if common.empty:
        print(f"No data found for algorithm {algorithm}.")
        return float('nan')

    # Drop rows with NaN values
    common = common.dropna()

    values1 = common[f'{metric}_1'].values
    values2 = common[f'{metric}_2'].values
    
    if len(values1) != len(values2):
        print(f"Error: Mismatched lengths for t-test in {metric} for algorithm {algorithm}.")
        return float('nan')

    try:
        return ttest_rel(values1, values2).pvalue
    except Exception as e:
        print(f"Error during t-test for {metric} for algorithm {algorithm}: {e}")
        return float('nan')

# Collect results
results = []

for topk, files in zip([5, 10], [files_topk_5, files_topk_10]):
    original_df = read_csv(files["original"])
    
    if original_df is None:
        print(f"Skipping topk {topk} due to missing original data")
        continue

    # Preprocess original data
    original_df = preprocess_values(original_df, 'KL_div')
    original_df = preprocess_values(original_df, 'JS_div')

    for key, file_path in files.items():
        if key == "original":
            continue
        
        modified_df = read_csv(file_path)
        if modified_df is None:
            continue
        
        # Preprocess modified data
        modified_df = preprocess_values(modified_df, 'KL_div')
        modified_df = preprocess_values(modified_df, 'JS_div')
        
        # Get unique algorithms from the original data
        algorithms = original_df['Algorithm'].unique()
        
        for algorithm in algorithms:
            for metric in ['KL_div', 'JS_div']:
                p_value = perform_paired_ttest(original_df, modified_df, metric, algorithm)
                results.append({
                    'topk': topk,
                    'p_obfuscation': key.split('_')[0],
                    'preprocess': key.split('_')[1],
                    'algorithm': algorithm,
                    'metric': metric,
                    'p_value': p_value
                })

# Convert results to DataFrame for better readability
results_df = pd.DataFrame(results)
results_df.to_csv('RecSys_News/goodbook/results/Random/version_RecSys/paired_ttest_results.csv', index=False)

# Save results to CSV
results_df.head (100)

No data found for algorithm ItemKNN.
No data found for algorithm ItemKNN.
No data found for algorithm UserKNN.
No data found for algorithm UserKNN.


Unnamed: 0,topk,p_obfuscation,preprocess,algorithm,metric,p_value
0,5,0.1,Add,BPR,KL_div,4.515158e-97
1,5,0.1,Add,BPR,JS_div,9.131298e-265
2,5,0.1,Add,implicitMF,KL_div,
3,5,0.1,Add,implicitMF,JS_div,
4,5,0.1,Add,pop,KL_div,
...,...,...,...,...,...,...
95,10,0.02,Add,UserKNN,JS_div,
96,10,0.02,Obf,BPR,KL_div,4.417556e-81
97,10,0.02,Obf,BPR,JS_div,6.317711e-226
98,10,0.02,Obf,implicitMF,KL_div,
