# Statistical Significance Test

## Imports

In [36]:
import os
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Set Variables

In [37]:
# Directory that holds the per-model result CSV files listed in `csv_files`;
# each file name is joined onto this path when the data is loaded.
BASE_PATH = "external_repo"

In [38]:
# Result files, one per model, resolved relative to BASE_PATH.
# Order matters: entries are paired positionally with `model_names` via zip()
# when the data is loaded, so both lists must be kept in the same order.
# NOTE(review): "significans" is the spelling used in these file names; it has
# to match the files on disk, so do not "correct" it here alone.
csv_files = [
    "statistical_significans_random.csv",
    "statistical_significans_popular.csv",
    "statistical_significans_GRU4REC.csv",
    "statistical_significans_GRU4REC_concat.csv",
    "statistical_significans_encode.csv",
    "statistical_significans_concat.csv",
    "statistical_significans_auto.csv"
]

In [39]:
# Display names for the models, listed in the same order as `csv_files`
# (the two lists are paired positionally when the data is loaded).
model_names = [
    "Random",
    "Popular",
    "GRU4REC",
    "GRU4REC Concat",
    "Cross-sessions Encode",
    "Cross-sessions Concat",
    "Cross-sessions Auto",
]

In [40]:
# Measures read from every result file.
column_names = ['hit', 'precision', 'recall', 'RR', 'AP']

# Nested mapping: measure name -> {model name -> 1-D array of that measure's values}.
columns_data = dict((col, {}) for col in column_names)

for model_name, csv_file in zip(model_names, csv_files):
    file_path = os.path.join(BASE_PATH, csv_file)

    if os.path.exists(file_path):
        data = pd.read_csv(file_path)

        for column_name in column_names:
            # A missing column is reported and skipped; the other measures are still loaded.
            if column_name not in data.columns:
                print(f"Warning: Column '{column_name}' not found in {file_path}")
                continue

            columns_data[column_name][model_name] = data[column_name].to_numpy().flatten()
    else:
        # A missing file is reported and that model is left out of `columns_data`.
        print(f"Warning: File not found - {file_path}")

## McNemar Test for Hitrate

McNemar: Used to test differences in the proportions of dichotomous variables in dependent samples.

Null Hypothesis (H0): There is no difference in the proportions of hits between the two models.

Alternative Hypothesis (H1): There is a difference in the proportions of hits between the two models.

In [41]:
def mc_nemar(column_data_dict, reference_model):
    """Run an exact McNemar test of every model against a reference model.

    Parameters
    ----------
    column_data_dict : dict
        Mapping ``measure -> {model name -> array-like}``. Only the ``'hit'``
        entry is read. The hit arrays are assumed to be 0/1 (or boolean)
        indicators, paired element-by-element across models (same sessions,
        same order) -- TODO confirm against how the CSV files are produced.
    reference_model : str
        Name of the model every other model is compared to.

    Returns
    -------
    dict
        ``{model name: {'statistic': ..., 'p-value': ...}}`` for every model
        except the reference. Per the statsmodels documentation, with
        ``exact=True`` the statistic is the smaller of the two discordant-pair
        counts and the p-value comes from the exact binomial distribution.

    Raises
    ------
    KeyError
        If ``reference_model`` has no ``'hit'`` data.
    ValueError
        If a model's hit array does not have the same shape as the reference's.
    """
    # Read from the argument, not from the notebook-level `columns_data` global.
    hits_by_model = column_data_dict['hit']
    if reference_model not in hits_by_model:
        raise KeyError(f"Reference model '{reference_model}' has no 'hit' data")

    reference_hit = np.asarray(hits_by_model[reference_model]).astype(bool)

    mcnemar_results = {}

    for model, model_hit in hits_by_model.items():
        if model == reference_model:
            continue

        model_hit = np.asarray(model_hit).astype(bool)
        if model_hit.shape != reference_hit.shape:
            raise ValueError(
                f"'hit' arrays of '{reference_model}' and '{model}' differ in shape: "
                f"{reference_hit.shape} vs {model_hit.shape}"
            )

        # Build the 2x2 table explicitly (rows: reference miss/hit, columns:
        # model miss/hit). A plain crosstab drops a row/column when one model
        # has only hits or only misses, which would not be a valid 2x2 input.
        contingency_table = np.array([
            [np.sum(~reference_hit & ~model_hit), np.sum(~reference_hit & model_hit)],
            [np.sum(reference_hit & ~model_hit), np.sum(reference_hit & model_hit)],
        ])

        # NOTE: statsmodels documents `correction` as applying to the chi-square
        # variant (exact=False); it is passed unchanged to keep the call as before.
        result = mcnemar(contingency_table, exact=True, correction=True)

        mcnemar_results[model] = {
            'statistic': result.statistic,
            'p-value': result.pvalue
        }

    return mcnemar_results

# Compare every other model against the "Cross-sessions Encode" reference and report each outcome.
mcnemar_results = mc_nemar(columns_data, "Cross-sessions Encode")

for model, result in mcnemar_results.items():
    # One print call per model; sep="\n" puts each part on its own line.
    print(
        f"McNemar Test for {model}:",
        f"  Statistic: {result['statistic']}",
        f"  p-value: {result['p-value']}",
        "-" * 50,
        sep="\n",
    )

McNemar Test for Random:
  Statistic: 270.0
  p-value: 0.0
--------------------------------------------------
McNemar Test for Popular:
  Statistic: 180.0
  p-value: 1.626963566737916e-207
--------------------------------------------------
McNemar Test for GRU4REC:
  Statistic: 238.0
  p-value: 6.036948239752762e-171
--------------------------------------------------
McNemar Test for GRU4REC Concat:
  Statistic: 242.0
  p-value: 1.0996763960890227e-139
--------------------------------------------------
McNemar Test for Cross-sessions Concat:
  Statistic: 146.0
  p-value: 0.13108217841061792
--------------------------------------------------
McNemar Test for Cross-sessions Auto:
  Statistic: 186.0
  p-value: 0.8368169204941016
--------------------------------------------------


## One-way ANOVA for other measures

ANOVA: Used to test for overall differences between groups.

Null Hypothesis (H0): All group means are equal.

Alternative Hypothesis (H1): At least one group mean is different.

If the p-value from the ANOVA test is below the significance level (0.05 here), it indicates that at least one group mean is different. However, to identify which specific groups differ, a post hoc test is needed; this notebook uses Tukey's HSD for that.

In [42]:
def oneway_anova(column_data_dict, measure, alpha=0.05):
    """Run a one-way ANOVA across all models for one measure, with a Tukey HSD follow-up.

    Parameters
    ----------
    column_data_dict : dict
        Mapping ``measure -> {model name -> array-like of values}``.
    measure : str
        Key of the measure to test (e.g. ``'precision'``).
    alpha : float, optional
        Significance level, used both to decide whether the post hoc test is
        run and as the family-wise error rate passed to Tukey HSD.
        Defaults to 0.05.

    Returns
    -------
    dict
        ``'statistic'`` and ``'p_value'`` from ``scipy.stats.f_oneway`` plus
        ``'tukey_hsd'``: the Tukey HSD summary table when ``p_value < alpha``,
        otherwise ``None``. The key is always present so callers can read it
        unconditionally.

    Raises
    ------
    ValueError
        If fewer than two models have data for ``measure``.
    """
    samples_by_model = column_data_dict[measure]
    if len(samples_by_model) < 2:
        raise ValueError(
            f"One-way ANOVA for '{measure}' needs at least two groups, "
            f"got {len(samples_by_model)}"
        )

    all_models = [np.asarray(samples) for samples in samples_by_model.values()]

    f_stat, p_value = f_oneway(*all_models)
    anova_results = {
        'statistic': f_stat,
        'p_value': p_value,
        'tukey_hsd': None,
    }

    # Perform post hoc test only if the ANOVA result is significant
    if p_value < alpha:
        combined_data = np.concatenate(all_models)
        # One label per observation, in the same order as `combined_data`.
        model_labels = np.repeat(
            list(samples_by_model),
            [len(samples) for samples in all_models],
        )

        tukey_result = pairwise_tukeyhsd(endog=combined_data,
                                         groups=model_labels,
                                         alpha=alpha)

        anova_results['tukey_hsd'] = tukey_result.summary()

    return anova_results

# Run the ANOVA for every measure except 'hit' (the McNemar test is used for 'hit').
anova_results = {}
for column_name in column_names:
    # Filter by name rather than by position so reordering `column_names` stays safe.
    if column_name == 'hit':
        continue
    anova_results[column_name] = oneway_anova(columns_data, column_name)

# Print results (the keys of `anova_results` are measure names, not model names)
for measure, result in anova_results.items():
    print(f"Oneway Anova for {measure}:")
    print(f"  Statistic: {result['statistic']}")
    print(f"  p-value: {result['p_value']}")

    # The post hoc summary only exists when the ANOVA was significant;
    # read it defensively instead of raising KeyError for the other measures.
    tukey_summary = result.get('tukey_hsd')
    if tukey_summary is None:
        print("  post hoc test: not run (ANOVA p-value not below the significance level)")
    else:
        print(f"  post hoc test: {tukey_summary}")


Oneway Anova for precision:
  Statistic: 862.4568570328175
  p-value: 0.0
  post hoc test:                Multiple Comparison of Means - Tukey HSD, FWER=0.05                
        group1                group2        meandiff p-adj   lower   upper  reject
----------------------------------------------------------------------------------
  Cross-sessions Auto Cross-sessions Concat  -0.0012 0.9999 -0.0111  0.0087  False
  Cross-sessions Auto Cross-sessions Encode  -0.0006    1.0 -0.0105  0.0093  False
  Cross-sessions Auto               GRU4REC  -0.0744    0.0 -0.0843 -0.0645   True
  Cross-sessions Auto        GRU4REC Concat  -0.0637    0.0 -0.0736 -0.0538   True
  Cross-sessions Auto               Popular  -0.0855    0.0 -0.0954 -0.0756   True
  Cross-sessions Auto                Random  -0.1933    0.0 -0.2032 -0.1834   True
Cross-sessions Concat Cross-sessions Encode   0.0006    1.0 -0.0094  0.0105  False
Cross-sessions Concat               GRU4REC  -0.0732    0.0 -0.0831 -0.0633   T