In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from main.utils import load_experiments
import os
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

### Pre-loading names

In [217]:
methods = [ 'random', 'entropy', 'bald', 'badge', 'max_logdet_S', 'empirical_covariance']
datasets = ['mnist', 'dirty_mnist', 'fashion_mnist', 'repeated_mnist', 'imagenet']

# naming of methods
method_names = {
    'badge': 'BADGE',
    'random': 'Random',
    'entropy': 'Entropy',
    'bald': 'BALD',
    'max_logdet_S': 'Similarity Matrix',
    'empirical_covariance': 'Empirical Covariance',
    'similarity_kmeans': 'Similarity KMeans',
}


# naming of datasets
dataset_names = {
    'mnist': 'MNIST',
    'fashion_mnist': 'Fashion-MNIST',
    'dirty_mnist': 'Dirty-MNIST',
    'repeated_mnist': 'Repeated-MNIST',
    'imagenet': 'ImageNet',
}

n_seeds = 5

### Testing Files

In [196]:
for method in methods:
    for dataset in datasets:
        for j in range(n_seeds):
            try:
                path = f'{method}_{dataset}_20_to_200_B=10_{j + 1}'
                df = load_experiments([path])
            except FileNotFoundError:
                print(f'File {path} not found')
                continue

In [201]:
results = {}
for method in methods:
    results[method] = {}
    for dataset in datasets:
        experiment_ids = [f'{method}_{dataset}_20_to_200_B=10_{i + 1}' for i in range(n_seeds)]
        res = load_experiments(experiment_ids)
        accuracies = [[result[1]['test_accs'] for result in res]]
        results[method][dataset] = accuracies

# put all results in a single dataframe
data = []
for method in methods:
    for dataset in datasets:
        for result in results[method][dataset]:
            for j in range(n_seeds):
                data.append({
                    'method': method,
                    'dataset': dataset,
                    'seed': j,
                    'accuracy': result[j][-1],
                })

df = pd.DataFrame(data)

## Wilcoxon

In [224]:
wilcoxon_results = {}

# for each dataset, apply the Wilcoxon rank test to compare the methods
for test_method in ['max_logdet_S', 'empirical_covariance']:
    comp_methods = methods.copy()
    comp_methods.remove(test_method)
    
    for dataset in datasets:
        for i, method in enumerate(comp_methods):
            test = stats.wilcoxon(df[(df['method'] == method) & (df['dataset'] == dataset)]['accuracy'],
                                  df[(df['method'] == test_method) & (df['dataset'] == dataset)]['accuracy'],
                                  alternative='less')
            
            if dataset not in wilcoxon_results:
                wilcoxon_results[dataset] = {}
            if method not in wilcoxon_results[dataset]:
                wilcoxon_results[dataset][method] = {}
            wilcoxon_results[dataset][method][test_method] = test

# save the results in dataframe
data = []
for dataset in datasets:
    for method in methods:
        for test_method in ['max_logdet_S', 'empirical_covariance']:
            if test_method == method:
                continue
            data.append({
                'dataset': dataset,
                'method': method,
                'test_method': test_method,
                'p_value': wilcoxon_results[dataset][method][test_method].pvalue,
            })

wilcoxon_results = pd.DataFrame(data)

wilcoxon_final = []
for data_set in datasets:

    subset_ranktest = wilcoxon_results[wilcoxon_results['dataset'] == data_set]
    subset_ranktest.drop(columns='dataset', inplace=True)

    # replace the method names with the full names
    subset_ranktest['method'] = subset_ranktest['method'].map(method_names)
    subset_ranktest['test_method'] = subset_ranktest['test_method'].map(method_names)

    subset_ranktest.index = subset_ranktest['test_method'] + ' vs ' +  subset_ranktest['method'] 
    subset_ranktest.drop(columns=['method', 'test_method'], inplace=True)
    subset_ranktest.columns = [dataset_names[data_set]]

    wilcoxon_final.append(subset_ranktest)

wilcoxon_final = pd.concat(wilcoxon_final, axis=1)
wilcoxon_final.drop(labels='Empirical Covariance vs Similarity Matrix', axis=0, inplace=True)
wilcoxon_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_ranktest.drop(columns='dataset', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_ranktest['method'] = subset_ranktest['method'].map(method_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_ranktest['test_method'] = subset_ranktest['test_method'].map(method_names)
A value is trying to be set on a copy of a slice from 

Unnamed: 0,MNIST,Dirty-MNIST,Fashion-MNIST,Repeated-MNIST,ImageNet
Similarity Matrix vs Random,0.03125,0.03125,0.6875,0.03125,0.03125
Empirical Covariance vs Random,0.03125,0.0625,0.90625,0.03125,0.5
Similarity Matrix vs Entropy,0.96875,0.03125,0.03125,0.15625,0.03125
Empirical Covariance vs Entropy,0.78125,0.03125,0.21875,0.0625,1.0
Similarity Matrix vs BALD,0.0625,0.03125,0.21875,0.15625,0.0625
Empirical Covariance vs BALD,0.03125,0.03125,0.5,0.40625,1.0
Similarity Matrix vs BADGE,0.03125,0.03125,0.84375,0.03125,0.03125
Empirical Covariance vs BADGE,0.03125,0.03125,0.9375,0.03125,0.84375
Similarity Matrix vs Empirical Covariance,0.96875,0.09375,0.15625,0.40625,0.03125


## Friedman + Nemenyi

In [225]:
# Reshape DataFrame for the Friedman Test
pivot_df = df.pivot_table(index=['dataset', 'seed'], columns='method', values='accuracy')

# Perform Friedman Test
friedman_stat, p_value = friedmanchisquare(*[pivot_df[method] for method in pivot_df.columns])

print(f"Friedman Test Statistic: {friedman_stat}, p-value: {p_value}")

# Perform Nemenyi Post-hoc Test
nemenyi_results = sp.posthoc_nemenyi_friedman(pivot_df.values)

# Format Nemenyi test results
method_names = pivot_df.columns
nemenyi_df = pd.DataFrame(nemenyi_results)
nemenyi_df.columns = method_names
nemenyi_df.index = method_names

# Output Nemenyi test results
print("\nNemenyi Post-hoc Test Results (p-values):")
print(nemenyi_df)

# Highlight significant results
def highlight_significant(p):
    return 'background-color: blue' if p < 0.05 else ''

styled_nemenyi_df = nemenyi_df.style.applymap(highlight_significant)

# Display styled DataFrame
styled_nemenyi_df


Friedman Test Statistic: 25.571428571428555, p-value: 0.000108038898314909

Nemenyi Post-hoc Test Results (p-values):
method                   badge      bald  empirical_covariance   entropy  \
method                                                                     
badge                 1.000000  0.766754              0.363156  0.900000   
bald                  0.766754  1.000000              0.900000  0.900000   
empirical_covariance  0.363156  0.900000              1.000000  0.679434   
entropy               0.900000  0.900000              0.679434  1.000000   
max_logdet_S          0.001613  0.104666              0.410222  0.011371   
random                0.900000  0.410222              0.104666  0.854075   

method                max_logdet_S    random  
method                                        
badge                     0.001613  0.900000  
bald                      0.104666  0.410222  
empirical_covariance      0.410222  0.104666  
entropy                   0.011371  0.

  styled_nemenyi_df = nemenyi_df.style.applymap(highlight_significant)


method,badge,bald,empirical_covariance,entropy,max_logdet_S,random
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
badge,1.0,0.766754,0.363156,0.9,0.001613,0.9
bald,0.766754,1.0,0.9,0.9,0.104666,0.410222
empirical_covariance,0.363156,0.9,1.0,0.679434,0.410222,0.104666
entropy,0.9,0.9,0.679434,1.0,0.011371,0.854075
max_logdet_S,0.001613,0.104666,0.410222,0.011371,1.0,0.001
random,0.9,0.410222,0.104666,0.854075,0.001,1.0
