In [1]:
import json
import numpy as np
import pyperclip
from scipy.stats import ttest_ind_from_stats, ttest_rel, wilcoxon,ttest_ind,ttest_rel
import os
os.chdir(r"C:\Users\robin\Documents\Cours\Poly\Recherche\FIBI_Recherche")
from FIBI.data.analyse_cpp_results.tsp.extract_infos import parse_key, parse_metadata, DataCategory, corresp_impr_class
import pandas as pd
from scipy.stats import ranksums, mannwhitneyu
import plotly.express as px
import bs4
from itertools import product
# https://www.youtube.com/watch?v=CIbJSX-biu0 for ressources

In [2]:
LnPts = ['20','50','100']
Lassign = ['RAND','IMPR']
threshold_n = 100

## Investigate Wilcoxon computation

In [3]:
from h5py import File
data_rd = []
with File("data/data_algo_same/table1_exp.hdf5", "r") as f:
    for k,v in f["metadata"].items():
        dico_key = parse_key(k)
        v = np.copy(v)
        dico_value = {"key_dataset":k,**parse_metadata(v)}
        data_rd.append(dico_value)
df_rd = pd.DataFrame(data_rd)
data_impr = []
with File("data/data_algo_same/table1_exp_impr.hdf5", "r") as f:
    for k,v in f["metadata"].items():
        dico_key = parse_key(k)
        v = np.copy(v)
        dico_value = {"key_dataset":k,**parse_metadata(v)}
        data_impr.append(dico_value)
df_impr = pd.DataFrame(data_impr)
df_impr['INIT'] = 'IMPR'
df_rd['INIT'] = 'RAND'

df = pd.concat([df_rd,df_impr])


In [50]:
df_1 = df.copy()
df_1.loc[:,"ratio"] = df_1.loc[:,"final_cost"]/df_1.loc[:,"init_cost"]
df_1.sort_values(by=['SEED_POINTS','SEED_ASSIGN'])
backup_stats = {}
backup_soups = {}
for init in Lassign:
    backup_stats[init] = {}
    backup_soups[init] = {}
    dico_df = {}
    for nPts in LnPts:
        backup_stats[init][nPts] = {}
        backup_soups[init][nPts] = {}
        soup = bs4.BeautifulSoup("""<html><head><link rel="stylesheet" href="style.css"></head><body></body></html>""")
        dico_df[nPts] = {}
        for algo in ['FICL','BICL']:
            dico_df[nPts][algo] = df_1.query(f'INIT=="{init}" & NUM_POINTS == {nPts} & IMPR_CLASS == "{algo}"')
            dico_df[nPts][algo].sort_values(by=['SEED_POINTS','SEED_ASSIGN'])
        soup.html.body.append(bs4.BeautifulSoup(f"<h2>{nPts} points, initialisation {init}</h2>"))
        table = soup.new_tag(f"table")
        table.attrs['id'] = "data"
        table.attrs['cellspacing'] = "0"
        table.attrs['cellpadding'] = "0"
        L = ["coutFinalFICL/coutInitial","coutFinalBICL/coutInitial"]
        for col in [*L,L[0]+'-'+L[1]]:
            colTag = soup.new_tag('th')
            colTag.string = col
            table.append(colTag)
        FICL = np.array(dico_df[nPts]['FICL']['final_cost']/dico_df[nPts]['FICL']['init_cost'])
        BICL = np.array(dico_df[nPts]['BICL']['final_cost']/dico_df[nPts]['BICL']['init_cost'])
        if len(FICL) != len(BICL):
            raise Exception(f'diff size: {len(FICL)}, {len(BICL)}')
        FICL_BICL = FICL-BICL
        for records in zip(FICL,BICL,FICL_BICL):
            tr = soup.new_tag('tr')
            for r in records:
                td = soup.new_tag('td')
                td.string = f"{r}"
                tr.append(td)
            table.append(tr)
        soup.html.body.append(table)
        
        df_stats = pd.DataFrame(FICL_BICL,columns=['difference'])
        descr = df_stats.describe().applymap(lambda x: f"{x:0.2e}").reset_index().to_dict(orient='records')
        backup_stats[init][nPts] = descr
        backup_soups[init][nPts] = soup
div_stats = bs4.BeautifulSoup('<div id="stats"></div>')
for init in Lassign:
    table = bs4.BeautifulSoup('<table></table>')
    header = bs4.BeautifulSoup(f'<tr><th colspan="{len(LnPts)+1}">{init}</th></tr><tr><th></th>'+''.join([f'<th>{n}</th>' for n in LnPts])+'</tr>')
    table.table.attrs['cellspacing'] = "0"
    table.table.attrs['cellpadding'] = "0"
    table.table.append(header)
    f = lambda L: [e['difference'] for e in L]
    stats_v = [f(L) for L in [backup_stats[init][k] for k in LnPts]]
    for stat,vals in zip([e['index'] for e in backup_stats[init][LnPts[0]]], zip(*stats_v)):
        tr = bs4.BeautifulSoup(f'<tr><th>{stat}</th></tr>')
        for v in vals:
            td = bs4.BeautifulSoup(f'<td>{v}</td>')
            tr.tr.append(td)
        table.table.append(tr)
    div_stats.div.append(table)

for init in Lassign:
    for nPts in LnPts:
        backup_soups[init][nPts].body.append(bs4.BeautifulSoup(str(div_stats)))
        with open(f'FIBI/data/ttest/out_tsp_{nPts}pts_init_{init}.html','w') as f:
            f.write(str(backup_soups[init][nPts]))

## Compute Wilcoxon test pvalues

In [51]:
drop = ['key_dataset','num_iter','num_iter_glob','duration']
df_2 = df.copy()
df_2 = df_2[[c for c in df_2.columns if c not in drop]]
nPoints = [int(e) for e in df_2['NUM_POINTS'].unique() if int(e) <= threshold_n]
nPoints.sort()
soup = bs4.BeautifulSoup(f'<html><head><link rel="stylesheet" href="style2.css"></head><body><h2>P-values</h2><table cellspacing="0" cellpadding="0"></table></body></html>')
soup.html.body.table.append(bs4.BeautifulSoup(f'<tr><th colspan="2"></th><th colspan="{len(nPoints)}">Number of points</th>'+'</tr>'))
soup.html.body.table.append(bs4.BeautifulSoup('<tr><th>INIT</th><th>Values provided to the test</th>'+''.join([f'<th>{n}</th>' for n in nPoints])+'</tr>'))
for init in (Lassign):
    dico_stats = {'wilcoxon_direct':{},'wilcoxon_ratio':{}}
    for i,n in enumerate(nPoints):
        tmp = df.query(f'INIT == "{init}" & NUM_POINTS == {n} & IT_ORDER=="CURR"')
        tmpBI = tmp.query('IMPR_CLASS=="BICL"')
        tmpBI.sort_values(by=['SEED_POINTS','SEED_ASSIGN'])
        tmpFI = tmp.query('IMPR_CLASS=="FICL"')
        tmpFI.sort_values(by=['SEED_POINTS','SEED_ASSIGN'])
        res_wil = wilcoxon(tmpBI['final_cost'],tmpFI['final_cost'])
        res_ratio = wilcoxon(np.array(tmpBI['final_cost'])/np.array(tmpBI['init_cost']),np.array(tmpFI['final_cost'])/np.array(tmpFI['init_cost']))
        dico_stats['wilcoxon_direct'][n] = res_wil
        dico_stats['wilcoxon_ratio'][n] = res_ratio
    for i,(dic_id,name) in enumerate([['direct','Final cost'],['ratio','Final cost/Init cost']]):
        header = ""
        if i==0:
            header = f'<th rowspan="2">{init}</th>'
        tr = bs4.BeautifulSoup(f'<tr>{header}<th class="testUsed">{name}</th></tr>')
        for n,v in (dico_stats['wilcoxon_'+dic_id].items()):
            tr.tr.append(bs4.BeautifulSoup(f"<td class='{'more' if v.pvalue > 5e-2 else 'less'}'>{v.pvalue:.2e}</td>"))
        soup.html.body.table.append(tr)

with open(f'FIBI/data/ttest/out_tsp_pvalues.html','w') as f:
    f.write(str(soup))


In [27]:
NINSTANCES = 200
NDECIMALS = 5
df_3 = df.copy()
df_3.loc[:,"ratio"] = df_3.loc[:,"final_cost"]/df_3.loc[:,"init_cost"]
df_3 = df_3.query(f'NUM_POINTS == {50} & INIT == "RAND"')
df_FI = df_3.query(f'IMPR_CLASS == "FICL"')
df_FI.sort_values(by=['SEED_POINTS','SEED_ASSIGN'])
df_BI = df_3.query(f'IMPR_CLASS == "BICL"')
df_BI.sort_values(by=['SEED_POINTS','SEED_ASSIGN'])
a = np.array(df_FI['ratio']).round(decimals=NDECIMALS)[:NINSTANCES]
b = np.array(df_BI['ratio']).round(decimals=NDECIMALS)[:NINSTANCES]
t = wilcoxon(a,b)
print(f"pvalue wilcoxon:{t.pvalue:.5f}")
t = ttest_rel(a,b)
print(f"pvalue ttest:{t.pvalue:.5f}")
print('\n'.join([f'{e}' for e in np.array(df_BI['ratio'])[:NINSTANCES]]))


pvalue wilcoxon:0.38074
pvalue ttest:0.40459
0.22382491342402225
0.22659682360625052
0.2083975338486606
0.22943444552517503
0.24554052185633307
0.23633135413639442
0.235274144727219
0.21575184089200494
0.21436808313222244
0.22792518886005597
0.21311829427907325
0.2287840808490815
0.21811108832057902
0.2036002579853265
0.2317420227534506
0.2168574757198828
0.23738567667318514
0.23671324949224215
0.224689202401681
0.21151081613446313
0.22563953131627568
0.21963117660889483
0.224469006522919
0.24314073828443966
0.2077620149565672
0.23856526787864205
0.2041730821123827
0.2468900218841716
0.24647570778501776
0.22781668213410616
0.23783494636221456
0.19819856227131333
0.23054449663987828
0.22026823050514133
0.187554406672977
0.22425672383908632
0.2362667688077249
0.234151553658087
0.2624863083546101
0.22884037749120462
0.2531557521351184
0.2453162243083289
0.23798387347338423
0.20209188680324233
0.27304145750380965
0.2197258913398709
0.22983446739010813
0.243883936841912
0.25110809138015744
