# Statistikk fra de ulike eksperimentene

In [1]:
from scipy import stats

## Experiment 1

In [2]:
from experiments_data import exp1
ttest = stats.ttest_ind(exp1.stl_aurocs, exp1.mtl_aurocs)
wilc = stats.wilcoxon(exp1.stl_aurocs, exp1.mtl_aurocs)
print("T-test: ", ttest)
print("Wilcoxon: ", wilc)

T-test:  Ttest_indResult(statistic=-1.0180293907272238, pvalue=0.3221530748941993)
Wilcoxon:  WilcoxonResult(statistic=20.0, pvalue=0.4921875)


Ettersom eksperimentet ikke ser på å sammenligne STL og MTL så gir ikke denne T-Testen noe særlig mening. Hvis vi skulle sammenlignet med noe i dette eksperimentet ville det vært at opp mot majority classen og sagt at vi måler sannsynligheten for at vi treffer så ofte som vi gjør. Sjekk Santi sin matte evt.

## Experiment 3 (4)

Vi ønsker å se på hvorvidt MTL med kul loss kan brukes istedenfor vanlig, kjedelig loss.

Hvis vi vil kan vi ha null-hypotese om at det ikke er noe forskjell mellom MTL-R og MTL-FE.
En alternativ hypotese er at det er forskjell, men litt usikker på om du har to ulike alternativer:
MTL-R bedre enn MTL-FE eller MTL-FE bedre enn MTL-R.
Eller om det bare er én alternativ: forskjell.

T-Testen her vil fortelle oss sannsynligheten for dataen er random generert eller noe annet med lik sannsynlighet har inntruffet.

In [3]:
from experiments_data import exp4

ttest3 = stats.ttest_ind(exp4.mtl_fe_aurocs, exp1.mtl_aurocs)
pearson = stats.pearsonr(exp4.mtl_fe_aurocs, exp1.mtl_aurocs)
def avg(lst):
    return sum(lst)/len(lst)

stats.mannwhitneyu(exp1.mtl_aurocs, exp4.mtl_fe_aurocs)
stats.wilcoxon(exp1.mtl_aurocs, exp4.mtl_fe_aurocs)

WilcoxonResult(statistic=8.0, pvalue=0.048828125)

T-testen sier at ting ikke er tilfeldig uavhengig: Disse er ikke relaterbare.
Pearson-koeffisienten sier at dataen er negativt relatert, men 30% sannsynlighet at det bare er random. Så er de egentlig enige i at det ting ikke henger sammen her? Altså at vi _må_ rejecte null-hypotesen om at de er relatert?

#### NB!
Og svaret er **JUPP!** Sannsynligvis fordi de har kjørt på helt ulike parametere, her må jo mtl fra exp1 kjøres på nytt, med samme parametere som mtl_fe!

## Experiment 4 (5)

Vi ønsker å se om MTL > STL på sparse data. Her _kunne_ man kanskje hatt en null-hypotese om at det ikke er noen forskjell. Den alternative hypotesen kan da være at MTL > STL på alt fra 0% noise til 49% noise. 50% vil de være tilnærmet like gode ettersom MTL sine fordeler utjevnes av ekstra støy. Igjen er spørsmålet om alternativ hypotese, hva hvis STL er bedre enn MTL?
Er det kanskje bedre å bare drøfte?

T-Testen vil allikevel fortelle oss hvor sikre vi er på dataen, som vi da kan diskutere `mean` av.

In [4]:
from experiments_data import exp5

exp5data = zip(exp5.stl_aurocs, exp5.mtl_aurocs)

for index, (r, fe) in enumerate(exp5data):
    print(f"{index * 5} percentage")
    rounded_r = [round(val, 3) for val in r]
    rounded_fe = [round(val, 3) for val in fe]
    print(rounded_r)
    print(rounded_fe)
    ttest5 = stats.ttest_ind(rounded_r, rounded_fe)
    print("ttest p ",ttest5.pvalue)
    mannwhitney5 = stats.mannwhitneyu(r,fe)
    print(mannwhitney5)
    avg_difference = abs((sum(r)/len(r)) - (sum(fe)/len(r)))
    print("avg diff ", avg_difference)
    wilc5 = stats.wilcoxon(r, fe)
    print(wilc5)

0 percentage
[0.99, 0.99, 0.989, 0.989, 0.989, 0.989, 0.989, 0.989, 0.989, 0.988]
[0.98, 0.979, 0.979, 0.978, 0.976, 0.979, 0.979, 0.978, 0.979, 0.978]
ttest p  3.7883519922248783e-16
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
avg diff  0.010574495792388894
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
5 percentage
[0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997]
[0.992, 0.992, 0.994, 0.994, 0.995, 0.993, 0.995, 0.995, 0.993, 0.994]
ttest p  4.404278729178303e-08
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
avg diff  0.003061264753341675
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
10 percentage
[0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997, 0.997]
[0.998, 0.998, 0.998, 0.998, 0.998, 0.997, 0.998, 0.997, 0.998, 0.997]
ttest p  0.0002309309557541889
MannwhitneyuResult(statistic=7.0, pvalue=0.0006574723348566069)
avg diff  0.0003843069076538974
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
15 perc

## Experiment 5 (6)

Her tester vi 5 og 10 prosent av dataen med 10% til 50% støy.

In [5]:
from experiments_data import exp6

exp5data5size = zip(exp6.stl_5s, exp6.mtl_5s)
exp5data10size = zip(exp6.stl_10s, exp6.mtl_10s)

for index, (r, fe) in enumerate(exp5data5size):
    print(f"{index+1}0% noise 5 size")
    mann5 = stats.mannwhitneyu(r, fe)
    print(mann5)
    wilc6 = stats.wilcoxon(r, fe)
    print(wilc6)
    
print()
for index, (r, fe) in enumerate(exp5data10size):
    print(f"{index+1}0% noise 10 size")
    mann5 = stats.mannwhitneyu(r, fe, alternative="less")
    print(mann5)
    wilc6 = stats.wilcoxon(r, fe)
    print(wilc6)

10% noise 5 size
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
20% noise 5 size
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
30% noise 5 size
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
40% noise 5 size
MannwhitneyuResult(statistic=2.0, pvalue=0.00016491926038899677)
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
50% noise 5 size
MannwhitneyuResult(statistic=17.0, pvalue=0.007009638556979976)
WilcoxonResult(statistic=3.0, pvalue=0.009765625)

10% noise 10 size
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
20% noise 10 size
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-05)
WilcoxonResult(statistic=0.0, pvalue=0.001953125)
30% noise 10 size
MannwhitneyuResult(statistic=0.0, pvalue=9.133589555477501e-0

## Experiment 6 (7)


In [6]:
from experiments_data import exp7

exp6data = zip(exp7.r_aurocs, exp7.fe_aurocs)

for index, (r, fe) in enumerate(exp6data):
    print(f"{index+1}0% noise")
    ttest7 = stats.ttest_ind(r, fe)
    print(ttest7.pvalue)
    mann7 = stats.mannwhitneyu(r, fe)
    print(mann7)
    wilc7 = stats.wilcoxon(r, fe)
    print(wilc7)

10% noise
0.13968506006174902
MannwhitneyuResult(statistic=35.0, pvalue=0.13651816987559418)
WilcoxonResult(statistic=5.0, pvalue=0.01953125)
20% noise
0.5977760582488791
MannwhitneyuResult(statistic=36.0, pvalue=0.15374472830934066)
WilcoxonResult(statistic=20.0, pvalue=0.4921875)
30% noise
0.5017440509415498
MannwhitneyuResult(statistic=40.0, pvalue=0.23633779675579358)
WilcoxonResult(statistic=20.0, pvalue=0.4921875)
40% noise
0.9036138653761041
MannwhitneyuResult(statistic=50.0, pvalue=0.4849249884965778)
WilcoxonResult(statistic=25.0, pvalue=0.845703125)
50% noise
0.10352697524083414
MannwhitneyuResult(statistic=34.0, pvalue=0.12066079650859002)
WilcoxonResult(statistic=21.0, pvalue=0.556640625)


Ingen av de kan rejecte null-hypotesen, altså er det sannsynlig at ting henger sammen.

## Fast DeLong from Yandex School of Data Analysis

According to this guide, https://glassboxmedicine.com/2020/02/04/comparing-aucs-of-machine-learning-models-with-delongs-test/, you can compare two AU(RO)C's through DeLong's test. 

In [7]:
import pandas as pd
import numpy as np
import scipy.stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    return order, label_1_count


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)