In [1]:
import pandas as pd

# Statistical tests: Shapiro-Wilk (normality), paired t-test and Wilcoxon signed-rank (paired samples)
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon

In [None]:
# Metric columns selected from every results frame and looped over by the tests.
COLUMNS_TEST = [
    'eval_loss',
    'eval_rouge1',
    'eval_rouge2',
    'eval_rougeL',
    'eval_rougeLsum',
    'eval_bleu',
]

In [21]:
# Number of leading rows kept from every report.
# NOTE(review): presumably chosen so each ZS/FT pair has equal length for the
# paired tests -- confirm where 34 comes from.
N_BOOTSTRAP_ROWS = 34


def _load_bootstrap_metrics(csv_path, n_rows=N_BOOTSTRAP_ROWS):
    """Read one bootstrap results CSV and keep only the tested metrics.

    Parameters
    ----------
    csv_path : str
        Path to a ``results_bootstrap.csv`` file.
    n_rows : int, optional
        How many leading rows to keep (defaults to ``N_BOOTSTRAP_ROWS``).

    Returns
    -------
    pandas.DataFrame
        The first ``n_rows`` rows of the file, restricted to ``COLUMNS_TEST``.
    """
    return pd.read_csv(csv_path)[COLUMNS_TEST].head(n_rows)


# PYTHON

py_results_zs = _load_bootstrap_metrics('reports/analysis_report_ZS_test_CodeT5_bootstrap_py/results_bootstrap.csv')
py_results_ft = _load_bootstrap_metrics('reports/analysis_report_test_CodeT5_bootstrap_py/results_bootstrap.csv')


# R

r_results_zs = _load_bootstrap_metrics('reports/analysis_report_ZS_test_CodeT5_bootstrap_r/results_bootstrap.csv')
r_results_ft = _load_bootstrap_metrics('reports/analysis_report_test_CodeT5_bootstrap_r/results_bootstrap.csv')


# MATLAB

m_results_zs = _load_bootstrap_metrics('reports/analysis_report_ZS_test_CodeT5_bootstrap_m/results_bootstrap.csv')
m_results_ft = _load_bootstrap_metrics('reports/analysis_report_test_CodeT5_bootstrap_m/results_bootstrap.csv')

In [12]:
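# Normality findings (Shapiro-Wilk) and the paired test chosen for each language.

# PYTHON
# FT: loss is not normal; every other metric is normal.
# ZS: BLEU is not normal; every other metric is normal.
# => Wilcoxon for loss and BLEU; paired t-test for everything else.

# R
# BLEU is not normal => Wilcoxon for BLEU; paired t-test for everything else.

# MATLAB
# Every metric is normal => paired t-test for all metrics.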

In [26]:
# Shapiro-Wilk normality check on each metric column of m_results_ft.
# NOTE(review): only m_results_ft is checked in this cell; the other frames
# were presumably checked by editing the frame name and re-running -- confirm.
for metric in COLUMNS_TEST:
    print(metric)
    w_stat, p_value = shapiro(m_results_ft[metric])
    print('stat=%.3f, p=%.3f' % (w_stat, p_value))
    # A p-value above 0.05 is reported as normal.
    verdict = 'Probably Normal' if p_value > 0.05 else 'Probably not Normal'
    print(verdict)

eval_loss
stat=0.979, p=0.725
Probably Normal
eval_rouge1
stat=0.984, p=0.890
Probably Normal
eval_rouge2
stat=0.979, p=0.728
Probably Normal
eval_rougeL
stat=0.982, p=0.821
Probably Normal
eval_rougeLsum
stat=0.981, p=0.815
Probably Normal
eval_bleu
stat=0.976, p=0.636
Probably Normal


In [20]:
# PYTHON
# Paired ZS-vs-FT comparison per metric: Wilcoxon for loss and BLEU,
# paired t-test for every other metric.
for metric in COLUMNS_TEST:
    print(metric)
    # Choose the test first so the reporting below is written only once.
    paired_test = wilcoxon if metric in ('eval_loss', 'eval_bleu') else ttest_rel
    test_stat, p_value = paired_test(py_results_zs[metric], py_results_ft[metric])
    print('stat=%.3f, p=%.3f' % (test_stat, p_value))
    verdict = ('Probably the same distribution' if p_value > 0.05
               else 'Probably different distributions')
    print(verdict)

eval_loss
stat=0.000, p=0.000
Probably different distributions
eval_rouge1
stat=-36.337, p=0.000
Probably different distributions
eval_rouge2
stat=-29.735, p=0.000
Probably different distributions
eval_rougeL
stat=-33.685, p=0.000
Probably different distributions
eval_rougeLsum
stat=-33.208, p=0.000
Probably different distributions
eval_bleu
stat=0.000, p=0.000
Probably different distributions


In [27]:
# R
# Paired ZS-vs-FT comparison per metric: Wilcoxon for BLEU,
# paired t-test for every other metric.
for metric in COLUMNS_TEST:
    print(metric)
    # Choose the test first so the reporting below is written only once.
    paired_test = wilcoxon if metric == 'eval_bleu' else ttest_rel
    test_stat, p_value = paired_test(r_results_zs[metric], r_results_ft[metric])
    print('stat=%.3f, p=%.3f' % (test_stat, p_value))
    verdict = ('Probably the same distribution' if p_value > 0.05
               else 'Probably different distributions')
    print(verdict)

eval_loss
stat=114.689, p=0.000
Probably different distributions
eval_rouge1
stat=-80.322, p=0.000
Probably different distributions
eval_rouge2
stat=-63.470, p=0.000
Probably different distributions
eval_rougeL
stat=-77.557, p=0.000
Probably different distributions
eval_rougeLsum
stat=-74.711, p=0.000
Probably different distributions
eval_bleu
stat=0.000, p=0.000
Probably different distributions


In [28]:
# MATLAB
# Paired t-test of ZS against FT for every metric.
for metric in COLUMNS_TEST:
    print(metric)
    t_stat, p_value = ttest_rel(m_results_zs[metric], m_results_ft[metric])
    print('stat=%.3f, p=%.3f' % (t_stat, p_value))
    verdict = ('Probably the same distribution' if p_value > 0.05
               else 'Probably different distributions')
    print(verdict)

eval_loss
stat=65.289, p=0.000
Probably different distributions
eval_rouge1
stat=-42.734, p=0.000
Probably different distributions
eval_rouge2
stat=-34.983, p=0.000
Probably different distributions
eval_rougeL
stat=-40.535, p=0.000
Probably different distributions
eval_rougeLsum
stat=-40.374, p=0.000
Probably different distributions
eval_bleu
stat=-21.728, p=0.000
Probably different distributions


In [40]:
# ALL DIFFS ARE NORMAL
# For each metric, compare the absolute FT-vs-ZS difference of MATLAB against
# that of R with a one-sided paired t-test.
for column in COLUMNS_TEST:
    print(column)
    r_diff = abs(r_results_ft[column] - r_results_zs[column])
    m_diff = abs(m_results_ft[column] - m_results_zs[column])

    # alternative='less': the alternative hypothesis is that the mean of
    # m_diff is less than the mean of r_diff.
    stat, p = ttest_rel(m_diff, r_diff, alternative='less')
    print('stat=%.3f, p=%.3f' % (stat, p))
    if p > 0.05:
        print('Probably not less')
    else:
        print('Probably less')

eval_loss
stat=-0.115, p=0.455
Probably not less
eval_rouge1
stat=-9.063, p=0.000
Probably less
eval_rouge2
stat=-6.636, p=0.000
Probably less
eval_rougeL
stat=-8.820, p=0.000
Probably less
eval_rougeLsum
stat=-8.852, p=0.000
Probably less
eval_bleu
stat=-2.187, p=0.018
Probably less
