In [None]:
from google.colab import drive
import pandas as pd
from scipy.stats import ttest_rel

# Mount Google Drive at /content/drive so the evaluation CSVs referenced below
# (all under /content/drive/MyDrive) can be read with pandas.
drive.mount('/content/drive')

def load_and_suffix(filepath, suffix):
    """
    Read a CSV and tag its score columns with ``suffix``.

    The join keys 'Dutch' and 'Croatian' keep their original names so that
    frames from different runs can later be merged on them. Every other
    column gets ``suffix`` appended. Columns whose name starts with
    'Unnamed' are discarded.

    Returns the resulting DataFrame.
    """
    key_columns = ('Dutch', 'Croatian')
    frame = pd.read_csv(filepath)

    # Build the rename mapping explicitly: every non-key column gets the suffix.
    renamed = {}
    for name in frame.columns:
        if name not in key_columns:
            renamed[name] = f"{name}{suffix}"
    frame = frame.rename(columns=renamed)

    # The suffix is appended at the end, so "Unnamed..." columns still match
    # the start-anchored pattern and are filtered out here.
    keep_mask = ~frame.columns.str.contains('^Unnamed')
    return frame.loc[:, keep_mask]

def merge_dfs(dataframes):
    """
    Combine a sequence of dataframes into one by successively outer-joining
    each of them onto the first, using the 'Dutch' and 'Croatian' columns
    as join keys.

    A single-element sequence returns that element unchanged.
    """
    join_keys = ['Dutch', 'Croatian']
    merged = dataframes[0]
    remaining = dataframes[1:]
    for frame in remaining:
        # Outer join: rows present in only one of the frames are kept, with NaN fill.
        merged = merged.merge(frame, on=join_keys, how='outer')
    return merged

def calculate_significance(df, column1, column2):
    """
    Calculates the p-value for the difference between two sets of scores (e.g., BLEU)
    across models or fine-tuning stages using a paired t-test.

    Parameters:
        df: DataFrame holding both score columns, one row per sentence pair.
        column1: name of the first score column.
        column2: name of the second score column.

    Returns:
        The p-value reported by ``scipy.stats.ttest_rel``.

    Only rows where BOTH columns are non-missing are used. A paired test
    compares observations position by position, so dropping NaNs from each
    column independently would either shift the pairs out of alignment or
    leave the two samples with different lengths.
    """
    both_present = df[column1].notna() & df[column2].notna()
    scores1 = df.loc[both_present, column1]
    scores2 = df.loc[both_present, column2]
    _, p_value = ttest_rel(scores1, scores2)
    return p_value

# Directory holding every evaluation CSV. It lives under the Drive mount point
# ('/content/drive'); change it here if the files move.
DATA_DIR = '/content/drive/MyDrive'

# Each entry is (CSV path, suffix appended to that file's non-key columns).

# Scores of the pretrained models.
pretrained_paths_suffixes = [
    (f'{DATA_DIR}/m2m100_418M_evaluation_1.csv', '_pretrained_m2m100'),
    (f'{DATA_DIR}/mbart50_large_evaluation_1.csv', '_pretrained_mbart50'),
    (f'{DATA_DIR}/nllb200_distilled_600m_evaluation_1.csv', '_pretrained_nllb200')
]

# Scores tagged "_public", one file per model and translation direction.
public_paths_suffixes = [
    (f'{DATA_DIR}/evaluation_df_m2m100_1_hr_nl.csv', '_public_hr_nl_m2m100'),
    (f'{DATA_DIR}/evaluation_df_m2m100_1_nl_hr.csv', '_public_nl_hr_m2m100'),
    (f'{DATA_DIR}/evaluation_df_mbart50_1_hr_nl.csv', '_public_hr_nl_mbart50'),
    (f'{DATA_DIR}/evaluation_df_mbart50_1_nl_hr.csv', '_public_nl_hr_mbart50'),
    (f'{DATA_DIR}/evaluation_df_nllb200_1_hr_nl.csv', '_public_hr_nl_nllb200'),
    (f'{DATA_DIR}/evaluation_df_nllb200_1_nl_hr.csv', '_public_nl_hr_nllb200')
]

# Scores tagged "_dict", one file per model and translation direction.
dict_paths_suffixes = [
    (f'{DATA_DIR}/evaluation_df_nllb200_hr_nl.csv', '_dict_hr_nl_nllb200'),
    (f'{DATA_DIR}/evaluation_df_nllb200_nl_hr.csv', '_dict_nl_hr_nllb200'),
    (f'{DATA_DIR}/evaluation_df_m2m100_hr_nl.csv', '_dict_hr_nl_m2m100'),
    (f'{DATA_DIR}/evaluation_df_m2m100_nl_hr.csv', '_dict_nl_hr_m2m100'),
    (f'{DATA_DIR}/evaluation_df_mbart50_hr_nl.csv', '_dict_hr_nl_mbart50'),
    (f'{DATA_DIR}/evaluation_df_mbart50_nl_hr.csv', '_dict_nl_hr_mbart50')
]

# Load every CSV with its suffix applied, keeping one list of frames per stage.
pretrained_dfs = [load_and_suffix(path, suffix) for path, suffix in pretrained_paths_suffixes]
public_dfs = [load_and_suffix(path, suffix) for path, suffix in public_paths_suffixes]
dict_dfs = [load_and_suffix(path, suffix) for path, suffix in dict_paths_suffixes]

# Collapse each stage into a single wide frame keyed on ('Dutch', 'Croatian').
final_pretrained_df = merge_dfs(pretrained_dfs)
final_public_df = merge_dfs(public_dfs)
final_dict_df = merge_dfs(dict_dfs)

Mounted at /content/drive


In [None]:
# Preview the merged "_public" evaluation frame (the notebook renders a truncated view).
final_public_df

Unnamed: 0,Dutch,Croatian,Translated Dutch_public_hr_nl_m2m100,Croatian to Dutch BLEU_public_hr_nl_m2m100,Croatian to Dutch METEOR_public_hr_nl_m2m100,Croatian to Dutch COMET_public_hr_nl_m2m100,Translated Croatian_public_nl_hr_m2m100,Dutch to Croatian BLEU_public_nl_hr_m2m100,Dutch to Croatian METEOR_public_nl_hr_m2m100,Dutch to Croatian COMET_public_nl_hr_m2m100,...,Dutch to Croatian METEOR_public_nl_hr_mbart50,Dutch to Croatian COMET_public_nl_hr_mbart50,Translated Dutch_public_hr_nl_nllb200,Croatian to Dutch BLEU_public_hr_nl_nllb200,Croatian to Dutch METEOR_public_hr_nl_nllb200,Croatian to Dutch COMET_public_hr_nl_nllb200,Translated Croatian_public_nl_hr_nllb200,Dutch to Croatian BLEU_public_nl_hr_nllb200,Dutch to Croatian METEOR_public_nl_hr_nllb200,Dutch to Croatian COMET_public_nl_hr_nllb200
0,Op maandag kondigden wetenschappers van de Sta...,U ponedjeljak su znanstvenici s Medicinskog fa...,Op maandag kondigden wetenschappers van de Uni...,20.552056,0.476565,0.82526,U ponedjeljak su znanstvenici sa Sveučilišta S...,8.280336,0.296606,0.61391,...,0.460379,0.77165,Op maandag kondigden wetenschappers van de Sta...,32.383141,0.566763,0.81268,U ponedjeljak su znanstvenici iz Medicinskog f...,10.604432,0.332517,0.78708
1,Hoofdonderzoekers zeggen dat dit kan leiden to...,Voditelji istraživanja izjavili su da bi ovo o...,Onderzoekers zeiden dat deze ontdekking het mo...,18.530390,0.249206,0.84314,Glavni istraživači kažu da to može dovesti do ...,5.341814,0.222033,0.65269,...,0.410115,0.90761,De onderzoekers zeiden dat deze ontdekking het...,18.566862,0.260800,0.83319,Glavni istra3 4ivai kažu da to može dovesti do...,5.818364,0.276198,0.79709
2,De JAS 39C Gripen stortte rond 09.30 uur lokal...,JAS 39C Gripen srušio se na pistu i eksplodira...,JAS 39C Gripen crashte op de baan en explodeer...,13.181313,0.504386,0.78538,JAS 39C Gripen srušio se oko 09 30 lokalnog vr...,34.854268,0.605390,0.82764,...,0.646885,0.90553,JAS 39C Gripen crashte op het spoor en explode...,21.098000,0.595705,0.77039,Zračna luka JAS 39C Gripen srušila se oko 9. 3...,14.021469,0.472580,0.73865
3,De piloot werd geïdentificeerd als majoor Dilo...,Pilot je identificiran kao zapovjednik eskadri...,De piloot werd geïdentificeerd als de commanda...,33.260250,0.715371,0.86630,Pilot je identificiran kao majstor Dilokrit Pa...,44.124845,0.777345,0.83906,...,0.777345,0.83906,De piloot werd geïdentificeerd als de commanda...,38.677063,0.828624,0.89209,Pilot je identificiran kao major Dilokrit Patt...,44.124845,0.777345,0.87818
4,De lokale media meldt dat er tijdens een actie...,Lokalni mediji izvješćuju da je došlo do prevr...,Lokale media melden dat er een brandweervoertu...,10.079037,0.430058,0.74734,Lokalni mediji izvješćuju da je tijekom akcije...,19.468125,0.304054,0.53111,...,0.170608,0.73401,Plaatselijke media rapporteren dat er een over...,4.885327,0.228571,0.64607,Lokalni mediji izvijestili su da je tijekom ak...,6.019767,0.200000,0.60034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221,Mijn naam is Andrea.,Moje ime je Andrea.,Mijn naam is Andrea.,100.000000,0.996000,0.99040,Moje ime je Andrea.,100.000000,0.996000,0.99322,...,0.996000,0.99322,Mijn naam is Andrea.,100.000000,0.996000,0.99040,Moje ime je Andrea.,100.000000,0.996000,0.99322
2222,Jupiter is een planeet.,Jupiter je planet.,Jupiter is een planeet.,100.000000,0.996000,0.98417,Jupiter je planet.,100.000000,0.992188,0.98727,...,0.992188,0.98727,Jupiter is een planeet,77.880078,0.809949,0.97894,Jupiter je planet.,100.000000,0.992188,0.98727
2223,Venus is een planeet.,Venera je planet.,Venus is een planeet.,100.000000,0.996000,0.98203,Venera je planet.,100.000000,0.992188,0.98659,...,0.992188,0.98659,Venus is een planeet.,100.000000,0.996000,0.98203,Venera je planet.,100.000000,0.992188,0.98659
2224,God is een olifant.,Bog je slon.,God is een olifant.,100.000000,0.996000,0.97846,Bog je slon.,100.000000,0.992188,0.98735,...,0.992188,0.98735,God is een olifant.,100.000000,0.996000,0.97846,Bog je slon.,100.000000,0.992188,0.98735


In [None]:
import pandas as pd
import scipy.stats as stats

def prepare_scores_for_comparison(df1, df2, column_name_df1, column_name_df2):
    """
    Align one score column from each of two dataframes for a paired comparison.

    The frames are inner-joined on ('Dutch', 'Croatian'), and any row where
    either score is missing is discarded, so the two returned Series have the
    same length and share an index.

    Returns a tuple ``(scores_from_df1, scores_from_df2)``.
    """
    join_keys = ['Dutch', 'Croatian']
    left = df1[join_keys + [column_name_df1]]
    right = df2[join_keys + [column_name_df2]]

    # Inner join keeps only the key pairs present in both frames.
    aligned = left.merge(right, on=join_keys, how='inner')

    # Drop incomplete pairs without mutating the merged frame in place.
    complete = aligned.dropna(subset=[column_name_df1, column_name_df2])
    return complete[column_name_df1], complete[column_name_df2]

def calculate_significance_stats(scores1, scores2):
    """
    Run a two-tailed paired t-test (scipy.stats.ttest_rel) on two equally long,
    pairwise-aligned score samples and return only the p-value.
    """
    test_result = stats.ttest_rel(scores1, scores2, alternative='two-sided')
    # The test statistic is not needed by callers; expose just the p-value.
    return test_result.pvalue

# Model identifiers; each is the final component of the suffixed score column names.
model_names = ['mbart50', 'm2m100', 'nllb200']
# Metric names; each appears in the score column names right after the direction label.
metrics = ['BLEU', 'METEOR', 'COMET']
# (direction label used as column-name prefix, direction tag used in the
# "_public_<tag>_" / "_dict_<tag>_" part of the column suffix).
directions = [('Croatian to Dutch', 'hr_nl'), ('Dutch to Croatian', 'nl_hr')]

In [None]:
# Compare pretrained scores against the "_public" scores for every
# model / metric / direction combination, using a paired t-test.
results = []

for model in model_names:
    for metric in metrics:
        for direction, suffix in directions:
            column_name_pretrained = f"{direction} {metric}_pretrained_{model}"
            column_name_public = f"{direction} {metric}_public_{suffix}_{model}"
            # Combinations whose columns are absent from either frame are skipped.
            if column_name_pretrained not in final_pretrained_df.columns or column_name_public not in final_public_df.columns:
                continue
            scores_pretrained, scores_public = prepare_scores_for_comparison(
                final_pretrained_df,
                final_public_df,
                column_name_pretrained,
                column_name_public
            )
            if len(scores_pretrained) > 0 and len(scores_public) > 0:
                p_value = calculate_significance_stats(scores_pretrained, scores_public)
                # Positive difference means the public-dataset scores are higher.
                mean_difference = scores_public.mean() - scores_pretrained.mean()
                result = {
                    # `direction` is the full label string (e.g. "Croatian to Dutch");
                    # indexing it with [0] would keep only its first letter.
                    "Comparison": f"{metric} {direction} ({model})",
                    "Mean Difference": mean_difference,
                    "P-value": p_value,
                    "Change": "improved" if mean_difference > 0 else "worsened" if mean_difference < 0 else "remained the same",
                    "Significance": "statistically significant" if p_value < 0.05 else "not statistically significant"
                }
                results.append(result)

for result in results:
    # The p-value uses 3 significant digits so that very small values are not
    # rounded to a misleading "0.000".
    print(f"{result['Comparison']} comparison (Pretrained -> Public): Scores {result['Change']} after fine-tuning with the public dataset "
          f"(Mean difference = {result['Mean Difference']:.3f}, P-value = {result['P-value']:.3g}). "
          f"This change is {result['Significance']} at the 5% level (SciPy).")

BLEU C (mbart50) comparison (Pretrained -> Public): There was improved after fine-tuning with the public dataset (Mean difference = 10.392, P-value = 0.000). This change is statistically significant at the 5% level (SciPy).
BLEU D (mbart50) comparison (Pretrained -> Public): There was improved after fine-tuning with the public dataset (Mean difference = 13.008, P-value = 0.000). This change is statistically significant at the 5% level (SciPy).
METEOR C (mbart50) comparison (Pretrained -> Public): There was improved after fine-tuning with the public dataset (Mean difference = 0.196, P-value = 0.000). This change is statistically significant at the 5% level (SciPy).
METEOR D (mbart50) comparison (Pretrained -> Public): There was improved after fine-tuning with the public dataset (Mean difference = 0.302, P-value = 0.000). This change is statistically significant at the 5% level (SciPy).
COMET C (mbart50) comparison (Pretrained -> Public): There was improved after fine-tuning with the pub

In [None]:
# Compare the "_public" scores against the "_dict" scores for every
# model / metric / direction combination, using a paired t-test.
results = []

for model in model_names:
    for metric in metrics:
        for direction, suffix in directions:
            column_name_public = f"{direction} {metric}_public_{suffix}_{model}"
            column_name_dict = f"{direction} {metric}_dict_{suffix}_{model}"
            # Combinations whose columns are absent from either frame are skipped.
            if column_name_public not in final_public_df.columns or column_name_dict not in final_dict_df.columns:
                continue
            scores_public, scores_dict = prepare_scores_for_comparison(
                final_public_df,
                final_dict_df,
                column_name_public,
                column_name_dict
            )
            if not scores_public.empty and not scores_dict.empty:
                p_value_stats = calculate_significance_stats(scores_public, scores_dict)
                # Positive difference means the dictionary-stage scores are higher.
                mean_difference = scores_dict.mean() - scores_public.mean()
                result = {
                    # `direction` is the full label string (e.g. "Croatian to Dutch");
                    # indexing it with [0] would keep only its first letter.
                    "Comparison": f"{metric} {direction} ({model})",
                    "Mean Difference": mean_difference,
                    "P-value (SciPy)": p_value_stats
                }
                results.append(result)

for result in results:
    if result["Mean Difference"] > 0:
        change_description = "improved"
    elif result["Mean Difference"] < 0:
        change_description = "worsened"
    else:
        change_description = "remained the same"
    significance_stats = "statistically significant" if result["P-value (SciPy)"] < 0.05 else "not statistically significant"
    # The label reads "Public -> Dict" because the difference is computed as
    # dict minus public. The p-value uses 3 significant digits so that very
    # small values are not rounded to a misleading "0.000".
    print(f"{result['Comparison']} comparison (Public -> Dict): Scores {change_description} after further fine-tuning with the dictionary "
          f"(Mean difference = {result['Mean Difference']:.3f}, P-value = {result['P-value (SciPy)']:.3g}). This change is {significance_stats} at the 5% level (SciPy).")

BLEU C (mbart50) comparison (Dict -> Public): There was improved after further fine-tuning with the dictionary (Mean difference = 0.027, P-value = 0.385). This change is not statistically significant at the 5% level (SciPy).
BLEU D (mbart50) comparison (Dict -> Public): There was improved after further fine-tuning with the dictionary (Mean difference = 0.046, P-value = 0.103). This change is not statistically significant at the 5% level (SciPy).
METEOR C (mbart50) comparison (Dict -> Public): There was improved after further fine-tuning with the dictionary (Mean difference = 0.001, P-value = 0.127). This change is not statistically significant at the 5% level (SciPy).
METEOR D (mbart50) comparison (Dict -> Public): There was improved after further fine-tuning with the dictionary (Mean difference = 0.001, P-value = 0.137). This change is not statistically significant at the 5% level (SciPy).
COMET C (mbart50) comparison (Dict -> Public): There was improved after further fine-tuning with