# Metrics stats

## Libs

In [1]:
import json

import pandas as pd

from utils import loaders

from cliffs_delta import cliffs_delta
from scipy.stats import wilcoxon

# Utils

In [2]:
def _percentage(df, numerator, denominator):
    """Return ``df[numerator] / df[denominator]`` as a percentage, NaN -> 0."""
    return (df[numerator] / df[denominator] * 100.0).fillna(0)


def _normalized_edit_distance(original_df, df):
    """Return ``editdistance`` as a percentage of the longer text's ``n_chars``.

    For each row the denominator is the larger of the original's and the
    variant's ``n_chars``.
    """
    longest = pd.concat([original_df['n_chars'], df['n_chars']], axis=1).max(axis=1)
    return df['editdistance'] / longest * 100


def _metric_row(metric, names, llm1, llm2, original=None, higher_is_better=True):
    """Build one result row comparing ``llm1`` against ``llm2`` for ``metric``.

    When ``original`` is given, its mean is reported and the direction flag
    compares the mean change from the original; otherwise the raw means of the
    two series are compared.
    """
    row = {"metrics": metric}

    if original is not None:
        row[f"{names[0]}_mean"] = round(original.mean(), 2)
        score1 = (llm1 - original).mean()
        score2 = (llm2 - original).mean()
    else:
        score1 = llm1.mean()
        score2 = llm2.mean()

    row[f"{names[1]}_mean"] = round(llm1.mean(), 2)
    row[f"{names[2]}_mean"] = round(llm2.mean(), 2)

    # Run the paired test once and reuse the p-value for the threshold flag.
    p_value = wilcoxon(llm1, llm2)[1]
    row["p_value"] = p_value
    row["p_value_ok"] = p_value < 0.05

    # cliffs_delta returns (delta, label); only the label is reported.
    row["eff_size"] = cliffs_delta(llm1, llm2)[1]

    first_ahead = score1 > score2 if higher_is_better else score1 < score2
    row["eff"] = '+' if first_ahead else '-'
    return row


def do_stat_analysis(original_df, df1, df2, names):
    """Compare two metric tables against each other and against the original.

    For every metric a row is produced with the rounded means, the Wilcoxon
    signed-rank p-value between ``df1`` and ``df2``, whether that p-value is
    below 0.05, the Cliff's delta size label, and a ``'+'``/``'-'`` flag.

    The flag is ``'+'`` when ``df1`` is ahead of ``df2`` in the metric's
    direction: larger for sentences, gulpease, flesch_vacca, vdb and
    semantic_similarity; smaller for passive, the readit_* scores and
    editdistance. (Presumably ``'+'`` means ``df1`` is the preferable output;
    confirm against how the table is read.)

    Args:
        original_df: Metrics of the original texts.
        df1: Metrics of the first variant.
        df2: Metrics of the second variant.
        names: Three labels, in the order (original, df1, df2), used to build
            the ``<label>_mean`` column names.

    Returns:
        A ``pandas.DataFrame`` with one row per metric.
    """
    results = []

    # Metrics read straight from a column where a larger value is flagged '+'.
    for metric, column in (
        ("sentences", 'n_sentences'),
        ("gulpease", 'gulpease'),
        ("flesch_vacca", 'flesch_vacca'),
    ):
        results.append(_metric_row(
            metric, names, df1[column], df2[column],
            original=original_df[column], higher_is_better=True,
        ))

    results.append(_metric_row(
        "vdb", names,
        _percentage(df1, 'n_vdb', 'n_tokens'),
        _percentage(df2, 'n_vdb', 'n_tokens'),
        original=_percentage(original_df, 'n_vdb', 'n_tokens'),
        higher_is_better=True,
    ))

    results.append(_metric_row(
        "passive", names,
        _percentage(df1, 'n_passive_verbs', 'n_verbs'),
        _percentage(df2, 'n_passive_verbs', 'n_verbs'),
        original=_percentage(original_df, 'n_passive_verbs', 'n_verbs'),
        higher_is_better=False,
    ))

    # READ-IT scores: a smaller value is flagged '+'.
    for column in ('readit_base', 'readit_lexical', 'readit_syntactic', 'readit_global'):
        results.append(_metric_row(
            column, names, df1[column], df2[column],
            original=original_df[column], higher_is_better=False,
        ))

    # The remaining metrics are already relative to the original text, so
    # there is no original column to report or subtract.
    results.append(_metric_row(
        "semantic_similarity", names,
        df1['semantic_similarity'].fillna(0),
        df2['semantic_similarity'].fillna(0),
        higher_is_better=True,
    ))

    # Compare the two edit-distance means directly. Earlier code subtracted the
    # leftover readit_global series here, mixing an unrelated metric (and its
    # NaN pattern) into the comparison.
    results.append(_metric_row(
        "editdistance", names,
        _normalized_edit_distance(original_df, df1),
        _normalized_edit_distance(original_df, df2),
        higher_is_better=False,
    ))

    return pd.DataFrame(results)

In [3]:
# Load the metrics CSVs: the original corpus, then the "basic" and "chain6"
# variants, each with its "mini_" counterpart.
df_o = pd.read_csv('corpora_with_metrics/original.csv', encoding='utf-8')

df_basic = pd.read_csv('corpora_with_metrics/basic.csv', encoding='utf-8')
df_basic_mini = pd.read_csv('corpora_with_metrics/mini_basic.csv', encoding='utf-8')

df_chain = pd.read_csv('corpora_with_metrics/chain6.csv', encoding='utf-8')
df_chain_mini = pd.read_csv('corpora_with_metrics/mini_chain6.csv', encoding='utf-8')

# GPT-4o vs GPT-4o-mini

## Basic

In [4]:
# Basic outputs: gpt-4o (df_basic) against gpt-4o-mini (df_basic_mini).
do_stat_analysis(
    original_df=df_o,
    df1=df_basic,
    df2=df_basic_mini,
    names=['original', 'gpt-4o', 'gpt-4o-mini'],
)

Unnamed: 0,metrics,original_mean,gpt-4o_mean,gpt-4o-mini_mean,p_value,p_value_ok,eff_size,eff
0,sentences,9.79,10.42,11.32,0.002746,True,negligible,-
1,gulpease,44.69,53.56,52.91,0.045901,True,negligible,+
2,flesch_vacca,25.0,41.28,39.01,0.013682,True,negligible,+
3,vdb,72.42,80.65,79.01,0.00374,True,small,+
4,passive,26.38,13.91,20.29,0.003334,True,small,+
5,readit_base,0.77,0.45,0.45,0.883866,False,negligible,+
6,readit_lexical,0.74,0.55,0.52,0.411912,False,negligible,-
7,readit_syntactic,0.83,0.5,0.53,0.409448,False,negligible,+
8,readit_global,0.92,0.45,0.47,0.632762,False,negligible,+
9,semantic_similarity,,95.49,96.04,0.008016,True,small,-


## Chain

In [5]:
# Chain outputs: gpt-4o (df_chain) against gpt-4o-mini (df_chain_mini).
do_stat_analysis(
    original_df=df_o,
    df1=df_chain,
    df2=df_chain_mini,
    names=['original', 'gpt-4o', 'gpt-4o-mini'],
)

Unnamed: 0,metrics,original_mean,gpt-4o_mean,gpt-4o-mini_mean,p_value,p_value_ok,eff_size,eff
0,sentences,9.79,11.91,13.0,0.0003334447,True,negligible,-
1,gulpease,44.69,48.39,49.35,0.02110252,True,negligible,-
2,flesch_vacca,25.0,32.21,33.51,0.0290943,True,negligible,-
3,vdb,72.42,76.47,75.35,0.002095618,True,negligible,+
4,passive,26.38,8.17,13.87,0.0008103349,True,small,+
5,readit_base,0.77,0.66,0.64,0.7258667,False,negligible,-
6,readit_lexical,0.74,0.65,0.63,0.8190306,False,negligible,-
7,readit_syntactic,0.83,0.54,0.59,0.1239259,False,negligible,+
8,readit_global,0.92,0.56,0.59,0.58213,False,negligible,+
9,semantic_similarity,,98.52,98.83,0.0003091597,True,medium,-


# Chain vs Basic

## GPT-4o

In [6]:
# Full-size model outputs: chain (df_chain) against basic (df_basic).
do_stat_analysis(
    original_df=df_o,
    df1=df_chain,
    df2=df_basic,
    names=['original', 'chain', 'basic'],
)

Unnamed: 0,metrics,original_mean,chain_mean,basic_mean,p_value,p_value_ok,eff_size,eff
0,sentences,9.79,11.91,10.42,0.000170609,True,negligible,+
1,gulpease,44.69,48.39,53.56,1.350765e-07,True,large,-
2,flesch_vacca,25.0,32.21,41.28,4.151387e-08,True,large,-
3,vdb,72.42,76.47,80.65,5.327344e-08,True,medium,-
4,passive,26.38,8.17,13.91,0.006414093,True,small,+
5,readit_base,0.77,0.66,0.45,1.867911e-06,True,medium,-
6,readit_lexical,0.74,0.65,0.55,0.03827949,True,negligible,-
7,readit_syntactic,0.83,0.54,0.5,0.2852611,False,negligible,-
8,readit_global,0.92,0.56,0.45,0.01925909,True,small,-
9,semantic_similarity,,98.52,95.49,2.83405e-10,True,large,+


## GPT-4o-mini

In [7]:
# Mini model outputs: chain (df_chain_mini) against basic (df_basic_mini).
do_stat_analysis(
    original_df=df_o,
    df1=df_chain_mini,
    df2=df_basic_mini,
    names=['original', 'chain', 'basic'],
)

Unnamed: 0,metrics,original_mean,chain_mean,basic_mean,p_value,p_value_ok,eff_size,eff
0,sentences,9.79,13.0,11.32,0.0004790959,True,negligible,+
1,gulpease,44.69,49.35,52.91,4.744986e-05,True,medium,-
2,flesch_vacca,25.0,33.51,39.01,4.567974e-05,True,medium,-
3,vdb,72.42,75.35,79.01,1.254128e-06,True,medium,-
4,passive,26.38,13.87,20.29,0.01040713,True,small,+
5,readit_base,0.77,0.64,0.45,7.638538e-06,True,medium,-
6,readit_lexical,0.74,0.63,0.52,0.008511915,True,small,-
7,readit_syntactic,0.83,0.59,0.53,0.2155014,False,negligible,-
8,readit_global,0.92,0.59,0.47,0.01440084,True,small,-
9,semantic_similarity,,98.83,96.04,6.263374e-10,True,large,+
