# Statistical Analysis

## Libs

In [1]:
import json

import pandas as pd

from cliffs_delta import cliffs_delta
from scipy.stats import wilcoxon, ttest_rel, pearsonr, spearmanr, kendalltau

## Utils

In [2]:
def do_stat_analysis(original_df, human_df, llm_df):
    """Compare a human and an LLM simplification, metric by metric.

    For every metric the human and LLM values are compared with a Wilcoxon
    signed-rank test and with Cliff's delta.

    NOTE(review): ``wilcoxon`` and ``cliffs_delta`` receive the Series as plain
    sequences, so the three frames are expected to be row-aligned (row *i*
    describes the same paragraph in each) -- confirm against the caller.

    Args:
        original_df: Metrics of the original texts. Must provide the readability
            columns, the ``n_*`` count columns and ``n_chars``.
        human_df: Metrics of the human simplification; additionally provides
            ``semantic_similarity`` and ``editdistance``.
        llm_df: Metrics of the LLM simplification, same columns as ``human_df``.

    Returns:
        pd.DataFrame with one row per metric and the columns ``metrics``,
        ``original_mean`` (missing for ``semantic_similarity`` and
        ``editdistance``), ``human_mean``, ``llm_mean``, ``p_value``,
        ``p_value_ok`` (``p_value < 0.05``), ``eff_size`` (second element
        returned by ``cliffs_delta``) and ``eff``. ``eff`` is 'positive' when
        the LLM's mean change from the original exceeds the human's (or, for
        the two metrics without a baseline, when the LLM mean exceeds the
        human mean), otherwise 'negative'.
    """

    def _column(name):
        # Extractor returning a metric column unchanged.
        return lambda df: df[name]

    def _ratio_pct(part, whole):
        # Extractor returning `part` as a percentage of `whole`; NaN becomes 0.
        return lambda df: (df[part] / df[whole] * 100.0).fillna(0)

    def _edit_distance_pct(simpl_df):
        # Edit distance as a percentage of the longer of the two texts.
        longest = pd.concat([original_df['n_chars'], simpl_df['n_chars']], axis=1).max(axis=1)
        return simpl_df['editdistance'] / longest * 100

    def _row(name, human, llm, original=None):
        # Build one result row; the Wilcoxon test is run once and reused.
        p_value = wilcoxon(human, llm)[1]
        row = {"metrics": name}
        if original is None:
            llm_higher = llm.mean() > human.mean()
        else:
            row["original_mean"] = round(original.mean(), 2)
            llm_higher = (llm - original).mean() > (human - original).mean()
        row.update({
            "human_mean": round(human.mean(), 2),
            "llm_mean": round(llm.mean(), 2),
            "p_value": p_value,
            "p_value_ok": p_value < 0.05,
            "eff_size": cliffs_delta(human, llm)[1],
            "eff": 'positive' if llm_higher else 'negative',
        })
        return row

    # Metrics that also exist for the original texts, in output order.
    baseline_metrics = [
        ("gulpease", _column('gulpease')),
        ("flesch_vacca", _column('flesch_vacca')),
        ("vdb", _ratio_pct('n_vdb', 'n_tokens')),
        ("passive", _ratio_pct('n_passive_verbs', 'n_verbs')),
        ("readit_base", _column('readit_base')),
        ("readit_lexical", _column('readit_lexical')),
        ("readit_syntactic", _column('readit_syntactic')),
        ("readit_global", _column('readit_global')),
    ]

    results = [
        _row(name, extract(human_df), extract(llm_df), extract(original_df))
        for name, extract in baseline_metrics
    ]

    # These two metrics are only read from the simplified corpora, so they
    # have no original baseline.
    results.append(_row(
        "semantic_similarity",
        human_df['semantic_similarity'].fillna(0),
        llm_df['semantic_similarity'].fillna(0),
    ))
    results.append(_row(
        "editdistance",
        _edit_distance_pct(human_df),
        _edit_distance_pct(llm_df),
    ))

    return pd.DataFrame(results)

## Load data

In [3]:
# Names of the corpora to load; each one maps to a CSV file of the same name.
CORPUS_NAMES = [
    'original',
    'human1',
    'human2',
    'gpt3_5',
    'gpt4',
    'llama3',
    'phi3',
]

In [4]:
# Load every corpus into `dfs_maps`, keyed by corpus name.
dfs_maps = dict()
jsons_maps = dict()  # not filled in this cell; kept in case other cells expect the name
for CORPUS_NAME in CORPUS_NAMES:
    print(CORPUS_NAME)
    tmp_df = pd.read_csv(f'simplified_corpora_with_metrics/{CORPUS_NAME}.csv', encoding='utf-8')
    # sort_values keeps the original row labels, and pandas arithmetic aligns
    # on labels rather than position. Resetting the index makes label
    # alignment follow the sorted (document, paragraph_index) order, so rows
    # pair up the same way whether compared by label or by position.
    tmp_df = tmp_df.sort_values(by=['document', 'paragraph_index']).reset_index(drop=True)
    print(tmp_df.shape)
    dfs_maps[CORPUS_NAME] = tmp_df

original
(619, 41)
human1
(619, 48)
human2
(619, 48)
gpt3_5
(619, 48)
gpt4
(619, 48)
llama3
(619, 48)
phi3
(619, 48)


In [5]:
def corr_analysis(original_df, simpl_df):
    """Kendall's tau between semantic similarity and normalised edit distance.

    The edit distance of each row is expressed as a percentage of the longer
    of the original and simplified text (``n_chars``).

    Args:
        original_df: Frame of the original texts; must provide ``n_chars``.
        simpl_df: Frame of a simplified corpus; must provide
            ``semantic_similarity``, ``editdistance`` and ``n_chars``.

    Returns:
        The result of ``scipy.stats.kendalltau`` (index 0 is the correlation,
        index 1 the p-value).
    """
    similarity = simpl_df['semantic_similarity']
    longest = pd.concat([original_df['n_chars'], simpl_df['n_chars']], axis=1).max(axis=1)
    edit_pct = simpl_df['editdistance'] / longest * 100
    return kendalltau(similarity, edit_pct)

## GPT-4

In [6]:
# Compare the 'gpt4' corpus with the 'human1' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human1'],
    llm_df=dfs_maps['gpt4'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,49.72,51.34,6.488534e-10,True,negligible,positive
1,flesch_vacca,19.97,34.23,36.75,9.618789e-05,True,negligible,positive
2,vdb,73.24,80.44,81.07,0.0107843,True,negligible,positive
3,passive,20.87,15.78,12.0,0.0004491658,True,negligible,negative
4,readit_base,0.76,0.69,0.55,2.192894e-24,True,small,negative
5,readit_lexical,0.94,0.85,0.9,3.183687e-08,True,negligible,positive
6,readit_syntactic,0.64,0.53,0.3,1.469038e-29,True,small,negative
7,readit_global,0.86,0.69,0.55,3.151567e-13,True,small,negative
8,semantic_similarity,,96.52,95.8,3.20218e-12,True,small,negative
9,editdistance,,35.84,52.14,3.703462e-60,True,large,positive


In [7]:
# Compare the 'gpt4' corpus with the 'human2' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human2'],
    llm_df=dfs_maps['gpt4'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,50.64,51.34,0.009239853,True,negligible,positive
1,flesch_vacca,19.97,33.63,36.75,7.146294e-06,True,negligible,positive
2,vdb,73.24,76.89,81.07,1.656554e-34,True,small,positive
3,passive,20.87,17.71,12.0,1.079617e-05,True,negligible,negative
4,readit_base,0.76,0.51,0.55,0.0292043,True,negligible,positive
5,readit_lexical,0.94,0.9,0.9,0.339076,False,negligible,positive
6,readit_syntactic,0.64,0.4,0.3,2.468327e-09,True,negligible,negative
7,readit_global,0.86,0.61,0.55,6.994563e-05,True,negligible,negative
8,semantic_similarity,,97.26,95.8,1.96474e-37,True,medium,negative
9,editdistance,,29.2,52.14,2.352919e-82,True,large,positive


## GPT-3.5

In [8]:
# Compare the 'gpt3_5' corpus with the 'human1' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human1'],
    llm_df=dfs_maps['gpt3_5'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,49.72,48.49,7.291712e-06,True,negligible,negative
1,flesch_vacca,19.97,34.23,30.33,3.425211e-09,True,negligible,negative
2,vdb,73.24,80.44,78.28,3.70343e-11,True,negligible,negative
3,passive,20.87,15.78,13.99,0.1065951,False,negligible,negative
4,readit_base,0.76,0.69,0.67,0.00520626,True,negligible,negative
5,readit_lexical,0.94,0.85,0.92,1.029182e-15,True,negligible,positive
6,readit_syntactic,0.64,0.53,0.38,8.293399000000001e-17,True,small,negative
7,readit_global,0.86,0.69,0.69,0.4434029,False,negligible,negative
8,semantic_similarity,,96.52,96.06,5.769499e-08,True,small,negative
9,editdistance,,35.84,49.21,3.169896e-46,True,medium,positive


In [9]:
# Compare the 'gpt3_5' corpus with the 'human2' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human2'],
    llm_df=dfs_maps['gpt3_5'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,50.64,48.49,6.721741e-10,True,small,negative
1,flesch_vacca,19.97,33.63,30.33,3.103557e-07,True,negligible,negative
2,vdb,73.24,76.89,78.28,2.770521e-05,True,negligible,positive
3,passive,20.87,17.71,13.99,0.007295083,True,negligible,negative
4,readit_base,0.76,0.51,0.67,2.442149e-20,True,small,positive
5,readit_lexical,0.94,0.9,0.92,0.009144701,True,negligible,positive
6,readit_syntactic,0.64,0.4,0.38,0.1507124,False,negligible,negative
7,readit_global,0.86,0.61,0.69,0.0003243806,True,negligible,positive
8,semantic_similarity,,97.26,96.06,2.902262e-25,True,medium,negative
9,editdistance,,29.2,49.21,9.469369999999999e-70,True,large,positive


## Llama 3

In [10]:
# Compare the 'llama3' corpus with the 'human1' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human1'],
    llm_df=dfs_maps['llama3'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,49.72,50.26,0.007797009,True,negligible,positive
1,flesch_vacca,19.97,34.23,34.09,0.7133791,False,negligible,negative
2,vdb,73.24,80.44,80.18,0.4700749,False,negligible,negative
3,passive,20.87,15.78,15.81,0.7221885,False,negligible,positive
4,readit_base,0.76,0.69,0.58,1.4203250000000002e-17,True,small,negative
5,readit_lexical,0.94,0.85,0.77,2.034657e-05,True,negligible,negative
6,readit_syntactic,0.64,0.53,0.41,1.111975e-08,True,small,negative
7,readit_global,0.86,0.69,0.59,5.786003e-08,True,small,negative
8,semantic_similarity,,96.52,94.96,7.248210000000001e-27,True,medium,negative
9,editdistance,,35.84,55.48,8.082642999999999e-63,True,large,positive


In [11]:
# Compare the 'llama3' corpus with the 'human2' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human2'],
    llm_df=dfs_maps['llama3'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,50.64,50.26,0.5739604,False,negligible,negative
1,flesch_vacca,19.97,33.63,34.09,0.4316865,False,negligible,positive
2,vdb,73.24,76.89,80.18,7.187038999999999e-19,True,small,positive
3,passive,20.87,17.71,15.81,0.1313919,False,negligible,negative
4,readit_base,0.76,0.51,0.58,7.322538e-06,True,negligible,positive
5,readit_lexical,0.94,0.9,0.77,3.270669e-15,True,small,negative
6,readit_syntactic,0.64,0.4,0.41,0.4777445,False,negligible,positive
7,readit_global,0.86,0.61,0.59,0.2693113,False,negligible,negative
8,semantic_similarity,,97.26,94.96,2.3224229999999996e-50,True,large,negative
9,editdistance,,29.2,55.48,3.1311359999999996e-85,True,large,positive


## Phi-3

In [12]:
# Compare the 'phi3' corpus with the 'human1' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human1'],
    llm_df=dfs_maps['phi3'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,49.72,50.16,0.01344847,True,negligible,positive
1,flesch_vacca,19.97,34.23,33.75,0.3913985,False,negligible,negative
2,vdb,73.24,80.44,80.16,0.5135307,False,negligible,negative
3,passive,20.87,15.78,15.72,0.7426201,False,negligible,negative
4,readit_base,0.76,0.69,0.58,1.3069240000000002e-17,True,small,negative
5,readit_lexical,0.94,0.85,0.76,1.171692e-07,True,negligible,negative
6,readit_syntactic,0.64,0.53,0.41,1.360289e-08,True,small,negative
7,readit_global,0.86,0.69,0.58,5.572763e-08,True,small,negative
8,semantic_similarity,,96.52,94.96,4.993544e-27,True,medium,negative
9,editdistance,,35.84,55.44,1.5041929999999999e-63,True,large,positive


In [13]:
# Compare the 'phi3' corpus with the 'human2' corpus, using 'original' as the baseline.
do_stat_analysis(
    original_df=dfs_maps['original'],
    human_df=dfs_maps['human2'],
    llm_df=dfs_maps['phi3'],
)

Unnamed: 0,metrics,original_mean,human_mean,llm_mean,p_value,p_value_ok,eff_size,eff
0,gulpease,44.31,50.64,50.16,0.4153815,False,negligible,negative
1,flesch_vacca,19.97,33.63,33.75,0.8484467,False,negligible,positive
2,vdb,73.24,76.89,80.16,1.164671e-18,True,small,positive
3,passive,20.87,17.71,15.72,0.1063253,False,negligible,negative
4,readit_base,0.76,0.51,0.58,2.587388e-05,True,negligible,positive
5,readit_lexical,0.94,0.9,0.76,3.038695e-18,True,small,negative
6,readit_syntactic,0.64,0.4,0.41,0.19452,False,negligible,positive
7,readit_global,0.86,0.61,0.58,0.1770465,False,negligible,negative
8,semantic_similarity,,97.26,94.96,1.1476059999999999e-50,True,large,negative
9,editdistance,,29.2,55.44,3.00374e-85,True,large,positive


# Correlation (semantic_similarity vs editdistance)

In [14]:
# Correlation between semantic similarity and normalised edit distance for
# every simplified corpus. Each test is run once and both its coefficient
# (index 0) and p-value (index 1) are read from the same result.
corr_tests = {
    'pearsonr': pearsonr,
    'spearmanr': spearmanr,
    'kendalltau': kendalltau,
}

corr_rows = []
for corpus_name, corpus_df in dfs_maps.items():
    if corpus_name == 'original':
        # The original corpus is the reference, not a simplification.
        continue
    similarity = corpus_df['semantic_similarity']
    # Edit distance as a percentage of the longer of the two texts.
    longest = pd.concat([dfs_maps['original']['n_chars'], corpus_df['n_chars']], axis=1).max(axis=1)
    edit_pct = corpus_df['editdistance'] / longest * 100

    row = {'corpus': corpus_name}
    for test_name, test_fn in corr_tests.items():
        outcome = test_fn(similarity, edit_pct)
        row[test_name] = outcome[0]
        row[f'{test_name}_ok'] = outcome[1] < 0.05
    corr_rows.append(row)

pd.DataFrame(corr_rows).head(20)

Unnamed: 0,corpus,pearsonr,pearsonr_ok,spearmanr,spearmanr_ok,kendalltau,kendalltau_ok
0,human1,-0.780257,True,-0.81779,True,-0.628587,True
1,human2,-0.815397,True,-0.854619,True,-0.679401,True
2,gpt3_5,-0.529278,True,-0.54744,True,-0.381988,True
3,gpt4,-0.555168,True,-0.529365,True,-0.374514,True
4,llama3,-0.714993,True,-0.659249,True,-0.481106,True
5,phi3,-0.719746,True,-0.661604,True,-0.483106,True
