# Metrics overview

## Libs

In [1]:
import json

import pandas as pd

## Utils

In [2]:
def merge_sets(sets):
  """Return the union of every collection in `sets` as a new set.

  Args:
    sets: An iterable of sets (any iterables of hashable items work).

  Returns:
    A new `set` holding each distinct element found in any input; an empty
    set when `sets` is empty. The inputs are never modified.
  """
  # One variadic union call instead of `merged = merged.union(s)` in a loop,
  # which copied the whole accumulated set on every iteration.
  return set().union(*sets)

## Load data

In [3]:
# Corpus identifiers. Each one names a `<name>.csv` / `<name>.json` pair read
# from simplified_corpora_with_metrics/ by the loading cell; 'original' is the
# baseline that the "original vs ..." tables compare the other corpora against.
CORPUS_NAMES = [
  'original',
  'human1',
  'human2',
  'gpt3_5',
  'gpt4',
  'llama3',
  'phi3',
]

In [4]:
# Load every corpus in two forms, both keyed by corpus name:
#   dfs_maps   -> metrics table read from <name>.csv, sorted by document and
#                 paragraph_index (presumably one row per paragraph - confirm)
#   jsons_maps -> parsed content of <name>.json (iterated later as records
#                 exposing 'tokens' and 'lemmas')
dfs_maps = dict()
jsons_maps = dict()
for CORPUS_NAME in CORPUS_NAMES:
  print(CORPUS_NAME)
  tmp_df = pd.read_csv(f'simplified_corpora_with_metrics/{CORPUS_NAME}.csv', encoding='utf-8')
  # sort_values keeps the original row labels; only the row order changes.
  tmp_df = tmp_df.sort_values(by=['document', 'paragraph_index'])
  # The context manager closes the file deterministically; passing a bare
  # open(...) straight to json.load left the handle open.
  with open(f'simplified_corpora_with_metrics/{CORPUS_NAME}.json', 'r', encoding='utf-8') as json_file:
    tmp_json = json.load(json_file)
  print(tmp_df.shape)
  dfs_maps[CORPUS_NAME] = tmp_df
  jsons_maps[CORPUS_NAME] = tmp_json

original
(619, 41)
human1
(619, 48)
human2
(619, 48)
gpt3_5
(619, 48)
gpt4
(619, 48)
llama3
(619, 48)
phi3
(619, 48)


## Basic

In [5]:
# Corpus-level size statistics: one summary row per corpus.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  # Totals shared by several ratios below are computed once per corpus
  # instead of being re-summed for every ratio.
  total_tokens = df['n_tokens'].sum()
  total_sentences = df['n_sentences'].sum()
  n_documents = df['document'].nunique()
  n_rows = df.shape[0]  # row count; used as the paragraph count by the 'per paragrafo' ratios
  corpus_records = jsons_maps[CORPUS_NAME]
  d.append({
    'Corpus': CORPUS_NAME,
    'Tokens': total_tokens,
    'Tokens (con punteg.)': df['n_tokens_all'].sum(),
    'Caratteri': df['n_chars'].sum(),
    'Caratteri (con punt)': df['n_chars_all'].sum(),
    'Sillabe': df['n_syllables'].sum(),
    'Frasi': total_sentences,
    # Number of distinct values across all records of the corpus.
    'Types': len(merge_sets([set(j['tokens']) for j in corpus_records])),
    'Lemmi': len(merge_sets([set(j['lemmas']) for j in corpus_records])),
    # Ratio of the corpus totals ...
    'Tokens per frase': total_tokens / total_sentences,
    # ... versus the mean of the per-row ratios.
    'Tokens per frase2': (df['n_tokens'] / df['n_sentences']).mean(),
    'Tokens per documento': total_tokens / n_documents,
    'Tokens per paragrafo': total_tokens / n_rows,
    'Frasi per documento': total_sentences / n_documents,
    'Frasi per paragrafo': total_sentences / n_rows,
  })

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per frase2,Tokens per documento,Tokens per paragrafo,Frasi per documento,Frasi per paragrafo
0,original,33295,37429,191925,196071,79438,1314,5622,4096,25.338661,29.677564,4161.875,53.788368,164.25,2.122779
1,human1,34135,38193,181872,185945,76008,1506,5270,3640,22.666003,26.08985,4266.875,55.145396,188.25,2.432956
2,human2,29755,33933,166464,170654,69169,1744,5143,3693,17.061353,19.531686,3719.375,48.069467,218.0,2.817447
3,gpt3_5,30032,33836,169761,173567,70641,1515,5054,3560,19.823102,22.025924,3754.0,48.516963,189.375,2.447496
4,gpt4,31722,36017,175147,179442,73110,1840,4930,3376,17.240217,19.311389,3965.25,51.247173,230.0,2.972536
5,llama3,36035,41655,199251,204884,83598,1944,5246,3735,18.536523,21.486218,4504.375,58.214863,243.0,3.140549
6,phi3,36056,41748,199362,205060,83673,1900,5269,3758,18.976842,21.45154,4507.0,58.248788,237.5,3.069467


## Pos

In [6]:
# Display label -> count column, listed in the column order of the table.
POS_COUNT_COLUMNS = {
  'Altro': 'n_other',
  'Nomi': 'n_nouns',
  'Verbi': 'n_verbs',
  'Numeri': 'n_number',
  'Simboli': 'n_symbols',
  'Avverbi': 'n_adverbs',
  'Articoli': 'n_articles',
  'Pronomi': 'n_pronouns',
  'Particelle': 'n_particles',
  'Agettivi': 'n_adjectives',
  'Preposizioni': 'n_prepositions',
  'Nomi propri': 'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni': 'n_interjections',
  'Cong. coord.': 'n_coordinating_conjunctions',
  'Cong. sub.': 'n_subordinating_conjunctions',
}

# One row per corpus holding the column total of every count above.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  pos_totals = {
    label: df[column].sum()
    for label, column in POS_COUNT_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **pos_totals})

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,original,26,10206,3496,739,32,786,3313,658,0,3630,7786,1055,4089,0,1445,168
1,human1,22,9493,5047,763,34,944,4039,1266,0,2858,6584,1163,4013,0,1543,424
2,human2,22,8891,3975,707,33,796,3838,650,0,2740,5620,1021,4135,0,1196,309
3,gpt3_5,16,8395,4390,700,32,784,3775,750,0,2911,5708,961,3775,1,1257,381
4,gpt4,9,8484,5142,780,36,1035,4400,917,0,2794,5327,954,4264,0,1378,497
5,llama3,9,10736,4839,743,36,797,4830,640,0,3272,7016,979,5582,1,1807,368
6,phi3,9,10811,4829,734,34,797,4779,621,0,3289,7043,996,5615,0,1805,386


In [7]:
# Display label -> count column, listed in the column order of the table.
POS_SHARE_COLUMNS = {
  'Altro': 'n_other',
  'Nomi': 'n_nouns',
  'Verbi': 'n_verbs',
  'Numeri': 'n_number',
  'Simboli': 'n_symbols',
  'Avverbi': 'n_adverbs',
  'Articoli': 'n_articles',
  'Pronomi': 'n_pronouns',
  'Particelle': 'n_particles',
  'Agettivi': 'n_adjectives',
  'Preposizioni': 'n_prepositions',
  'Nomi propri': 'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni': 'n_interjections',
  'Cong. coordinati': 'n_coordinating_conjunctions',
  'Cong. subordiante': 'n_subordinating_conjunctions',
}

# For each corpus: the per-row share of every count over `n_tokens_all`,
# averaged across rows and expressed as a percentage with two decimals.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  pos_shares = {
    label: round((df[column] / df['n_tokens_all']).mean() * 100, 2)
    for label, column in POS_SHARE_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **pos_shares})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,original,0.07,27.83,9.83,1.72,0.08,2.19,8.91,1.66,0.0,9.77,21.13,2.54,10.16,0.0,3.68,0.43
1,human1,0.05,25.09,13.9,1.69,0.08,2.6,10.77,3.37,0.0,7.49,17.43,2.73,9.76,0.0,3.87,1.17
2,human2,0.06,26.64,12.1,1.85,0.08,2.59,11.17,1.91,0.0,8.24,17.17,2.6,11.29,0.0,3.39,0.9
3,gpt3_5,0.04,24.92,13.86,1.83,0.08,2.63,11.2,2.28,0.0,8.4,17.02,2.51,10.44,0.0,3.54,1.25
4,gpt4,0.02,23.6,15.15,1.9,0.09,3.09,12.42,2.76,0.0,7.65,14.93,2.35,10.91,0.0,3.66,1.45
5,llama3,0.03,25.33,12.67,1.83,0.09,2.27,11.95,1.73,0.0,7.79,16.72,2.26,12.26,0.0,4.08,0.99
6,phi3,0.03,25.31,12.69,1.81,0.09,2.31,11.94,1.68,0.0,7.87,16.6,2.28,12.31,0.0,4.06,1.02


## Passive

In [8]:
# Display label -> count column for the two verb-voice counters.
VOICE_COUNT_COLUMNS = {
  'Verbi attivi': 'n_active_verbs',
  'Verbi passivi': 'n_passive_verbs',
}

# One row per corpus with the column total of each counter.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  voice_totals = {
    label: df[column].sum()
    for label, column in VOICE_COUNT_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **voice_totals})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,original,2684,812
1,human1,4124,923
2,human2,3184,791
3,gpt3_5,3662,728
4,gpt4,4450,692
5,llama3,4056,783
6,phi3,4020,809


In [9]:
# Display label -> count column for the two verb-voice counters.
VOICE_SHARE_COLUMNS = {
  'Verbi attivi': 'n_active_verbs',
  'Verbi passivi': 'n_passive_verbs',
}

# Per-row share of each voice over `n_verbs`, averaged across rows and shown
# as a percentage. fillna(0) turns NaN ratios (e.g. 0 / 0 on rows without
# verbs) into 0 so those rows still count in the mean.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  voice_shares = {
    label: round((df[column] / df['n_verbs']).fillna(0).mean() * 100, 2)
    for label, column in VOICE_SHARE_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **voice_shares})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,original,75.89,20.87
1,human1,83.42,15.78
2,human2,79.38,17.71
3,gpt3_5,84.23,13.99
4,gpt4,87.67,12.0
5,llama3,83.87,15.81
6,phi3,84.12,15.72


## NVdB

In [10]:
# Display label -> count column.
# NOTE(review): FO / AU / AD are presumably the bands of the "Nuovo vocabolario
# di base" named in the section title - confirm against the metrics pipeline.
NVDB_COUNT_COLUMNS = {
  'ALL': 'n_vdb',
  'FO': 'n_vdb_fo',
  'AU': 'n_vdb_au',
  'AD': 'n_vdb_ad',
}

# One row per corpus with the column total of each counter.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  vdb_totals = {
    label: df[column].sum()
    for label, column in NVDB_COUNT_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **vdb_totals})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,original,24185,20113,4205,3823
1,human1,27155,24097,3174,3553
2,human2,22780,19561,3196,2794
3,gpt3_5,23293,20166,3148,2850
4,gpt4,25513,22549,2987,2931
5,llama3,28829,25200,3655,3898
6,phi3,28849,25203,3670,3966


In [11]:
# Display label -> count column.
NVDB_SHARE_COLUMNS = {
  'ALL': 'n_vdb',
  'FO': 'n_vdb_fo',
  'AU': 'n_vdb_au',
  'AD': 'n_vdb_ad',
}

# Per-row share of each counter over `n_tokens`, averaged across rows and
# shown as a percentage. fillna(0) turns NaN ratios (e.g. 0 / 0 on rows with
# no tokens) into 0 so those rows still count in the mean.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  vdb_shares = {
    label: round((df[column] / df['n_tokens']).fillna(0).mean() * 100, 2)
    for label, column in NVDB_SHARE_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **vdb_shares})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,original,73.24,60.96,12.61,11.37
1,human1,80.44,71.75,8.99,10.08
2,human2,76.89,66.09,10.68,9.23
3,gpt3_5,78.28,68.13,10.12,9.14
4,gpt4,81.07,71.91,9.12,9.03
5,llama3,80.18,70.16,10.04,10.19
6,phi3,80.16,70.09,10.11,10.19


## Readability

In [12]:
# Display label -> source column; only the Gulpease label differs from its
# column name.
READABILITY_COLUMNS = {
  'ttr': 'ttr',
  'gulpease_index': 'gulpease',
  'flesch_vacca': 'flesch_vacca',
  'lexical_density': 'lexical_density',
}

# One row per corpus: the mean of each column, rounded to two decimals.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  readability_means = {
    label: round(df[column].mean(), 2)
    for label, column in READABILITY_COLUMNS.items()
  }
  d.append({'Corpus': CORPUS_NAME, **readability_means})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,original,86.15,44.31,19.97,0.55
1,human1,84.56,49.72,34.23,0.54
2,human2,86.3,50.64,33.63,0.56
3,gpt3_5,87.51,48.49,30.33,0.56
4,gpt4,86.64,51.34,36.75,0.56
5,llama3,82.26,50.26,34.09,0.55
6,phi3,82.09,50.16,33.75,0.55


## ReadIT

In [13]:
# Columns reported as-is (the display label equals the column name).
READIT_COLUMNS = [
  'readit_base',
  'readit_lexical',
  'readit_syntactic',
  'readit_global',
]

# One row per corpus: the mean of each column scaled by 100, rounded to two
# decimals.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  readit_means = {
    column: round(df[column].mean() * 100, 2)
    for column in READIT_COLUMNS
  }
  d.append({'Corpus': CORPUS_NAME, **readit_means})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,readit_base,readit_lexical,readit_syntactic,readit_global
0,original,75.91,93.64,63.72,86.48
1,human1,68.62,85.37,53.14,69.24
2,human2,51.0,89.71,40.09,61.34
3,gpt3_5,66.61,91.96,38.42,68.69
4,gpt4,55.0,90.29,29.92,54.6
5,llama3,58.37,77.13,40.97,59.26
6,phi3,57.69,75.74,41.24,58.37


## Semantic similarity

In [14]:
# Mean of the `semantic_similarity` column for every corpus except the
# baseline, rounded to two decimals.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  # 'original' is the reference side of each "original vs ..." row, so it
  # gets no row of its own.
  if CORPUS_NAME != 'original':
    mean_similarity = df['semantic_similarity'].mean()
    d.append({
      'Corpus': f'original vs {CORPUS_NAME}',
      'semantic_similarity': round(mean_similarity, 2),
    })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,original vs human1,96.52
1,original vs human2,97.26
2,original vs gpt3_5,96.06
3,original vs gpt4,95.8
4,original vs llama3,94.96
5,original vs phi3,94.96


## Distance

In [15]:
# Corpus-wide edit-distance and token-change totals for every corpus except
# the baseline, plus the share of added / deleted tokens counted in the
# matching `*_vdb_tokens` column.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  # 'original' is the reference side of each "original vs ..." row.
  if CORPUS_NAME != 'original':
    added_total = df['n_added_tokens'].sum()
    added_vdb_total = df['n_added_vdb_tokens'].sum()
    deleted_total = df['n_deleted_tokens'].sum()
    deleted_vdb_total = df['n_deleted_vdb_tokens'].sum()
    d.append({
      'Corpus': f'original vs {CORPUS_NAME}',
      'editdistance': df['editdistance'].sum(),
      'added_tokens': added_total,
      'added_vdb_tokens': added_vdb_total,
      '%_added_vdb_tokens': round(added_vdb_total / added_total * 100, 2),
      'deleted_tokens': deleted_total,
      'deleted_vdb_tokens': deleted_vdb_total,
      '%_deleted_vdb_tokens': round(deleted_vdb_total / deleted_total * 100, 2),
    })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,original vs human1,67468,10256,9044,88.18,9838,6629,67.38
1,original vs human2,63642,6627,5776,87.16,9552,6778,70.96
2,original vs gpt3_5,93928,10835,9047,83.5,13332,9506,71.3
3,original vs gpt4,105741,13788,11952,86.68,14864,10456,70.34
4,original vs llama3,140701,16461,13806,83.87,14736,10190,69.15
5,original vs phi3,141033,16373,13720,83.8,14748,10200,69.16


In [16]:
# Display label -> count column; each is divided row by row by `n_tokens`.
TOKEN_CHANGE_COLUMNS = {
  'added_tokens': 'n_added_tokens',
  'added_vdb_tokens': 'n_added_vdb_tokens',
  'deleted_tokens': 'n_deleted_tokens',
  'deleted_vdb_tokens': 'n_deleted_vdb_tokens',
}

# Row-level ratios averaged per corpus and shown as percentages, for every
# corpus except the baseline.
d = []
for CORPUS_NAME, df in dfs_maps.items():
  # 'original' is the reference side of each "original vs ..." row.
  if CORPUS_NAME != 'original':
    # Edit distance is normalised by the larger of the two `n_chars` values
    # (baseline vs this corpus) for each row.
    # NOTE(review): pd.concat(axis=1) and the division below align rows on
    # index labels, which sort_values leaves untouched - so rows are paired by
    # CSV row number, not by sorted position. Confirm every CSV lists its
    # paragraphs in the same order.
    chars_side_by_side = pd.concat([dfs_maps['original']['n_chars'], df['n_chars']], axis=1)
    longer_text_chars = chars_side_by_side.max(axis=1)
    # NOTE(review): deleted-token ratios use this corpus's `n_tokens` as the
    # denominator, not the baseline's - confirm that is intended.
    token_change_shares = {
      label: round((df[column] / df['n_tokens']).mean() * 100, 2)
      for label, column in TOKEN_CHANGE_COLUMNS.items()
    }
    d.append({
      'Corpus': f'original vs {CORPUS_NAME}',
      'editdistance': round((df['editdistance'] / longer_text_chars).mean() * 100, 2),
      **token_change_shares,
    })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,original vs human1,35.84,33.84,29.8,32.93,22.53
1,original vs human2,29.2,22.26,19.32,32.99,23.73
2,original vs gpt3_5,49.21,40.12,33.62,50.61,36.65
3,original vs gpt4,52.14,45.26,39.43,50.44,36.14
4,original vs llama3,55.48,42.97,36.41,50.9,35.99
5,original vs phi3,55.44,42.64,36.13,51.32,36.35
