## Setup

In [1]:
import json
import pandas as pd

# Utils

In [2]:
def merge_sets(sets):
  """Return the union of all collections in ``sets`` as one new set.

  Args:
    sets: Iterable of sets. Any iterable of hashable items is accepted as an
      element, because ``set.union`` takes arbitrary iterables.

  Returns:
    A new ``set`` holding every distinct element found in ``sets``; an empty
    set when ``sets`` is empty. The inputs are not modified.
  """
  # One variadic union fills a single result set, instead of copying the
  # growing accumulator on every iteration (``merged = merged.union(s)``).
  return set().union(*sets)

## Load datasets

In [3]:
# Names of the corpus variants analysed below. Each name is used to build the
# `<name>_metrics.csv` / `<name>_raw_data.json` file names when loading, and
# 'text' is the variant that the comparison cells label as "original".
TEXTS = [
  'text',
  'proofreading',
  'lex',
  'connectives',
  'expressions',
  'sentence_splitter',
  'nominalizations',
  'verbs',
  'sentence_reorganizer',
]

In [4]:
# For every corpus variant named in TEXTS, load its metrics table (CSV) into
# `metrics_dfs` and its raw data (JSON) into `raw_data`, both keyed by the
# variant name. Later cells read `raw_data[TEXT]` as a sequence of entries
# with keys such as 'tokens' and 'lemmas'.
metrics_dfs = {}
raw_data = {}

for TEXT in TEXTS:
    metrics_dfs[TEXT] = pd.read_csv(f'./metrics/simplification/{TEXT}_metrics.csv')
    # Open through a context manager so the handle is closed as soon as the
    # file is parsed (a bare `json.load(open(...))` never closes it
    # explicitly), and decode as UTF-8 rather than the platform default.
    with open(f'./metrics/simplification/{TEXT}_raw_data.json', encoding='utf-8') as raw_file:
        raw_data[TEXT] = json.load(raw_file)

# Basic

In [5]:
# Corpus-size summary: one record per corpus variant, built from the column
# totals of its metrics frame plus the vocabulary found in its raw data.
d = []
for TEXT, df in metrics_dfs.items():
  entries = raw_data[TEXT]
  n_rows = df.shape[0]  # the labels below treat each metrics row as a paragraph
  token_total = df['n_tokens'].sum()
  sentence_total = df['n_sentences'].sum()

  record = {'Corpus': TEXT}
  record['Tokens'] = token_total
  record['Tokens (con punteg.)'] = df['n_tokens_all'].sum()
  record['Caratteri'] = df['n_chars'].sum()
  record['Caratteri (con punt)'] = df['n_chars_all'].sum()
  record['Sillabe'] = df['n_syllables'].sum()
  record['Frasi'] = sentence_total
  # Types / lemmas: number of distinct values across all raw-data entries.
  record['Types'] = len(merge_sets(set(entry['tokens']) for entry in entries))
  record['Lemmi'] = len(merge_sets(set(entry['lemmas']) for entry in entries))
  record['Tokens per frase'] = token_total / sentence_total
  record['Tokens per paragrafo'] = token_total / n_rows
  record['Frasi per paragrafo'] = sentence_total / n_rows
  d.append(record)

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per paragrafo,Frasi per paragrafo
0,text,15197,17083,83486,85394,34753,589,2897,2272,25.801358,208.178082,8.068493
1,proofreading,14793,16865,82842,84934,34145,594,2871,2239,24.90404,202.643836,8.136986
2,lex,14599,16648,82194,84263,33780,585,2847,2222,24.955556,199.986301,8.013699
3,connectives,14415,16459,81333,83397,33412,587,2840,2212,24.55707,197.465753,8.041096
4,expressions,14025,16040,78924,80959,32411,586,2767,2159,23.933447,192.123288,8.027397
5,sentence_splitter,14243,16322,80837,82936,33153,765,2801,2177,18.618301,195.109589,10.479452
6,nominalizations,14130,16204,80126,82220,32906,765,2825,2187,18.470588,193.561644,10.479452
7,verbs,14162,16237,80214,82309,32931,784,2843,2192,18.063776,194.0,10.739726
8,sentence_reorganizer,14229,16289,80861,82941,33140,787,2843,2196,18.080051,194.917808,10.780822


# POS

In [6]:
# Displayed column label -> metrics column whose values are summed.
# NOTE(review): 'Agettivi' looks like a misspelling of 'Aggettivi'; it is kept
# unchanged here because it is the column label that gets displayed.
POS_COUNT_COLUMNS = {
  'Altro':         'n_other',
  'Nomi':          'n_nouns',
  'Verbi':         'n_verbs',
  'Numeri':        'n_number',
  'Simboli':       'n_symbols',
  'Avverbi':       'n_adverbs',
  'Articoli':      'n_articles',
  'Pronomi':       'n_pronouns',
  'Particelle':    'n_particles',
  'Agettivi':      'n_adjectives',
  'Preposizioni':  'n_prepositions',
  'Nomi propri':   'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni':  'n_interjections',
  'Cong. coord.':  'n_coordinating_conjunctions',
  'Cong. sub.':    'n_subordinating_conjunctions',
}

# One record per corpus variant holding the column total of each category.
d = []
for TEXT, df in metrics_dfs.items():
  record = {'Corpus': TEXT}
  for label, column in POS_COUNT_COLUMNS.items():
    record[label] = df[column].sum()
  d.append(record)

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,text,20,4262,1523,618,15,286,1339,276,0,1342,3349,913,1860,0,455,101
1,proofreading,26,4223,1530,625,10,291,1329,276,1,1331,3321,924,2062,0,461,101
2,lex,27,4113,1528,567,10,291,1328,277,1,1318,3304,927,2043,0,461,101
3,connectives,27,4017,1553,560,8,289,1381,273,1,1307,3168,916,2039,0,458,112
4,expressions,26,3912,1531,559,8,258,1373,248,1,1244,3036,914,2010,0,457,114
5,sentence_splitter,24,4001,1674,557,12,288,1462,243,1,1252,3001,906,2074,0,407,108
6,nominalizations,24,3853,1823,557,12,293,1474,241,1,1237,2894,898,2069,0,407,110
7,verbs,23,3893,1738,559,12,290,1545,279,1,1230,2859,908,2070,0,407,114
8,sentence_reorganizer,19,3929,1755,551,12,293,1581,267,1,1229,2866,915,2055,0,405,114


In [7]:
# Displayed column label -> metrics column used as the numerator.
# NOTE(review): 'Agettivi' and 'Cong. subordiante' look like misspellings
# ('Aggettivi', 'subordinate'); they are kept unchanged because they are the
# column labels that get displayed.
POS_SHARE_COLUMNS = {
  'Altro':               'n_other',
  'Nomi':                'n_nouns',
  'Verbi':               'n_verbs',
  'Numeri':              'n_number',
  'Simboli':             'n_symbols',
  'Avverbi':             'n_adverbs',
  'Articoli':            'n_articles',
  'Pronomi':             'n_pronouns',
  'Particelle':          'n_particles',
  'Agettivi':            'n_adjectives',
  'Preposizioni':        'n_prepositions',
  'Nomi propri':         'n_proper_nouns',
  'Punteggiatura':       'n_punctuations',
  'Interiezioni':        'n_interjections',
  'Cong. coordinati':    'n_coordinating_conjunctions',
  'Cong. subordiante':   'n_subordinating_conjunctions',
}

d = []
for TEXT, df in metrics_dfs.items():
  tokens_all = df['n_tokens_all']
  record = {'Corpus': TEXT}
  for label, column in POS_SHARE_COLUMNS.items():
    # Mean over the rows of the per-row ratio, expressed as a percentage
    # rounded to two decimals.
    record[label] = round((df[column] / tokens_all).mean() * 100, 2)
  d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,text,0.09,24.54,10.04,2.89,0.09,1.8,8.52,2.07,0.0,7.82,19.24,5.39,10.88,0.0,2.69,0.55
1,proofreading,0.15,24.49,10.12,3.0,0.08,1.82,8.56,2.06,0.01,7.71,19.27,5.5,12.19,0.0,2.78,0.53
2,lex,0.15,24.25,10.2,2.79,0.08,1.84,8.64,2.08,0.01,7.74,19.4,5.55,12.22,0.0,2.82,0.54
3,connectives,0.15,23.9,10.43,2.8,0.07,1.84,9.15,2.07,0.01,7.79,18.76,5.56,12.33,0.0,2.83,0.6
4,expressions,0.15,23.9,10.63,2.88,0.08,1.66,9.29,2.0,0.01,7.49,18.44,5.69,12.49,0.0,2.89,0.67
5,sentence_splitter,0.14,24.06,11.35,2.8,0.09,1.83,9.76,1.91,0.01,7.38,17.78,5.49,12.7,0.0,2.49,0.61
6,nominalizations,0.15,23.2,12.54,2.82,0.09,1.88,9.93,1.89,0.01,7.32,17.13,5.52,12.77,0.0,2.51,0.63
7,verbs,0.14,23.41,11.87,2.81,0.09,1.86,10.53,2.26,0.01,7.26,16.71,5.54,12.75,0.0,2.5,0.68
8,sentence_reorganizer,0.12,23.65,11.89,2.74,0.09,1.88,10.77,2.16,0.01,7.25,16.77,5.56,12.45,0.0,2.48,0.68


# Passive

In [8]:
# Total active and passive verb counts for each corpus variant.
d = []
for TEXT, df in metrics_dfs.items():
  active_total = df['n_active_verbs'].sum()
  passive_total = df['n_passive_verbs'].sum()
  record = {'Corpus': TEXT}
  record['Verbi attivi'] = active_total
  record['Verbi passivi'] = passive_total
  d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,913,158
1,proofreading,914,160
2,lex,912,160
3,connectives,929,164
4,expressions,905,157
5,sentence_splitter,925,190
6,nominalizations,1072,190
7,verbs,1107,150
8,sentence_reorganizer,1129,149


In [9]:
# Share of active / passive verbs over all verbs, averaged across the rows of
# each metrics frame and reported as a percentage.
d = []
for TEXT, df in metrics_dfs.items():
  verbs = df['n_verbs']
  # fillna(0) turns NaN ratios (such as 0 / 0 on a row without verbs) into 0
  # so that those rows still take part in the mean.
  active_share = (df['n_active_verbs'] / verbs).fillna(0)
  passive_share = (df['n_passive_verbs'] / verbs).fillna(0)
  d.append({
    'Corpus': TEXT,
    'Verbi attivi': round(active_share.mean() * 100, 2),
    'Verbi passivi': round(passive_share.mean() * 100, 2),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,60.36,9.74
1,proofreading,60.5,9.89
2,lex,60.42,9.94
3,connectives,60.62,9.95
4,expressions,59.9,9.23
5,sentence_splitter,56.45,10.21
6,nominalizations,59.35,9.56
7,verbs,65.6,7.2
8,sentence_reorganizer,66.54,6.97


# NVdB

In [10]:
# Displayed column label -> metrics column whose values are summed.
# (Presumably the `n_vdb*` columns count tokens found in the NVdB word list
# and its FO / AU / AD bands — confirm against the metrics producer.)
VDB_COUNT_COLUMNS = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  record = {'Corpus': TEXT}
  record.update({label: df[column].sum() for label, column in VDB_COUNT_COLUMNS.items()})
  d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,9566,8190,1373,1355
1,proofreading,9548,8183,1365,1351
2,lex,9432,8071,1362,1346
3,connectives,9411,8085,1315,1306
4,expressions,9177,7903,1259,1254
5,sentence_splitter,9423,8114,1292,1200
6,nominalizations,9452,8158,1279,1200
7,verbs,9511,8208,1289,1200
8,sentence_reorganizer,9582,8268,1294,1205


In [11]:
# Displayed column label -> metrics column used as the numerator.
VDB_SHARE_COLUMNS = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  tokens = df['n_tokens']
  record = {'Corpus': TEXT}
  for label, column in VDB_SHARE_COLUMNS.items():
    # Per-row ratio over `n_tokens`, NaN replaced by 0, then the mean over
    # rows as a percentage rounded to two decimals.
    ratio = (df[column] / tokens).fillna(0)
    record[label] = round(ratio.mean() * 100, 2)
  d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,65.31,55.79,9.61,8.72
1,proofreading,66.51,56.84,9.78,8.9
2,lex,66.56,56.8,9.87,8.98
3,connectives,67.46,57.86,9.62,8.86
4,expressions,67.78,58.48,9.28,8.64
5,sentence_splitter,68.48,59.01,9.44,8.14
6,nominalizations,69.42,60.05,9.37,8.27
7,verbs,69.89,60.52,9.38,8.2
8,sentence_reorganizer,70.03,60.65,9.38,8.16


## Expressions

In [12]:
# Total number of difficult connectives and latinisms per corpus variant.
d = []
for TEXT, df in metrics_dfs.items():
  connectives_total = df['n_difficult_connectives'].sum()
  latinisms_total = df['n_latinisms'].sum()
  record = {'Corpus': TEXT}
  record['difficult_connectives'] = connectives_total
  record['latinisms'] = latinisms_total
  d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,difficult_connectives,latinisms
0,text,129,2
1,proofreading,158,2
2,lex,158,2
3,connectives,29,2
4,expressions,33,2
5,sentence_splitter,37,2
6,nominalizations,36,2
7,verbs,33,2
8,sentence_reorganizer,33,2


In [13]:
# Number of distinct difficult connectives and latinisms per corpus variant,
# obtained by merging the values listed in every raw-data entry.
d = []
for TEXT, df in metrics_dfs.items():
  entries = raw_data[TEXT]
  distinct_connectives = merge_sets(set(entry['difficult_connectives']) for entry in entries)
  distinct_latinisms = merge_sets(set(entry['latinisms']) for entry in entries)
  d.append({
    'Corpus': TEXT,
    'unique_difficult_connectives': len(distinct_connectives),
    'unique_latinisms': len(distinct_latinisms),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,unique_difficult_connectives,unique_latinisms
0,text,51,1
1,proofreading,66,1
2,lex,66,1
3,connectives,19,1
4,expressions,22,1
5,sentence_splitter,23,1
6,nominalizations,23,1
7,verbs,22,1
8,sentence_reorganizer,22,1


# Readability

In [14]:
# Displayed column label -> metrics column whose row mean is reported.
READABILITY_COLUMNS = {
  'ttr':             'ttr',
  'gulpease_index':  'gulpease',
  'flesch_vacca':    'flesch_vacca',
  'lexical_density': 'lexical_density',
}

d = []
for TEXT, df in metrics_dfs.items():
  record = {'Corpus': TEXT}
  for label, column in READABILITY_COLUMNS.items():
    record[label] = round(df[column].mean(), 2)
  d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,text,68.16,44.83,24.68,49.66
1,proofreading,68.93,44.33,24.79,50.3
2,lex,69.08,44.1,24.72,50.16
3,connectives,69.09,44.31,25.02,50.13
4,expressions,69.59,45.02,27.95,49.9
5,sentence_splitter,69.9,48.74,33.98,51.12
6,nominalizations,70.24,48.82,34.24,51.52
7,verbs,69.29,49.13,34.87,50.88
8,sentence_reorganizer,68.75,48.87,34.88,51.01


# Semantic similarity

In [15]:
# Mean `semantic_similarity` of every variant except 'text', which is the
# one the row labels refer to as "original".
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    mean_similarity = df['semantic_similarity'].mean()
    d.append({
      'Corpus': f'original vs {TEXT}',
      'semantic_similarity': round(mean_similarity, 2),
    })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,original vs proofreading,99.61
1,original vs lex,99.5
2,original vs connectives,99.32
3,original vs expressions,98.8
4,original vs sentence_splitter,98.46
5,original vs nominalizations,98.3
6,original vs verbs,97.98
7,original vs sentence_reorganizer,97.88


# Distance

In [16]:
# Edit-distance and added / deleted token totals for every variant except
# 'text', with the share of added / deleted tokens counted by the
# `*_vdb_tokens` columns.
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    added = df['n_added_tokens'].sum()
    added_vdb = df['n_added_vdb_tokens'].sum()
    deleted = df['n_deleted_tokens'].sum()
    deleted_vdb = df['n_deleted_vdb_tokens'].sum()

    record = {'Corpus': f'original vs {TEXT}'}
    record['editdistance'] = df['editdistance'].sum()
    record['added_tokens'] = added
    record['added_vdb_tokens'] = added_vdb
    record['%_added_vdb_tokens'] = round(added_vdb / added * 100, 2)
    record['deleted_tokens'] = deleted
    record['deleted_vdb_tokens'] = deleted_vdb
    record['%_deleted_vdb_tokens'] = round(deleted_vdb / deleted * 100, 2)
    d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,original vs proofreading,2042,603,244,40.46,812,247,30.42
1,original vs lex,3229,692,278,40.17,1057,375,35.48
2,original vs connectives,5635,928,511,55.06,1410,608,43.12
3,original vs expressions,10881,1364,877,64.3,2072,1088,52.51
4,original vs sentence_splitter,14906,1771,1230,69.45,2306,1202,52.12
5,original vs nominalizations,16423,1948,1374,70.53,2506,1296,51.72
6,original vs verbs,19088,2175,1570,72.18,2765,1501,54.29
7,original vs sentence_reorganizer,20263,2205,1573,71.34,2850,1507,52.88


In [17]:
# Displayed column label -> metrics column divided by the variant's `n_tokens`.
TOKEN_CHANGE_COLUMNS = {
  'added_tokens':       'n_added_tokens',
  'added_vdb_tokens':   'n_added_vdb_tokens',
  'deleted_tokens':     'n_deleted_tokens',
  'deleted_vdb_tokens': 'n_deleted_vdb_tokens',
}

d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    # Row-wise maximum of the `n_chars` of the 'text' frame and of this
    # variant; used as the denominator for the edit distance.
    char_columns = pd.concat([metrics_dfs['text']['n_chars'], df['n_chars']], axis=1)
    longest_chars = char_columns.max(axis=1)

    record = {'Corpus': f'original vs {TEXT}'}
    record['editdistance'] = round((df['editdistance'] / longest_chars).mean() * 100, 2)
    for label, column in TOKEN_CHANGE_COLUMNS.items():
      # Mean over rows of the per-row ratio, as a percentage.
      record[label] = round((df[column] / df['n_tokens']).mean() * 100, 2)
    d.append(record)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,original vs proofreading,2.38,4.57,2.16,5.49,2.04
1,original vs lex,3.39,5.03,2.33,6.86,2.7
2,original vs connectives,6.18,6.87,4.13,9.66,4.41
3,original vs expressions,14.71,12.72,9.12,17.47,10.23
4,original vs sentence_splitter,19.25,15.83,11.83,18.55,10.75
5,original vs nominalizations,21.38,17.5,13.27,20.46,11.55
6,original vs verbs,24.81,19.35,15.03,22.66,13.41
7,original vs sentence_reorganizer,27.2,19.4,14.91,23.26,13.5


# Slide

In [18]:
# Condensed summary table: one record per corpus variant combining a
# selection of the figures computed in the cells above.
# The accumulator is named `slide_rows` rather than `all`, so the builtin
# `all()` is not rebound to a list for the rest of the kernel session.
slide_rows = []
for TEXT, df in metrics_dfs.items():
  d = {
    'Corpus':             f'{TEXT}',
    'Tokens':               df['n_tokens'].sum(),
    # Number of distinct tokens across all raw-data entries of the variant.
    'Types':                len(merge_sets([set(j['tokens']) for j in raw_data[TEXT]])),
    'Frasi':                df['n_sentences'].sum(),
    # Per-row ratios (NaN replaced by 0), averaged and shown as percentages.
    'Verbi attivi':   round((df['n_active_verbs'] / df['n_verbs']).fillna(0).mean() * 100, 2),
    'ALL':    round((df['n_vdb'] / df['n_tokens']).fillna(0).mean() * 100, 2),
    'difficult_connectives':  df['n_difficult_connectives'].sum(),
    'gulpease_index':     round(df['gulpease'].mean(), 2),
    'flesch_vacca':       round(df['flesch_vacca'].mean(), 2)
  }
  # The comparison columns are only filled for variants other than 'text';
  # the 'text' record therefore shows NaN for them in the resulting frame.
  if TEXT != 'text':
    d['semantic_similarity'] = round(df['semantic_similarity'].mean(), 2)
    # Edit distance divided by the row-wise maximum of the `n_chars` of the
    # 'text' frame and of this variant, averaged and shown as a percentage.
    d['editdistance'] = round((df['editdistance'] / pd.concat([metrics_dfs['text']['n_chars'], df['n_chars']], axis=1).max(axis=1)).mean() * 100, 2)
  slide_rows.append(d)

pd.DataFrame(slide_rows).head(10)

Unnamed: 0,Corpus,Tokens,Types,Frasi,Verbi attivi,ALL,difficult_connectives,gulpease_index,flesch_vacca,semantic_similarity,editdistance
0,text,15197,2897,589,60.36,65.31,129,44.83,24.68,,
1,proofreading,14793,2871,594,60.5,66.51,158,44.33,24.79,99.61,2.38
2,lex,14599,2847,585,60.42,66.56,158,44.1,24.72,99.5,3.39
3,connectives,14415,2840,587,60.62,67.46,29,44.31,25.02,99.32,6.18
4,expressions,14025,2767,586,59.9,67.78,33,45.02,27.95,98.8,14.71
5,sentence_splitter,14243,2801,765,56.45,68.48,37,48.74,33.98,98.46,19.25
6,nominalizations,14130,2825,765,59.35,69.42,36,48.82,34.24,98.3,21.38
7,verbs,14162,2843,784,65.6,69.89,33,49.13,34.87,97.98,24.81
8,sentence_reorganizer,14229,2843,787,66.54,70.03,33,48.87,34.88,97.88,27.2
