## Setup

In [1]:
import json
import pandas as pd

# Utils

In [2]:
def merge_sets(sets):
  """Return the union of all collections in `sets` as a single new set.

  Args:
    sets: an iterable of sets (any iterables of hashable items are accepted).

  Returns:
    A new `set` with every element that appears in at least one input.
    An empty input yields an empty set. The inputs are never mutated.
  """
  # A single variadic union avoids re-copying the accumulated result on
  # every iteration, which the previous `merged = merged.union(s)` loop did.
  return set().union(*sets)

## Load datasets

In [3]:
# Names of the corpus variants analysed below; each name selects the
# `{name}_metrics.csv` / `{name}_raw_data.json` pair loaded from disk.
TEXTS = [
    'text',
    'proofreading_text',
    'lex_text',
    'connectives_text',
    'expressions_text',
    'sentence_splitter_text',
    'nominalizations_text',
    'verbs_text',
    'sentence_reorganizer_text',
]

In [4]:
# For every corpus variant, load its metrics table (CSV -> DataFrame) and its
# raw data (JSON). Both dicts are keyed by variant name, in TEXTS order.
metrics_dfs = {}
raw_data = {}

for TEXT in TEXTS:
    metrics_dfs[TEXT] = pd.read_csv(f'./metrics/corpus_train/{TEXT}_metrics.csv')
    # Use a context manager so the JSON file handle is closed right away
    # instead of being left open until garbage collection.
    with open(f'./metrics/corpus_train/{TEXT}_raw_data.json') as raw_file:
        raw_data[TEXT] = json.load(raw_file)

# Basic

In [5]:
# Size statistics: one summary row per corpus variant with raw totals,
# vocabulary sizes and average lengths.
d = []
for TEXT, df in metrics_dfs.items():
  documents = raw_data[TEXT]
  n_rows = df.shape[0]  # denominator of the "per paragrafo" averages
  token_total = df['n_tokens'].sum()
  sentence_total = df['n_sentences'].sum()

  # Vocabulary size = number of distinct tokens / lemmas over all entries.
  distinct_tokens = merge_sets([set(doc['tokens']) for doc in documents])
  distinct_lemmas = merge_sets([set(doc['lemmas']) for doc in documents])

  row = {'Corpus': TEXT}
  row['Tokens'] = token_total
  row['Tokens (con punteg.)'] = df['n_tokens_all'].sum()
  row['Caratteri'] = df['n_chars'].sum()
  row['Caratteri (con punt)'] = df['n_chars_all'].sum()
  row['Sillabe'] = df['n_syllables'].sum()
  row['Frasi'] = sentence_total
  row['Types'] = len(distinct_tokens)
  row['Lemmi'] = len(distinct_lemmas)
  row['Tokens per frase'] = token_total / sentence_total
  row['Tokens per paragrafo'] = token_total / n_rows
  row['Frasi per paragrafo'] = sentence_total / n_rows
  d.append(row)

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per paragrafo,Frasi per paragrafo
0,text,298955,337869,1731081,1770071,719576,11185,18483,13478,26.728207,293.093137,10.965686
1,proofreading_text,299841,343928,1735367,1779520,721692,11806,17987,13022,25.39734,293.961765,11.57451
2,lex_text,298309,342109,1731138,1775004,718393,11887,17967,13009,25.095398,292.459804,11.653922
3,connectives_text,293866,337370,1709162,1752732,708869,11825,17765,12856,24.851247,288.103922,11.593137
4,expressions_text,260410,301186,1513287,1554127,628319,11828,15968,11617,22.016402,255.303922,11.596078
5,sentence_splitter_text,268622,310818,1563990,1606248,649634,15964,16091,11713,16.826735,263.354902,15.65098
6,nominalizations_text,267334,309525,1552643,1594895,646854,15959,16598,11997,16.7513,262.092157,15.646078
7,verbs_text,269516,311857,1566499,1608899,651025,15948,16743,12089,16.899674,264.231373,15.635294
8,sentence_reorganizer_text,271345,312987,1576724,1618428,655350,15829,16653,12005,17.142271,266.02451,15.518627


# POS

In [6]:
# Absolute part-of-speech counts per corpus variant.
# Output column label -> metrics column that is summed (order = column order).
pos_count_columns = {
  'Altro':         'n_other',
  'Nomi':          'n_nouns',
  'Verbi':         'n_verbs',
  'Numeri':        'n_number',
  'Simboli':       'n_symbols',
  'Avverbi':       'n_adverbs',
  'Articoli':      'n_articles',
  'Pronomi':       'n_pronouns',
  'Particelle':    'n_particles',
  'Agettivi':      'n_adjectives',
  'Preposizioni':  'n_prepositions',
  'Nomi propri':   'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni':  'n_interjections',
  'Cong. coord.':  'n_coordinating_conjunctions',
  'Cong. sub.':    'n_subordinating_conjunctions',
}

d = []
for TEXT, df in metrics_dfs.items():
  row = {'Corpus': TEXT}
  for label, column in pos_count_columns.items():
    row[label] = df[column].sum()
  d.append(row)

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,text,242,90639,30839,7019,460,7507,28559,6424,0,34545,68718,9325,38393,0,13756,1442
1,proofreading_text,238,90624,30819,7153,470,7447,29060,6313,0,34575,69080,9292,43544,0,13888,1425
2,lex_text,240,89774,30798,6662,471,7447,29058,6310,0,34507,68921,9267,43337,0,13892,1425
3,connectives_text,226,87026,31875,6644,468,7621,30078,6259,0,34293,65359,9216,43045,0,13585,1675
4,expressions_text,189,77211,29973,6471,469,6004,28225,4536,0,30460,54294,8831,40286,2,12641,1594
5,sentence_splitter_text,188,79041,33532,6470,468,6614,30272,5296,0,30847,54035,8921,41673,2,11992,1467
6,nominalizations_text,185,73840,39408,6468,469,7291,31009,6058,0,30169,50097,8908,41669,2,12002,1950
7,verbs_text,184,76598,35856,6471,466,7160,35043,7208,0,29890,47817,8980,41820,2,12164,2197
8,sentence_reorganizer_text,182,77564,35790,6464,467,7170,36083,6885,0,29956,47854,9066,41122,3,12209,2171


In [7]:
# Part-of-speech shares per corpus variant: for each tag, the per-row ratio
# count / n_tokens_all is averaged over rows and shown as a percentage.
# Output column label -> metrics column used as numerator.
pos_share_columns = {
  'Altro':              'n_other',
  'Nomi':               'n_nouns',
  'Verbi':              'n_verbs',
  'Numeri':             'n_number',
  'Simboli':            'n_symbols',
  'Avverbi':            'n_adverbs',
  'Articoli':           'n_articles',
  'Pronomi':            'n_pronouns',
  'Particelle':         'n_particles',
  'Agettivi':           'n_adjectives',
  'Preposizioni':       'n_prepositions',
  'Nomi propri':        'n_proper_nouns',
  'Punteggiatura':      'n_punctuations',
  'Interiezioni':       'n_interjections',
  'Cong. coordinati':   'n_coordinating_conjunctions',
  'Cong. subordiante':  'n_subordinating_conjunctions',
}

d = []
for TEXT, df in metrics_dfs.items():
  all_tokens = df['n_tokens_all']
  row = {'Corpus': TEXT}
  for label, column in pos_share_columns.items():
    share = df[column] / all_tokens
    row[label] = round(share.mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,text,0.07,27.28,9.44,1.94,0.11,1.99,8.48,1.91,0.0,9.78,20.39,2.72,11.31,0.0,4.17,0.42
1,proofreading_text,0.06,26.77,9.26,1.94,0.11,1.92,8.48,1.84,0.0,9.6,20.19,2.64,12.67,0.0,4.12,0.41
2,lex_text,0.07,26.69,9.29,1.85,0.11,1.93,8.51,1.85,0.0,9.62,20.25,2.63,12.67,0.0,4.13,0.41
3,connectives_text,0.06,26.21,9.81,1.86,0.12,2.07,8.94,1.84,0.0,9.72,19.4,2.64,12.77,0.0,4.09,0.49
4,expressions_text,0.06,25.97,10.44,1.97,0.12,1.91,9.54,1.5,0.0,9.63,18.15,2.74,13.16,0.0,4.25,0.56
5,sentence_splitter_text,0.06,25.75,11.36,1.9,0.12,2.1,10.0,1.67,0.0,9.38,17.42,2.67,13.24,0.0,3.85,0.49
6,nominalizations_text,0.06,23.87,13.53,1.9,0.12,2.37,10.33,2.07,0.0,9.18,16.05,2.68,13.28,0.0,3.87,0.69
7,verbs_text,0.06,24.72,12.15,1.88,0.11,2.31,11.85,2.47,0.0,8.98,14.97,2.66,13.16,0.0,3.92,0.77
8,sentence_reorganizer_text,0.06,25.03,12.04,1.87,0.11,2.29,12.25,2.32,0.0,8.97,14.93,2.68,12.76,0.0,3.92,0.76


# Passive

In [8]:
# Total number of active and passive verbs per corpus variant.
d = []
for TEXT, df in metrics_dfs.items():
  active_total = df['n_active_verbs'].sum()
  passive_total = df['n_passive_verbs'].sum()
  row = {'Corpus': TEXT}
  row['Verbi attivi'] = active_total
  row['Verbi passivi'] = passive_total
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,24500,6339
1,proofreading_text,24426,6393
2,lex_text,24405,6393
3,connectives_text,25518,6357
4,expressions_text,24180,5793
5,sentence_splitter_text,25978,7554
6,nominalizations_text,32404,7004
7,verbs_text,33054,2802
8,sentence_reorganizer_text,33113,2677


In [9]:
def _mean_verb_share(frame, column):
  """Mean per-row ratio of `column` to `n_verbs`, as a percentage (2 decimals)."""
  ratio = frame[column] / frame['n_verbs']
  # NaN ratios (such as 0/0) are counted as 0 rather than skipped by mean().
  return round(ratio.fillna(0).mean() * 100, 2)


# Average share of active / passive verbs per corpus variant.
d = []
for TEXT, df in metrics_dfs.items():
  row = {'Corpus': TEXT}
  row['Verbi attivi'] = _mean_verb_share(df, 'n_active_verbs')
  row['Verbi passivi'] = _mean_verb_share(df, 'n_passive_verbs')
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,80.3,19.51
1,proofreading_text,80.12,19.68
2,lex_text,80.11,19.69
3,connectives_text,81.06,18.74
4,expressions_text,81.57,18.34
5,sentence_splitter_text,77.48,22.42
6,nominalizations_text,82.67,17.33
7,verbs_text,92.91,6.99
8,sentence_reorganizer_text,93.37,6.53


# NVdB

In [10]:
# Totals of the `n_vdb*` counters per corpus variant.
# Output column label -> metrics column that is summed.
vdb_count_columns = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  row = {'Corpus': TEXT}
  for label, column in vdb_count_columns.items():
    row[label] = df[column].sum()
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,216430,180317,37107,33712
1,proofreading_text,217105,180947,37171,33669
2,lex_text,216290,180194,37109,33650
3,connectives_text,215236,180323,35533,32287
4,expressions_text,193619,162363,31801,27188
5,sentence_splitter_text,201459,169496,32556,26788
6,nominalizations_text,203750,172671,31709,25966
7,verbs_text,206514,174716,32429,25555
8,sentence_reorganizer_text,208197,176078,32761,25606


In [11]:
# Average share of the `n_vdb*` counters over `n_tokens` per corpus variant:
# per-row ratio, NaN treated as 0, averaged and shown as a percentage.
vdb_share_columns = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  tokens = df['n_tokens']
  row = {'Corpus': TEXT}
  for label, column in vdb_share_columns.items():
    ratio = (df[column] / tokens).fillna(0)
    row[label] = round(ratio.mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,72.09,59.68,12.7,11.22
1,proofreading_text,72.11,59.71,12.71,11.17
2,lex_text,72.18,59.74,12.75,11.21
3,connectives_text,72.91,60.72,12.37,10.88
4,expressions_text,74.46,62.4,12.23,10.47
5,sentence_splitter_text,75.18,63.22,12.14,9.92
6,nominalizations_text,76.67,65.13,11.75,9.61
7,verbs_text,77.33,65.57,11.96,9.35
8,sentence_reorganizer_text,77.49,65.67,12.03,9.3


## Expressions

In [12]:
# Total occurrences of difficult connectives and latinisms per corpus variant.
d = []
for TEXT, df in metrics_dfs.items():
  connectives_total = df['n_difficult_connectives'].sum()
  latinisms_total = df['n_latinisms'].sum()
  row = {'Corpus': TEXT}
  row['difficult_connectives'] = connectives_total
  row['latinisms'] = latinisms_total
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,difficult_connectives,latinisms
0,text,3772,23
1,proofreading_text,3854,23
2,lex_text,3856,23
3,connectives_text,380,18
4,expressions_text,771,5
5,sentence_splitter_text,755,5
6,nominalizations_text,711,5
7,verbs_text,733,5
8,sentence_reorganizer_text,723,5


In [13]:
# Number of distinct difficult connectives and latinisms per corpus variant,
# taken from the raw data rather than the metrics table (`df` is unused here).
d = []
for TEXT, df in metrics_dfs.items():
  entries = raw_data[TEXT]
  distinct_connectives = merge_sets([set(entry['difficult_connectives']) for entry in entries])
  distinct_latinisms = merge_sets([set(entry['latinisms']) for entry in entries])
  row = {'Corpus': TEXT}
  row['unique_difficult_connectives'] = len(distinct_connectives)
  row['unique_latinisms'] = len(distinct_latinisms)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,unique_difficult_connectives,unique_latinisms
0,text,402,10
1,proofreading_text,399,10
2,lex_text,399,10
3,connectives_text,81,9
4,expressions_text,139,4
5,sentence_splitter_text,148,4
6,nominalizations_text,145,4
7,verbs_text,139,4
8,sentence_reorganizer_text,138,4


# Readability

In [14]:
# Mean readability / lexical indices per corpus variant, rounded to 2 decimals.
# Output column label -> metrics column that is averaged.
readability_columns = {
  'ttr':             'ttr',
  'gulpease_index':  'gulpease',
  'flesch_vacca':    'flesch_vacca',
  'lexical_density': 'lexical_density',
}

d = []
for TEXT, df in metrics_dfs.items():
  row = {'Corpus': TEXT}
  for label, column in readability_columns.items():
    row[label] = round(df[column].mean(), 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,text,65.41,42.73,19.05,0.55
1,proofreading_text,65.1,43.61,20.52,0.55
2,lex_text,65.09,43.63,20.9,0.54
3,connectives_text,65.52,43.65,20.8,0.55
4,expressions_text,66.74,45.01,23.76,0.55
5,sentence_splitter_text,66.64,49.43,31.02,0.56
6,nominalizations_text,67.22,49.76,31.06,0.57
7,verbs_text,66.04,49.47,31.43,0.56
8,sentence_reorganizer_text,65.15,49.11,31.11,0.55


# Semantic similarity

In [15]:
# Mean `semantic_similarity` of every rewritten variant; the 'text' entry
# itself is skipped and the remaining rows are labelled "original vs ...".
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    mean_similarity = df['semantic_similarity'].mean()
    row = {'Corpus': f'original vs {TEXT}'}
    row['semantic_similarity'] = round(mean_similarity, 2)
    d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,original vs proofreading_text,99.63
1,original vs lex_text,99.52
2,original vs connectives_text,99.37
3,original vs expressions_text,98.5
4,original vs sentence_splitter_text,98.35
5,original vs nominalizations_text,98.18
6,original vs verbs_text,97.9
7,original vs sentence_reorganizer_text,97.82


# Distance

In [16]:
# Edit-distance and token-change totals for every variant except 'text'.
# The "%_..." columns give the fraction of added / deleted tokens that are
# counted by the matching `*_vdb_tokens` column, as a percentage.
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT == 'text':
    continue

  added = df['n_added_tokens'].sum()
  added_vdb = df['n_added_vdb_tokens'].sum()
  deleted = df['n_deleted_tokens'].sum()
  deleted_vdb = df['n_deleted_vdb_tokens'].sum()

  row = {'Corpus': f'original vs {TEXT}'}
  row['editdistance'] = df['editdistance'].sum()
  row['added_tokens'] = added
  row['added_vdb_tokens'] = added_vdb
  row['%_added_vdb_tokens'] = round(added_vdb / added * 100, 2)
  row['deleted_tokens'] = deleted
  row['deleted_vdb_tokens'] = deleted_vdb
  row['%_deleted_vdb_tokens'] = round(deleted_vdb / deleted * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,original vs proofreading_text,29057,4448,2657,59.73,5302,2953,55.7
1,original vs lex_text,41762,5910,3097,52.4,7917,3988,50.37
2,original vs connectives_text,105146,11959,8377,70.05,16740,10333,61.73
3,original vs expressions_text,446859,31682,24821,78.34,52697,36400,69.07
4,original vs sentence_splitter_text,496369,39141,31798,81.24,53324,36814,69.04
5,original vs nominalizations_text,550938,47403,38223,80.63,60851,41332,67.92
6,original vs verbs_text,614708,51958,42542,81.88,66055,45643,69.1
7,original vs sentence_reorganizer_text,656257,52215,42901,82.16,66290,45834,69.14


In [17]:
# Normalised distance metrics for every variant except 'text', as mean
# per-row percentages.
# Output column label -> metrics column divided by the variant's n_tokens.
token_change_columns = {
  'added_tokens':       'n_added_tokens',
  'added_vdb_tokens':   'n_added_vdb_tokens',
  'deleted_tokens':     'n_deleted_tokens',
  'deleted_vdb_tokens': 'n_deleted_vdb_tokens',
}

d = []
for TEXT, df in metrics_dfs.items():
  if TEXT == 'text':
    continue

  # Edit distance is divided, row by row, by the larger of the two character
  # counts (the 'text' frame vs this variant); rows are aligned on the index.
  char_counts = pd.concat([metrics_dfs['text']['n_chars'], df['n_chars']], axis=1)
  longest_chars = char_counts.max(axis=1)

  # NOTE(review): all token shares, including the deleted ones, are divided by
  # this variant's n_tokens rather than the 'text' frame's; confirm intended.
  variant_tokens = df['n_tokens']

  row = {'Corpus': f'original vs {TEXT}'}
  row['editdistance'] = round((df['editdistance'] / longest_chars).mean() * 100, 2)
  for label, column in token_change_columns.items():
    row[label] = round((df[column] / variant_tokens).mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,original vs proofreading_text,2.04,2.11,1.31,2.22,1.29
1,original vs lex_text,2.86,2.73,1.53,3.23,1.7
2,original vs connectives_text,6.64,5.65,3.93,11.16,7.48
3,original vs expressions_text,23.41,14.83,11.64,28.68,20.04
4,original vs sentence_splitter_text,27.06,17.54,14.28,21.3,14.55
5,original vs nominalizations_text,30.98,21.66,17.64,25.25,16.88
6,original vs verbs_text,35.44,23.64,19.62,27.22,18.54
7,original vs sentence_reorganizer_text,38.28,23.52,19.6,26.97,18.38
