## Setup

In [1]:
import json
import pandas as pd

## Utils

In [2]:
def merge_sets(sets):
  """Return the union of all the sets in `sets` as a new set.

  Args:
    sets: an iterable of sets (any iterables of hashable items are accepted).

  Returns:
    A new set containing every element that appears in at least one input.
    An empty set is returned when `sets` is empty. The inputs are not modified.
  """
  # One variadic union instead of re-creating the accumulator on every step.
  return set().union(*sets)

## Load datasets

In [3]:
# Names of the corpus variants analysed in this notebook, in display order.
TEXTS = [
    'text',
    'proofreading_text',
    'lex_text',
    'connectives_text',
    'expressions_text',
    'sentence_splitter_text',
    'nominalizations_text',
    'verbs_text',
    'sentence_reorganizer_text',
]

In [4]:
# For every corpus variant, load its metrics table (CSV) and its raw data (JSON).
# Both dictionaries are keyed by the variant name and follow the order of TEXTS.
metrics_dfs = {}
raw_data = {}

for TEXT in TEXTS:
    metrics_dfs[TEXT] = pd.read_csv(f'./metrics/corpus_val/{TEXT}_metrics.csv')
    # Open through a context manager so the file handle is always closed;
    # JSON is read as UTF-8 rather than with the platform default encoding.
    with open(f'./metrics/corpus_val/{TEXT}_raw_data.json', encoding='utf-8') as raw_file:
        raw_data[TEXT] = json.load(raw_file)

# Basic

In [5]:
# Corpus-level size statistics: one row per corpus variant.
d = []
for TEXT, df in metrics_dfs.items():
  records = raw_data[TEXT]
  n_rows = df.shape[0]  # denominator of the "per paragrafo" averages
  tokens_total = df['n_tokens'].sum()
  sentences_total = df['n_sentences'].sum()

  # Vocabulary sizes: distinct values pooled over every record of the corpus.
  distinct_tokens = merge_sets([set(record['tokens']) for record in records])
  distinct_lemmas = merge_sets([set(record['lemmas']) for record in records])

  row = {'Corpus': TEXT}
  row['Tokens'] = tokens_total
  row['Tokens (con punteg.)'] = df['n_tokens_all'].sum()
  row['Caratteri'] = df['n_chars'].sum()
  row['Caratteri (con punt)'] = df['n_chars_all'].sum()
  row['Sillabe'] = df['n_syllables'].sum()
  row['Frasi'] = sentences_total
  row['Types'] = len(distinct_tokens)
  row['Lemmi'] = len(distinct_lemmas)
  row['Tokens per frase'] = tokens_total / sentences_total
  row['Tokens per paragrafo'] = tokens_total / n_rows
  row['Frasi per paragrafo'] = sentences_total / n_rows
  d.append(row)

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per paragrafo,Frasi per paragrafo
0,text,18778,21356,109508,112091,45310,704,4223,3104,26.673295,329.438596,12.350877
1,proofreading_text,18862,21767,109787,112697,45458,768,4189,3068,24.559896,330.912281,13.473684
2,lex_text,18772,21652,109415,112300,45242,773,4169,3051,24.284605,329.333333,13.561404
3,connectives_text,18615,21487,108498,111375,44860,770,4136,3025,24.175325,326.578947,13.508772
4,expressions_text,16286,18996,94796,97512,39225,790,3765,2775,20.61519,285.719298,13.859649
5,sentence_splitter_text,16697,19503,97494,100306,40374,1046,3817,2795,15.962715,292.929825,18.350877
6,nominalizations_text,16552,19350,96542,99346,40095,1045,3877,2823,15.839234,290.385965,18.333333
7,verbs_text,16703,19503,97485,100291,40393,1036,3891,2820,16.122587,293.035088,18.175439
8,sentence_reorganizer_text,16825,19598,98155,100935,40677,1032,3880,2813,16.303295,295.175439,18.105263


# Pos

In [6]:
# Total count of each part-of-speech column, one row per corpus variant.
# Maps the displayed column label to the metrics column that is summed.
pos_total_columns = {
  'Altro':         'n_other',
  'Nomi':          'n_nouns',
  'Verbi':         'n_verbs',
  'Numeri':        'n_number',
  'Simboli':       'n_symbols',
  'Avverbi':       'n_adverbs',
  'Articoli':      'n_articles',
  'Pronomi':       'n_pronouns',
  'Particelle':    'n_particles',
  'Agettivi':      'n_adjectives',
  'Preposizioni':  'n_prepositions',
  'Nomi propri':   'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni':  'n_interjections',
  'Cong. coord.':  'n_coordinating_conjunctions',
  'Cong. sub.':    'n_subordinating_conjunctions',
}

d = [
  {'Corpus': TEXT, **{label: df[column].sum() for label, column in pos_total_columns.items()}}
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,text,16,5694,1823,440,55,475,1688,378,0,2286,4385,603,2526,0,915,72
1,proofreading_text,19,5697,1807,448,49,473,1709,374,0,2296,4434,619,2854,0,922,66
2,lex_text,18,5646,1806,427,49,473,1708,374,0,2288,4420,626,2831,0,920,66
3,connectives_text,17,5513,1881,427,49,500,1787,374,0,2278,4226,628,2823,0,898,86
4,expressions_text,17,4787,1772,434,51,393,1676,276,0,1995,3434,606,2646,0,838,71
5,sentence_splitter_text,16,4901,1968,435,51,430,1810,285,0,2009,3383,612,2739,0,799,65
6,nominalizations_text,16,4578,2304,436,51,470,1860,311,0,1964,3143,609,2731,0,799,78
7,verbs_text,16,4703,2124,436,50,471,2061,433,0,1946,3012,614,2733,0,803,101
8,sentence_reorganizer_text,15,4761,2123,436,50,467,2124,427,0,1949,3014,622,2706,0,806,98


In [7]:
# Average share (in %) of each part-of-speech column over `n_tokens_all`.
# The ratio is taken row by row and then averaged, so every row weighs the same.
pos_share_columns = {
  'Altro':               'n_other',
  'Nomi':                'n_nouns',
  'Verbi':               'n_verbs',
  'Numeri':              'n_number',
  'Simboli':             'n_symbols',
  'Avverbi':             'n_adverbs',
  'Articoli':            'n_articles',
  'Pronomi':             'n_pronouns',
  'Particelle':          'n_particles',
  'Agettivi':            'n_adjectives',
  'Preposizioni':        'n_prepositions',
  'Nomi propri':         'n_proper_nouns',
  'Punteggiatura':       'n_punctuations',
  'Interiezioni':        'n_interjections',
  'Cong. coordinati':    'n_coordinating_conjunctions',
  'Cong. subordiante':   'n_subordinating_conjunctions',
}

d = []
for TEXT, df in metrics_dfs.items():
  all_tokens = df['n_tokens_all']
  row = {'Corpus': TEXT}
  for label, column in pos_share_columns.items():
    row_share = df[column] / all_tokens
    row[label] = round(row_share.mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,text,0.08,27.73,9.04,1.97,0.14,2.07,7.51,1.64,0.0,10.17,20.4,2.27,12.22,0.0,4.41,0.35
1,proofreading_text,0.09,27.16,8.81,1.98,0.12,2.01,7.55,1.61,0.0,9.98,20.33,2.31,13.3,0.0,4.42,0.32
2,lex_text,0.07,27.09,8.87,1.84,0.12,2.02,7.61,1.62,0.0,10.01,20.44,2.37,13.18,0.0,4.43,0.32
3,connectives_text,0.1,26.66,9.34,1.85,0.12,2.11,8.12,1.68,0.0,9.99,19.58,2.39,13.27,0.0,4.35,0.43
4,expressions_text,0.11,26.13,10.06,2.02,0.14,2.0,8.99,1.3,0.0,9.81,18.27,2.52,13.68,0.0,4.55,0.4
5,sentence_splitter_text,0.09,25.97,10.8,1.97,0.14,2.23,9.55,1.26,0.0,9.64,17.42,2.48,13.89,0.0,4.2,0.36
6,nominalizations_text,0.09,24.31,12.72,1.99,0.14,2.47,10.06,1.53,0.0,9.32,16.32,2.47,13.89,0.0,4.25,0.43
7,verbs_text,0.09,24.93,11.74,1.97,0.13,2.44,11.26,2.25,0.0,9.07,15.27,2.46,13.54,0.0,4.23,0.6
8,sentence_reorganizer_text,0.06,25.22,11.64,1.97,0.13,2.41,11.72,2.19,0.0,9.0,15.2,2.48,13.14,0.0,4.26,0.57


# Passive

In [8]:
# Total number of active and passive verbs, one row per corpus variant.
d = [
  {
    'Corpus':         TEXT,
    'Verbi attivi':   df['n_active_verbs'].sum(),
    'Verbi passivi':  df['n_passive_verbs'].sum(),
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,1447,376
1,proofreading_text,1442,365
2,lex_text,1441,365
3,connectives_text,1514,367
4,expressions_text,1391,381
5,sentence_splitter_text,1505,463
6,nominalizations_text,1885,419
7,verbs_text,1962,162
8,sentence_reorganizer_text,1965,158


In [9]:
# Average share (in %) of active and passive verbs over `n_verbs`, computed row by row.
d = []
for TEXT, df in metrics_dfs.items():
  verbs = df['n_verbs']
  # Ratios that come out as NaN are counted as 0 before averaging.
  active_share = (df['n_active_verbs'] / verbs).fillna(0)
  passive_share = (df['n_passive_verbs'] / verbs).fillna(0)
  d.append({
    'Corpus':         TEXT,
    'Verbi attivi':   round(active_share.mean() * 100, 2),
    'Verbi passivi':  round(passive_share.mean() * 100, 2),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,80.29,19.71
1,proofreading_text,81.11,18.89
2,lex_text,81.1,18.9
3,connectives_text,81.84,18.16
4,expressions_text,78.75,21.25
5,sentence_splitter_text,76.76,23.24
6,nominalizations_text,84.07,15.93
7,verbs_text,93.04,6.96
8,sentence_reorganizer_text,93.26,6.74


# NVdB

In [10]:
# Totals of the `n_vdb*` columns, one row per corpus variant.
# Maps the displayed column label to the metrics column that is summed.
vdb_total_columns = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

d = [
  {'Corpus': TEXT, **{label: df[column].sum() for label, column in vdb_total_columns.items()}}
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,13368,11130,2302,2125
1,proofreading_text,13402,11156,2305,2133
2,lex_text,13343,11109,2293,2130
3,connectives_text,13360,11190,2207,2071
4,expressions_text,11900,9950,1988,1727
5,sentence_splitter_text,12296,10319,2014,1709
6,nominalizations_text,12357,10451,1953,1645
7,verbs_text,12542,10598,1989,1626
8,sentence_reorganizer_text,12657,10693,2008,1631


In [11]:
# Average share (in %) of the `n_vdb*` columns over `n_tokens`, computed row by row.
vdb_share_columns = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  tokens = df['n_tokens']
  row = {'Corpus': TEXT}
  for label, column in vdb_share_columns.items():
    # Ratios that come out as NaN are counted as 0 before averaging.
    row_share = (df[column] / tokens).fillna(0)
    row[label] = round(row_share.mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,70.81,58.75,12.52,10.85
1,proofreading_text,71.1,59.0,12.52,10.94
2,lex_text,71.21,59.05,12.58,10.96
3,connectives_text,72.03,60.29,12.05,10.58
4,expressions_text,74.2,62.33,12.2,10.12
5,sentence_splitter_text,74.89,63.07,12.17,9.77
6,nominalizations_text,76.3,64.99,11.74,9.54
7,verbs_text,76.99,65.62,11.79,9.35
8,sentence_reorganizer_text,77.24,65.76,11.87,9.32


# Expressions

In [12]:
# Total occurrences of difficult connectives and latinisms, one row per corpus variant.
d = [
  {
    'Corpus':                 TEXT,
    'difficult_connectives':  df['n_difficult_connectives'].sum(),
    'latinisms':              df['n_latinisms'].sum(),
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,difficult_connectives,latinisms
0,text,233,0
1,proofreading_text,241,0
2,lex_text,241,0
3,connectives_text,29,0
4,expressions_text,47,0
5,sentence_splitter_text,46,0
6,nominalizations_text,46,0
7,verbs_text,53,0
8,sentence_reorganizer_text,52,0


In [13]:
# Number of distinct difficult connectives and latinisms, pooled over all records of each corpus.
d = []
for TEXT, df in metrics_dfs.items():
  records = raw_data[TEXT]
  distinct_connectives = merge_sets([set(record['difficult_connectives']) for record in records])
  distinct_latinisms = merge_sets([set(record['latinisms']) for record in records])
  d.append({
    'Corpus':                       TEXT,
    'unique_difficult_connectives': len(distinct_connectives),
    'unique_latinisms':             len(distinct_latinisms),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,unique_difficult_connectives,unique_latinisms
0,text,96,0
1,proofreading_text,96,0
2,lex_text,96,0
3,connectives_text,15,0
4,expressions_text,23,0
5,sentence_splitter_text,26,0
6,nominalizations_text,27,0
7,verbs_text,28,0
8,sentence_reorganizer_text,27,0


# Readability

In [14]:
# Mean of each readability-related column, rounded to two decimals, one row per corpus variant.
# Maps the displayed column label to the metrics column that is averaged.
readability_columns = {
  'ttr':              'ttr',
  'gulpease_index':   'gulpease',
  'flesch_vacca':     'flesch_vacca',
  'lexical_density':  'lexical_density',
}

d = [
  {'Corpus': TEXT, **{label: round(df[column].mean(), 2) for label, column in readability_columns.items()}}
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,text,64.91,42.71,17.83,0.56
1,proofreading_text,64.48,43.48,19.19,0.55
2,lex_text,64.46,43.47,19.36,0.55
3,connectives_text,64.85,43.61,19.71,0.56
4,expressions_text,66.23,45.39,23.64,0.56
5,sentence_splitter_text,66.11,49.65,30.29,0.57
6,nominalizations_text,66.62,49.94,30.53,0.57
7,verbs_text,65.28,49.34,30.49,0.56
8,sentence_reorganizer_text,64.25,48.88,30.13,0.56


# Semantic similarity

In [15]:
# Mean semantic similarity of every rewritten variant; the 'text' entry itself is skipped.
d = [
  {
    'Corpus':               f'original vs {TEXT}',
    'semantic_similarity':  round(df['semantic_similarity'].mean(), 2),
  }
  for TEXT, df in metrics_dfs.items()
  if TEXT != 'text'
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,original vs proofreading_text,99.63
1,original vs lex_text,99.51
2,original vs connectives_text,99.49
3,original vs expressions_text,98.57
4,original vs sentence_splitter_text,98.45
5,original vs nominalizations_text,98.25
6,original vs verbs_text,97.88
7,original vs sentence_reorganizer_text,97.8


# Distance

In [16]:
# Total edit distance and added/deleted token counts of every rewritten variant
# (the 'text' entry itself is skipped), with the share of those tokens counted in
# the `*_vdb_tokens` columns.
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    added = df['n_added_tokens'].sum()
    added_vdb = df['n_added_vdb_tokens'].sum()
    deleted = df['n_deleted_tokens'].sum()
    deleted_vdb = df['n_deleted_vdb_tokens'].sum()

    row = {'Corpus': f'original vs {TEXT}'}
    row['editdistance'] = df['editdistance'].sum()
    row['added_tokens'] = added
    row['added_vdb_tokens'] = added_vdb
    row['%_added_vdb_tokens'] = round(added_vdb / added * 100, 2)
    row['deleted_tokens'] = deleted
    row['deleted_vdb_tokens'] = deleted_vdb
    row['%_deleted_vdb_tokens'] = round(deleted_vdb / deleted * 100, 2)
    d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,original vs proofreading_text,1663,285,160,56.14,321,190,59.19
1,original vs lex_text,2282,344,177,51.45,447,256,57.27
2,original vs connectives_text,5625,692,495,71.53,889,580,65.24
3,original vs expressions_text,29177,2011,1599,79.51,3247,2266,69.79
4,original vs sentence_splitter_text,31705,2405,1962,81.58,3330,2324,69.79
5,original vs nominalizations_text,34631,2872,2314,80.57,3780,2588,68.47
6,original vs verbs_text,38864,3176,2591,81.58,4127,2891,70.05
7,original vs sentence_reorganizer_text,41118,3198,2631,82.27,4145,2909,70.18


In [17]:
# Average relative distance (in %) of every rewritten variant; the 'text' entry itself is skipped.
original_chars = metrics_dfs['text']['n_chars']

# Token-change columns, each divided row by row by the variant's `n_tokens`.
token_ratio_columns = {
  'added_tokens':       'n_added_tokens',
  'added_vdb_tokens':   'n_added_vdb_tokens',
  'deleted_tokens':     'n_deleted_tokens',
  'deleted_vdb_tokens': 'n_deleted_vdb_tokens',
}

d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    # The edit distance is divided, row by row, by the larger `n_chars` of the
    # 'text' table and of this variant (rows are matched on the index).
    longest_chars = pd.concat([original_chars, df['n_chars']], axis=1).max(axis=1)
    tokens = df['n_tokens']

    row = {'Corpus': f'original vs {TEXT}'}
    row['editdistance'] = round((df['editdistance'] / longest_chars).mean() * 100, 2)
    for label, column in token_ratio_columns.items():
      row[label] = round((df[column] / tokens).mean() * 100, 2)
    d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,original vs proofreading_text,1.66,1.93,1.18,2.37,1.23
1,original vs lex_text,2.43,2.41,1.39,3.38,1.73
2,original vs connectives_text,6.07,5.36,3.95,6.58,4.05
3,original vs expressions_text,23.76,14.6,11.95,20.67,13.72
4,original vs sentence_splitter_text,26.57,17.05,14.3,20.86,13.95
5,original vs nominalizations_text,30.01,21.04,17.55,24.58,16.14
6,original vs verbs_text,35.55,23.53,19.8,27.0,18.15
7,original vs sentence_reorganizer_text,38.47,23.51,19.99,26.75,18.04
