## Setup

In [1]:
import json
import pandas as pd

# Utils

In [2]:
def merge_sets(sets):
  """Return the union of every collection in ``sets`` as a new set.

  Args:
    sets: Iterable of sets (any iterables of hashable items are accepted).

  Returns:
    A new ``set`` holding each element that appears in at least one input;
    an empty set when ``sets`` is empty. The inputs are left unmodified.
  """
  # One variadic union builds the result in a single pass. The previous
  # `merged = merged.union(s)` loop re-copied the growing accumulator on
  # every iteration, which is quadratic in the size of the result.
  return set().union(*sets)

## Load datasets

In [3]:
# Names of the corpus variants to analyse. 'text' is the one later cells
# treat as the original; the rest are presumably the outputs of successive
# rewriting steps (TODO confirm). Order here is the row order of every table.
TEXTS = [
  'text',
  'proofreading_text',
  'lex_text',
  'connectives_text',
  'expressions_text',
  'sentence_splitter_text',
  'nominalizations_text',
  'verbs_text',
  'sentence_reorganizer_text',
]

In [4]:
# Per-variant inputs, keyed by the names in TEXTS (insertion order preserved):
#   metrics_dfs[name] -> DataFrame read from '<name>_metrics.csv'
#   raw_data[name]    -> parsed content of '<name>_raw_data.json'
metrics_dfs = {}
raw_data = {}

for TEXT in TEXTS:
    metrics_dfs[TEXT] = pd.read_csv(f'./metrics/corpus_test/{TEXT}_metrics.csv')
    # Use a context manager so the file handle is closed right after parsing
    # instead of being left open until garbage collection. JSON is read as
    # UTF-8 explicitly so the result does not depend on the platform locale.
    with open(f'./metrics/corpus_test/{TEXT}_raw_data.json', encoding='utf-8') as raw_file:
        raw_data[TEXT] = json.load(raw_file)

# Basic

In [5]:
# One summary row per corpus variant: size totals, vocabulary size (distinct
# tokens / lemmas across all raw records) and per-sentence / per-row averages.
d = []
for TEXT, df in metrics_dfs.items():
  records = raw_data[TEXT]
  token_total = df['n_tokens'].sum()
  sentence_total = df['n_sentences'].sum()
  row_count = df.shape[0]  # each metrics row is reported as one "paragrafo"

  distinct_tokens = merge_sets([set(record['tokens']) for record in records])
  distinct_lemmas = merge_sets([set(record['lemmas']) for record in records])

  d.append({
    'Corpus':               TEXT,
    'Tokens':               token_total,
    'Tokens (con punteg.)': df['n_tokens_all'].sum(),
    'Caratteri':            df['n_chars'].sum(),
    'Caratteri (con punt)': df['n_chars_all'].sum(),
    'Sillabe':              df['n_syllables'].sum(),
    'Frasi':                sentence_total,
    'Types':                len(distinct_tokens),
    'Lemmi':                len(distinct_lemmas),
    'Tokens per frase':     token_total / sentence_total,
    'Tokens per paragrafo': token_total / row_count,
    'Frasi per paragrafo':  sentence_total / row_count,
  })

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per paragrafo,Frasi per paragrafo
0,text,13720,15447,80844,82571,33612,490,3455,2559,28.0,240.701754,8.596491
1,proofreading_text,13752,15648,80850,82746,33647,503,3427,2537,27.33996,241.263158,8.824561
2,lex_text,13729,15620,80705,82596,33549,504,3415,2526,27.240079,240.859649,8.842105
3,connectives_text,13671,15562,80402,82293,33411,504,3422,2529,27.125,239.842105,8.842105
4,expressions_text,13208,15077,77672,79541,32293,505,3345,2473,26.154455,231.719298,8.859649
5,sentence_splitter_text,13531,15478,79747,81695,33138,706,3371,2491,19.165722,237.385965,12.385965
6,nominalizations_text,13471,15419,79188,81137,32984,707,3410,2507,19.053748,236.333333,12.403509
7,verbs_text,13433,15391,78978,80937,32835,707,3415,2509,19.0,235.666667,12.403509
8,sentence_reorganizer_text,13422,15350,78880,80809,32795,707,3402,2499,18.984441,235.473684,12.403509


# Part-of-speech (POS) counts

In [6]:
# Display label -> metrics column that is summed for that label. The column
# names suggest per-row part-of-speech counts.
# NOTE(review): 'Agettivi' looks like a typo for 'Aggettivi'; the label is
# kept unchanged so the table matches the recorded output.
POS_TOTAL_COLUMNS = {
  'Altro':         'n_other',
  'Nomi':          'n_nouns',
  'Verbi':         'n_verbs',
  'Numeri':        'n_number',
  'Simboli':       'n_symbols',
  'Avverbi':       'n_adverbs',
  'Articoli':      'n_articles',
  'Pronomi':       'n_pronouns',
  'Particelle':    'n_particles',
  'Agettivi':      'n_adjectives',
  'Preposizioni':  'n_prepositions',
  'Nomi propri':   'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni':  'n_interjections',
  'Cong. coord.':  'n_coordinating_conjunctions',
  'Cong. sub.':    'n_subordinating_conjunctions',
}

# One row per corpus variant with the column total for every label above.
d = [
  {
    'Corpus': TEXT,
    **{label: df[column].sum() for label, column in POS_TOTAL_COLUMNS.items()},
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,text,10,4247,1346,233,15,350,1312,248,0,1743,3194,325,1710,0,652,62
1,proofreading_text,8,4267,1334,245,15,352,1309,247,0,1744,3207,320,1877,0,661,62
2,lex_text,8,4252,1335,243,15,352,1308,248,0,1740,3204,319,1873,0,661,62
3,connectives_text,8,4202,1358,243,15,351,1331,249,0,1738,3142,321,1873,0,662,69
4,expressions_text,8,4055,1336,244,15,326,1331,231,0,1683,2953,319,1851,0,657,68
5,sentence_splitter_text,8,4139,1490,245,15,353,1447,245,0,1685,2924,324,1929,0,614,60
6,nominalizations_text,8,3953,1700,245,15,368,1488,257,0,1663,2785,323,1930,0,614,70
7,verbs_text,8,3995,1581,247,15,365,1573,247,0,1653,2735,345,1940,0,617,70
8,sentence_reorganizer_text,8,3989,1579,245,15,364,1585,245,0,1647,2730,350,1910,0,615,68


In [7]:
# Display label -> metrics column used as the numerator of the share.
# NOTE(review): 'Agettivi' and 'Cong. subordiante' look like typos; the labels
# are kept unchanged so the table matches the recorded output.
POS_SHARE_COLUMNS = {
  'Altro':               'n_other',
  'Nomi':                'n_nouns',
  'Verbi':               'n_verbs',
  'Numeri':              'n_number',
  'Simboli':             'n_symbols',
  'Avverbi':             'n_adverbs',
  'Articoli':            'n_articles',
  'Pronomi':             'n_pronouns',
  'Particelle':          'n_particles',
  'Agettivi':            'n_adjectives',
  'Preposizioni':        'n_prepositions',
  'Nomi propri':         'n_proper_nouns',
  'Punteggiatura':       'n_punctuations',
  'Interiezioni':        'n_interjections',
  'Cong. coordinati':    'n_coordinating_conjunctions',
  'Cong. subordiante':   'n_subordinating_conjunctions',
}

d = []
for TEXT, df in metrics_dfs.items():
  all_tokens = df['n_tokens_all']
  row = {'Corpus': TEXT}
  for label, column in POS_SHARE_COLUMNS.items():
    # Mean of the per-row ratios (not the ratio of the corpus totals),
    # expressed as a percentage with two decimals.
    row[label] = round((df[column] / all_tokens).mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,text,0.09,28.52,8.61,1.38,0.07,2.08,8.1,1.76,0.0,10.55,21.42,1.97,10.77,0.0,4.28,0.4
1,proofreading_text,0.08,28.18,8.37,1.42,0.07,2.06,7.92,1.7,0.0,10.41,21.2,1.9,11.97,0.0,4.33,0.39
2,lex_text,0.08,28.15,8.38,1.42,0.07,2.07,7.93,1.7,0.0,10.41,21.21,1.9,11.96,0.0,4.34,0.39
3,connectives_text,0.08,27.81,8.67,1.43,0.07,2.05,8.27,1.72,0.0,10.45,20.7,1.92,12.01,0.0,4.36,0.46
4,expressions_text,0.08,27.59,8.98,1.47,0.07,2.02,8.54,1.68,0.0,10.43,19.96,1.97,12.31,0.0,4.46,0.46
5,sentence_splitter_text,0.07,27.41,9.95,1.41,0.07,2.19,9.23,1.67,0.0,10.04,19.06,1.92,12.59,0.0,3.99,0.4
6,nominalizations_text,0.08,25.8,12.06,1.42,0.07,2.29,9.55,2.0,0.0,9.92,17.74,1.9,12.62,0.0,4.01,0.56
7,verbs_text,0.08,26.17,11.24,1.42,0.07,2.28,10.49,1.86,0.0,9.87,17.26,2.08,12.6,0.0,4.04,0.56
8,sentence_reorganizer_text,0.08,26.26,11.24,1.41,0.07,2.27,10.63,1.84,0.0,9.88,17.25,2.06,12.41,0.0,4.03,0.55


# Active vs. passive verbs

In [8]:
# Total active and passive verb counts for every corpus variant.
VOICE_TOTAL_COLUMNS = {
  'Verbi attivi':   'n_active_verbs',
  'Verbi passivi':  'n_passive_verbs',
}

d = [
  {
    'Corpus': TEXT,
    **{label: df[column].sum() for label, column in VOICE_TOTAL_COLUMNS.items()},
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,1077,269
1,proofreading_text,1063,271
2,lex_text,1064,271
3,connectives_text,1087,271
4,expressions_text,1071,265
5,sentence_splitter_text,1169,321
6,nominalizations_text,1369,331
7,verbs_text,1348,233
8,sentence_reorganizer_text,1346,233


In [9]:
def _verb_share(frame, column):
  """Mean per-row percentage of `n_verbs` accounted for by `column`."""
  per_row = frame[column] / frame['n_verbs']
  # NaN ratios (e.g. 0/0 for a row without verbs) are counted as 0 instead of
  # being skipped by mean().
  return round(per_row.fillna(0).mean() * 100, 2)


# Share of active and passive verbs for every corpus variant.
d = [
  {
    'Corpus':         TEXT,
    'Verbi attivi':   _verb_share(df, 'n_active_verbs'),
    'Verbi passivi':  _verb_share(df, 'n_passive_verbs'),
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,84.17,15.83
1,proofreading_text,83.54,16.46
2,lex_text,83.54,16.46
3,connectives_text,83.5,16.5
4,expressions_text,84.27,15.73
5,sentence_splitter_text,80.61,19.39
6,nominalizations_text,82.26,17.74
7,verbs_text,87.86,12.14
8,sentence_reorganizer_text,87.79,12.21


# NVdB (Nuovo Vocabolario di Base)

In [10]:
# Display label -> metrics column that is summed.
# Presumably FO / AU / AD are the bands of the basic-vocabulary list and
# 'n_vdb' the overall count — TODO confirm against the metrics extractor.
VDB_TOTAL_COLUMNS = {
  'ALL':      'n_vdb',
  'FO':       'n_vdb_fo',
  'AU':       'n_vdb_au',
  'AD':       'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  totals = {label: df[column].sum() for label, column in VDB_TOTAL_COLUMNS.items()}
  d.append({'Corpus': TEXT, **totals})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,9965,8186,1877,1563
1,proofreading_text,9983,8198,1882,1578
2,lex_text,9970,8185,1882,1579
3,connectives_text,9974,8205,1851,1564
4,expressions_text,9665,7960,1784,1491
5,sentence_splitter_text,9974,8249,1807,1471
6,nominalizations_text,10068,8375,1776,1454
7,verbs_text,10011,8306,1790,1419
8,sentence_reorganizer_text,10015,8312,1789,1416


In [11]:
# Display label -> metrics column used as the numerator; the denominator is
# always 'n_tokens'.
VDB_SHARE_COLUMNS = {
  'ALL':    'n_vdb',
  'FO':     'n_vdb_fo',
  'AU':     'n_vdb_au',
  'AD':     'n_vdb_ad',
}

d = []
for TEXT, df in metrics_dfs.items():
  tokens = df['n_tokens']
  row = {'Corpus': TEXT}
  for label, column in VDB_SHARE_COLUMNS.items():
    # Mean of the per-row ratios; NaN ratios (e.g. 0/0) count as 0 rather
    # than being skipped by mean().
    ratio = (df[column] / tokens).fillna(0)
    row[label] = round(ratio.mean() * 100, 2)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,72.32,59.02,13.72,11.46
1,proofreading_text,72.28,58.95,13.74,11.63
2,lex_text,72.3,58.95,13.76,11.64
3,connectives_text,72.84,59.68,13.48,11.68
4,expressions_text,73.21,60.3,13.2,11.46
5,sentence_splitter_text,74.03,61.33,12.99,10.99
6,nominalizations_text,75.63,63.22,12.7,10.7
7,verbs_text,75.6,63.11,12.79,10.4
8,sentence_reorganizer_text,75.72,63.25,12.78,10.4


## Expressions

In [12]:
# Corpus-wide totals of the two expression counters, one row per variant.
EXPRESSION_COLUMNS = ('difficult_connectives', 'latinisms')

d = [
  {
    'Corpus': TEXT,
    # Output keys are the bare names; the metrics columns carry an 'n_' prefix.
    **{name: df[f'n_{name}'].sum() for name in EXPRESSION_COLUMNS},
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,difficult_connectives,latinisms
0,text,166,0
1,proofreading_text,167,0
2,lex_text,167,0
3,connectives_text,129,0
4,expressions_text,109,0
5,sentence_splitter_text,97,0
6,nominalizations_text,94,0
7,verbs_text,94,0
8,sentence_reorganizer_text,94,0


In [13]:
def _count_distinct(records, key):
  """Number of distinct values found under `key` across all `records`."""
  per_record = [set(record[key]) for record in records]
  return len(merge_sets(per_record))


# Distinct difficult connectives / latinisms per corpus variant, computed
# from the raw records rather than from the metrics table.
d = []
for TEXT in metrics_dfs:
  records = raw_data[TEXT]
  d.append({
    'Corpus':   TEXT,
    'unique_difficult_connectives': _count_distinct(records, 'difficult_connectives'),
    'unique_latinisms':       _count_distinct(records, 'latinisms'),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,unique_difficult_connectives,unique_latinisms
0,text,77,0
1,proofreading_text,78,0
2,lex_text,78,0
3,connectives_text,63,0
4,expressions_text,57,0
5,sentence_splitter_text,54,0
6,nominalizations_text,54,0
7,verbs_text,54,0
8,sentence_reorganizer_text,54,0


# Readability

In [14]:
# Output label -> metrics column whose per-row values are averaged.
# Only 'gulpease_index' differs from its source column name ('gulpease').
READABILITY_COLUMNS = {
  'ttr':                'ttr',
  'gulpease_index':     'gulpease',
  'flesch_vacca':       'flesch_vacca',
  'lexical_density':    'lexical_density',
}

# Mean of each index over the rows of a variant, rounded to two decimals.
d = [
  {
    'Corpus': TEXT,
    **{label: round(df[column].mean(), 2) for label, column in READABILITY_COLUMNS.items()},
  }
  for TEXT, df in metrics_dfs.items()
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,text,66.49,41.37,16.56,0.56
1,proofreading_text,66.04,41.73,17.11,0.56
2,lex_text,66.03,41.72,17.2,0.56
3,connectives_text,66.27,41.78,17.55,0.56
4,expressions_text,66.74,42.21,18.43,0.56
5,sentence_splitter_text,66.42,46.86,28.0,0.57
6,nominalizations_text,66.94,47.19,28.21,0.57
7,verbs_text,66.14,47.28,28.52,0.57
8,sentence_reorganizer_text,65.95,47.37,28.93,0.57


# Semantic similarity

In [15]:
# Mean 'semantic_similarity' per variant. The 'text' entry is skipped: rows
# are labelled as comparisons against the original, which is 'text' itself.
d = [
  {
    'Corpus':               f'original vs {TEXT}',
    'semantic_similarity':  round(df['semantic_similarity'].mean(), 2),
  }
  for TEXT, df in metrics_dfs.items()
  if TEXT != 'text'
]

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,original vs proofreading_text,99.75
1,original vs lex_text,99.68
2,original vs connectives_text,99.53
3,original vs expressions_text,99.14
4,original vs sentence_splitter_text,98.9
5,original vs nominalizations_text,98.75
6,original vs verbs_text,98.51
7,original vs sentence_reorganizer_text,98.47


# Edit distance and token changes

In [16]:
# Corpus-wide edit distance and added/deleted token totals for every variant
# except 'text' (the original the rows are labelled against).
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    added = df['n_added_tokens'].sum()
    added_vdb = df['n_added_vdb_tokens'].sum()
    deleted = df['n_deleted_tokens'].sum()
    deleted_vdb = df['n_deleted_vdb_tokens'].sum()

    d.append({
      'Corpus':                 f'original vs {TEXT}',
      'editdistance':           df['editdistance'].sum(),
      'added_tokens':           added,
      'added_vdb_tokens':       added_vdb,
      # Percentage of the added (resp. deleted) tokens that are "vdb" tokens.
      '%_added_vdb_tokens':     round(added_vdb / added * 100, 2),
      'deleted_tokens':         deleted,
      'deleted_vdb_tokens':     deleted_vdb,
      '%_deleted_vdb_tokens':   round(deleted_vdb / deleted * 100, 2),
    })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,original vs proofreading_text,471,110,54,49.09,143,73,51.05
1,original vs lex_text,761,147,72,48.98,197,100,50.76
2,original vs connectives_text,1716,288,197,68.4,363,216,59.5
3,original vs expressions_text,6650,579,452,78.07,913,635,69.55
4,original vs sentence_splitter_text,10426,990,823,83.13,1057,755,71.43
5,original vs nominalizations_text,12051,1280,1043,81.48,1328,901,67.85
6,original vs verbs_text,14333,1402,1145,81.67,1543,1086,70.38
7,original vs sentence_reorganizer_text,15287,1404,1147,81.7,1570,1100,70.06


In [17]:
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT == 'text':
    continue
  d.append({
    'Corpus':             f'original vs {TEXT}',
    'editdistance':       round((df['editdistance'] / pd.concat([metrics_dfs['text']['n_chars'], df['n_chars']], axis=1).max(axis=1)).mean() * 100, 2),
    'added_tokens':       round((df['n_added_tokens'] / df['n_tokens']).mean() * 100, 2),
    'added_vdb_tokens':   round((df['n_added_vdb_tokens'] /  df['n_tokens']).mean() * 100, 2),
    'deleted_tokens':     round((df['n_deleted_tokens'] /  df['n_tokens']).mean() * 100, 2),
    'deleted_vdb_tokens': round((df['n_deleted_vdb_tokens'] /  df['n_tokens']).mean() * 100, 2),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,original vs proofreading_text,0.74,0.95,0.51,1.22,0.67
1,original vs lex_text,1.02,1.18,0.64,1.56,0.86
2,original vs connectives_text,3.14,3.49,2.69,3.95,2.65
3,original vs expressions_text,9.65,6.98,5.65,9.34,6.73
4,original vs sentence_splitter_text,14.84,10.75,9.22,9.9,7.27
5,original vs nominalizations_text,17.9,14.24,12.08,13.0,8.85
6,original vs verbs_text,21.26,15.43,13.04,15.1,10.55
7,original vs sentence_reorganizer_text,22.84,15.39,13.02,15.25,10.6
