# Metrics overview

## Libs

In [1]:
import json

import pandas as pd

from utils import loaders

## Utils

In [2]:
def merge_sets(sets):
  """Return the union of all collections in ``sets`` as a new set.

  Args:
    sets: Iterable of sets (any iterables of hashable items are accepted).

  Returns:
    A new ``set`` holding every element that appears in at least one input.
    An empty set is returned when ``sets`` is empty. The inputs are not
    modified.
  """
  # One variadic union instead of `merged = merged.union(s)` in a loop, which
  # rebuilt (re-copied) the whole accumulator once per input set.
  return set().union(*sets)

## Load data

In [3]:
# Load the per-corpus metrics used by every cell below. Later cells iterate
# `dfs_maps.items()` as (corpus name, DataFrame) pairs and index `jsons_maps`
# by the same corpus name to reach records that carry 'tokens' and 'lemmas'.
dfs_maps, jsons_maps = loaders.load_metrics_dfs()

Loading metrics...


## Basic

In [4]:
# Corpus-level size statistics: one summary row per corpus.
basic_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  corpus_jsons = jsons_maps[corpus_name]

  # Each of these totals feeds several columns, so compute it only once
  # instead of re-summing the same column for every derived ratio.
  n_tokens = corpus_df['n_tokens'].sum()
  n_sentences = corpus_df['n_sentences'].sum()
  n_documents = corpus_df['topic'].nunique()  # "documento" = one distinct `topic` value
  n_articles = corpus_df.shape[0]             # "articolo" = one row of the frame

  basic_rows.append({
    'Corpus': corpus_name,
    'Tokens': n_tokens,
    'Tokens (con punteg.)': corpus_df['n_tokens_all'].sum(),
    # Plain string key: the former f'n_chars' was an f-string without placeholders.
    'Caratteri': corpus_df['n_chars'].sum(),
    'Caratteri (con punt)': corpus_df['n_chars_all'].sum(),
    'Sillabe': corpus_df['n_syllables'].sum(),
    'Frasi': n_sentences,
    # Distinct tokens / lemmas across all records of the corpus.
    'Types': len(merge_sets([set(j['tokens']) for j in corpus_jsons])),
    'Lemmi': len(merge_sets([set(j['lemmas']) for j in corpus_jsons])),
    'Tokens per frase': n_tokens / n_sentences,
    'Tokens per documento': n_tokens / n_documents,
    'Tokens per articolo': n_tokens / n_articles,
    'Frasi per documento': n_sentences / n_documents,
    'Frasi per articolo': n_sentences / n_articles,
  })

pd.DataFrame(basic_rows).head(20)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per documento,Tokens per articolo,Frasi per documento,Frasi per articolo
0,original,13166,14858,74362,76058,31070,519,3187,2431,25.368015,1645.75,248.415094,64.875,9.792453
1,basic,9171,10562,49082,50475,20786,552,2365,1765,16.61413,1146.375,173.037736,69.0,10.415094
2,mini_basic,9851,11531,53871,55559,22741,600,2491,1856,16.418333,1231.375,185.867925,75.0,11.320755
3,mini_chain0,13129,14901,73932,75707,30937,542,3158,2405,24.223247,1641.125,247.716981,67.75,10.226415
4,mini_chain1,13055,14826,73447,75221,30738,543,3128,2374,24.042357,1631.875,246.320755,67.875,10.245283
5,mini_chain2,13048,14820,73207,74982,30645,543,3101,2356,24.029466,1631.0,246.188679,67.875,10.245283
6,mini_chain3,13315,15141,74973,76801,31389,687,3135,2372,19.381368,1664.375,251.226415,85.875,12.962264
7,mini_chain4,13488,15307,75763,77584,31752,688,3132,2363,19.604651,1686.0,254.490566,86.0,12.981132
8,mini_chain5,13469,15283,75714,77530,31704,689,3139,2365,19.548621,1683.625,254.132075,86.125,13.0
9,mini_chain6,13381,15195,75401,77217,31586,689,3150,2373,19.4209,1672.625,252.471698,86.125,13.0


## Pos

In [5]:
# Display label -> count column in the metrics frame, in display order.
pos_count_columns = {
  'Altro':          'n_other',
  'Nomi':           'n_nouns',
  'Verbi':          'n_verbs',
  'Numeri':         'n_number',
  'Simboli':        'n_symbols',
  'Avverbi':        'n_adverbs',
  'Articoli':       'n_articles',
  'Pronomi':        'n_pronouns',
  'Particelle':     'n_particles',
  'Agettivi':       'n_adjectives',
  'Preposizioni':   'n_prepositions',
  'Nomi propri':    'n_proper_nouns',
  'Punteggiatura':  'n_punctuations',
  'Interiezioni':   'n_interjections',
  'Cong. coord.':   'n_coordinating_conjunctions',
  'Cong. sub.':     'n_subordinating_conjunctions',
}

# Total number of tokens per part-of-speech tag, one row per corpus.
pos_count_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  row = {'Corpus': corpus_name}
  for label, column in pos_count_columns.items():
    row[label] = corpus_df[column].sum()
  pos_count_rows.append(row)

# Show up to 20 rows like every other summary cell; the previous head(10)
# would silently drop corpora as soon as more than ten are loaded.
pd.DataFrame(pos_count_rows).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,original,10,4131,1416,346,10,290,1048,277,0,1587,3096,200,1689,0,668,90
1,basic,7,2666,1357,294,10,317,1082,274,0,873,1529,159,1381,0,466,147
2,mini_basic,10,2928,1479,298,9,274,1093,296,0,963,1748,154,1668,0,477,134
3,mini_chain0,8,4109,1400,365,10,290,1047,276,0,1591,3087,199,1766,0,665,88
4,mini_chain1,8,4077,1395,365,10,285,1062,277,0,1581,3050,199,1765,0,664,88
5,mini_chain2,8,4063,1406,364,10,282,1084,289,0,1570,3027,199,1766,0,663,89
6,mini_chain3,8,4113,1584,360,10,299,1150,325,0,1566,2992,198,1821,0,632,83
7,mini_chain4,8,4176,1585,360,10,299,1261,313,0,1576,2987,198,1814,0,635,85
8,mini_chain5,8,4196,1482,360,10,299,1332,354,0,1566,2945,198,1809,0,636,88
9,mini_chain6,8,4109,1580,360,10,307,1290,362,0,1555,2884,198,1809,0,635,88


In [6]:
# Display label -> count column in the metrics frame, in display order.
pos_share_columns = {
  'Altro':              'n_other',
  'Nomi':               'n_nouns',
  'Verbi':              'n_verbs',
  'Numeri':             'n_number',
  'Simboli':            'n_symbols',
  'Avverbi':            'n_adverbs',
  'Articoli':           'n_articles',
  'Pronomi':            'n_pronouns',
  'Particelle':         'n_particles',
  'Agettivi':           'n_adjectives',
  'Preposizioni':       'n_prepositions',
  'Nomi propri':        'n_proper_nouns',
  'Punteggiatura':      'n_punctuations',
  'Interiezioni':       'n_interjections',
  'Cong. coordinati':   'n_coordinating_conjunctions',
  'Cong. subordiante':  'n_subordinating_conjunctions',
}

# For each corpus: the row-wise ratio of every POS count to `n_tokens_all`,
# averaged over the rows and expressed as a percentage with two decimals.
pos_share_rows = [
  {
    'Corpus': corpus_name,
    **{
      label: round((corpus_df[column] / corpus_df['n_tokens_all']).mean() * 100, 2)
      for label, column in pos_share_columns.items()
    },
  }
  for corpus_name, corpus_df in dfs_maps.items()
]

pd.DataFrame(pos_share_rows).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,original,0.06,27.77,9.78,2.23,0.07,1.85,7.37,2.01,0.0,10.74,20.74,1.34,11.08,0.0,4.3,0.65
1,basic,0.04,25.06,13.19,2.65,0.08,3.1,10.99,2.82,0.0,8.0,14.42,1.44,12.44,0.0,4.24,1.53
2,mini_basic,0.06,25.37,13.49,2.46,0.07,2.4,10.23,2.76,0.0,8.02,15.05,1.28,13.5,0.0,4.05,1.26
3,mini_chain0,0.03,27.51,9.66,2.34,0.06,1.86,7.37,2.0,0.0,10.72,20.63,1.29,11.62,0.0,4.25,0.63
4,mini_chain1,0.03,27.4,9.67,2.36,0.06,1.83,7.57,2.0,0.0,10.75,20.42,1.31,11.69,0.0,4.26,0.63
5,mini_chain2,0.03,27.27,9.77,2.34,0.06,1.8,7.84,2.12,0.0,10.66,20.16,1.32,11.7,0.0,4.27,0.64
6,mini_chain3,0.03,27.03,10.8,2.26,0.06,1.86,8.05,2.46,0.0,10.35,19.51,1.29,11.79,0.0,3.93,0.57
7,mini_chain4,0.03,27.22,10.59,2.25,0.06,1.84,8.84,2.26,0.0,10.34,19.15,1.32,11.59,0.0,3.95,0.57
8,mini_chain5,0.03,27.42,9.94,2.25,0.06,1.82,9.46,2.6,0.0,10.3,18.66,1.29,11.57,0.0,3.98,0.61
9,mini_chain6,0.03,27.01,10.63,2.26,0.06,1.88,9.25,2.67,0.0,10.29,18.35,1.3,11.66,0.0,4.0,0.61


## Passive

In [7]:
# Total active and passive verb counts, one row per corpus.
voice_count_rows = [
  {
    'Corpus': corpus_name,
    'Verbi attivi': corpus_df['n_active_verbs'].sum(),
    'Verbi passivi': corpus_df['n_passive_verbs'].sum(),
  }
  for corpus_name, corpus_df in dfs_maps.items()
]

pd.DataFrame(voice_count_rows).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,original,1030,386
1,basic,1145,212
2,mini_basic,1176,303
3,mini_chain0,1009,391
4,mini_chain1,1012,383
5,mini_chain2,1006,400
6,mini_chain3,1076,508
7,mini_chain4,1100,485
8,mini_chain5,1231,251
9,mini_chain6,1329,251


In [8]:
# Mean row-wise share of active / passive verbs over all verbs, as a percentage.
voice_share_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  n_verbs = corpus_df['n_verbs']

  # A row where the ratio is undefined (NaN, e.g. 0 / 0) is counted as 0
  # rather than being skipped by mean().
  active_share = (corpus_df['n_active_verbs'] / n_verbs).fillna(0)
  passive_share = (corpus_df['n_passive_verbs'] / n_verbs).fillna(0)

  voice_share_rows.append({
    'Corpus': corpus_name,
    'Verbi attivi': round(active_share.mean() * 100, 2),
    'Verbi passivi': round(passive_share.mean() * 100, 2),
  })

pd.DataFrame(voice_share_rows).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,original,73.62,26.38
1,basic,86.09,13.91
2,mini_basic,79.71,20.29
3,mini_chain0,73.3,26.7
4,mini_chain1,73.83,26.17
5,mini_chain2,73.08,26.92
6,mini_chain3,69.84,30.16
7,mini_chain4,72.3,27.7
8,mini_chain5,85.14,14.86
9,mini_chain6,86.13,13.87


## NVdB

In [9]:
# Display label -> NVdB count column in the metrics frame.
nvdb_count_columns = {
  'ALL': 'n_vdb',
  'FO': 'n_vdb_fo',
  'AU': 'n_vdb_au',
  'AD': 'n_vdb_ad',
}

# Column totals per corpus, one row per corpus.
nvdb_count_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  totals = {
    label: corpus_df[column].sum()
    for label, column in nvdb_count_columns.items()
  }
  nvdb_count_rows.append({'Corpus': corpus_name, **totals})

pd.DataFrame(nvdb_count_rows).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,original,9586,7941,1685,1659
1,basic,7393,6450,947,913
2,mini_basic,7818,6788,1047,1014
3,mini_chain0,9566,7945,1660,1655
4,mini_chain1,9563,7979,1615,1639
5,mini_chain2,9612,8035,1607,1634
6,mini_chain3,9881,8289,1621,1606
7,mini_chain4,10060,8449,1648,1614
8,mini_chain5,10062,8460,1636,1622
9,mini_chain6,10027,8446,1622,1598


In [10]:
# (display label, NVdB count column) pairs, in display order.
nvdb_share_columns = (
  ('ALL', 'n_vdb'),
  ('FO', 'n_vdb_fo'),
  ('AU', 'n_vdb_au'),
  ('AD', 'n_vdb_ad'),
)

# Mean row-wise ratio of each NVdB count to `n_tokens`, as a percentage.
nvdb_share_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  row = {'Corpus': corpus_name}
  for label, column in nvdb_share_columns:
    # Undefined ratios (NaN) are counted as 0 instead of being skipped by mean().
    share = (corpus_df[column] / corpus_df['n_tokens']).fillna(0)
    row[label] = round(share.mean() * 100, 2)
  nvdb_share_rows.append(row)

pd.DataFrame(nvdb_share_rows).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,original,72.42,60.02,12.6,11.9
1,basic,80.65,70.45,10.07,9.64
2,mini_basic,79.01,68.92,10.07,9.67
3,mini_chain0,72.7,60.31,12.59,11.91
4,mini_chain1,73.24,61.11,12.23,11.83
5,mini_chain2,73.8,61.79,12.09,11.82
6,mini_chain3,74.39,62.52,11.95,11.33
7,mini_chain4,74.73,62.73,12.1,11.26
8,mini_chain5,75.1,63.07,12.11,11.25
9,mini_chain6,75.35,63.45,12.02,11.14


## Readability

In [11]:
# Display label -> metric column; only the Gulpease label differs from its column name.
readability_columns = {
  'ttr': 'ttr',
  'gulpease_index': 'gulpease',
  'flesch_vacca': 'flesch_vacca',
  'lexical_density': 'lexical_density',
}

# Per-corpus mean of each readability metric, rounded to two decimals (no rescaling).
readability_rows = [
  {
    'Corpus': corpus_name,
    **{
      label: round(corpus_df[column].mean(), 2)
      for label, column in readability_columns.items()
    },
  }
  for corpus_name, corpus_df in dfs_maps.items()
]

pd.DataFrame(readability_rows).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,original,67.14,44.69,25.0,0.56
1,basic,68.6,53.56,41.28,0.57
2,mini_basic,68.42,52.91,39.01,0.57
3,mini_chain0,66.75,45.51,26.63,0.56
4,mini_chain1,66.68,45.69,27.04,0.56
5,mini_chain2,66.38,45.93,27.66,0.56
6,mini_chain3,66.35,49.41,33.76,0.57
7,mini_chain4,65.53,49.2,33.54,0.57
8,mini_chain5,65.23,49.33,33.83,0.56
9,mini_chain6,65.52,49.35,33.51,0.56


## ReadIT

In [12]:
# ReadIT score columns; each is reported under its own column name.
readit_columns = (
  'readit_base',
  'readit_lexical',
  'readit_syntactic',
  'readit_global',
)

# Per-corpus mean of each ReadIT score, multiplied by 100 and rounded to two decimals.
readit_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  row = {'Corpus': corpus_name}
  for column in readit_columns:
    row[column] = round(corpus_df[column].mean() * 100, 2)
  readit_rows.append(row)

pd.DataFrame(readit_rows).head(20)

Unnamed: 0,Corpus,readit_base,readit_lexical,readit_syntactic,readit_global
0,original,77.29,74.31,83.27,92.27
1,basic,44.85,54.94,50.2,45.27
2,mini_basic,45.0,51.83,53.08,47.43
3,mini_chain0,78.66,73.62,82.28,91.97
4,mini_chain1,78.17,72.54,81.25,91.39
5,mini_chain2,77.6,68.36,80.09,89.15
6,mini_chain3,63.17,69.93,61.3,76.93
7,mini_chain4,64.75,64.62,59.72,70.11
8,mini_chain5,64.44,61.72,58.29,60.68
9,mini_chain6,63.76,63.41,58.57,58.98


## Semantic similarity

In [13]:
# Mean `semantic_similarity` of every corpus except 'original', which is the
# baseline named on the left-hand side of each row label.
similarity_rows = [
  {
    'Corpus': f'original vs {corpus_name}',
    'semantic_similarity': round(corpus_df['semantic_similarity'].mean(), 2),
  }
  for corpus_name, corpus_df in dfs_maps.items()
  if corpus_name != 'original'
]

pd.DataFrame(similarity_rows).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,original vs basic,95.49
1,original vs mini_basic,96.04
2,original vs mini_chain0,99.78
3,original vs mini_chain1,99.71
4,original vs mini_chain2,99.54
5,original vs mini_chain3,99.21
6,original vs mini_chain4,99.04
7,original vs mini_chain5,98.84
8,original vs mini_chain6,98.83
9,original vs chain0,99.83


## Distance

In [14]:
# Edit-distance and added/deleted token totals for every corpus except
# 'original', which is the baseline named in each row label.
distance_total_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  if corpus_name != 'original':
    added_vdb_over_added = (
      corpus_df['n_added_vdb_tokens'].sum() / corpus_df['n_added_tokens'].sum()
    )
    deleted_vdb_over_deleted = (
      corpus_df['n_deleted_vdb_tokens'].sum() / corpus_df['n_deleted_tokens'].sum()
    )

    distance_total_rows.append({
      'Corpus': f'original vs {corpus_name}',
      'editdistance': corpus_df['editdistance'].sum(),
      'added_tokens': corpus_df['n_added_tokens'].sum(),
      'added_vdb_tokens': corpus_df['n_added_vdb_tokens'].sum(),
      # Share of the added tokens that are VdB tokens, in percent.
      '%_added_vdb_tokens': round(added_vdb_over_added * 100, 2),
      'deleted_tokens': corpus_df['n_deleted_tokens'].sum(),
      'deleted_vdb_tokens': corpus_df['n_deleted_vdb_tokens'].sum(),
      # Share of the deleted tokens that are VdB tokens, in percent.
      '%_deleted_vdb_tokens': round(deleted_vdb_over_deleted * 100, 2),
    })

pd.DataFrame(distance_total_rows).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,original vs basic,40631,2747,2324,84.6,5248,3460,65.93
1,original vs mini_basic,36625,2615,2157,82.49,4659,3078,66.07
2,original vs mini_chain0,650,145,57,39.31,174,69,39.66
3,original vs mini_chain1,1787,270,169,62.59,369,195,52.85
4,original vs mini_chain2,3076,411,295,71.78,537,291,54.19
5,original vs mini_chain3,5951,740,597,80.68,654,390,59.63
6,original vs mini_chain4,8940,821,676,82.34,716,431,60.2
7,original vs mini_chain5,10731,1075,883,82.14,995,653,65.63
8,original vs mini_chain6,11309,1170,952,81.37,1100,720,65.45
9,original vs chain0,629,115,46,40.0,132,55,41.67


In [15]:
# Normalised distance metrics (mean of row-wise ratios, in percent) for every
# corpus except 'original', which is the baseline named in each row label.
distance_share_rows = []
for corpus_name, corpus_df in dfs_maps.items():
  if corpus_name == 'original':
    continue

  ratios = {
    # Edit distance divided by the larger `n_chars` of the original row and
    # the row with the same index label in this corpus.
    'editdistance': corpus_df['editdistance'] / pd.concat(
      [dfs_maps['original']['n_chars'], corpus_df['n_chars']], axis=1
    ).max(axis=1),
    # NOTE(review): all four token ratios, including the deleted-token ones,
    # are divided by this corpus's own `n_tokens`, not the original's - confirm
    # that this is the intended denominator.
    'added_tokens': corpus_df['n_added_tokens'] / corpus_df['n_tokens'],
    'added_vdb_tokens': corpus_df['n_added_vdb_tokens'] / corpus_df['n_tokens'],
    'deleted_tokens': corpus_df['n_deleted_tokens'] / corpus_df['n_tokens'],
    'deleted_vdb_tokens': corpus_df['n_deleted_vdb_tokens'] / corpus_df['n_tokens'],
  }

  row = {'Corpus': f'original vs {corpus_name}'}
  row.update({
    label: round(ratio.mean() * 100, 2)
    for label, ratio in ratios.items()
  })
  distance_share_rows.append(row)

pd.DataFrame(distance_share_rows).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,original vs basic,52.52,34.47,29.29,56.11,37.2
1,original vs mini_basic,46.97,30.92,25.41,48.01,31.96
2,original vs mini_chain0,0.52,2.08,1.08,2.29,0.95
3,original vs mini_chain1,2.57,3.49,2.35,4.39,2.33
4,original vs mini_chain2,5.09,5.08,3.82,6.42,3.62
5,original vs mini_chain3,9.49,7.95,6.47,7.29,4.42
6,original vs mini_chain4,14.46,8.81,7.32,7.73,4.67
7,original vs mini_chain5,17.04,11.11,9.34,10.35,6.75
8,original vs mini_chain6,17.92,12.04,10.01,11.5,7.47
9,original vs chain0,0.52,1.58,0.81,1.73,0.78
