## Setup

In [1]:
import json
import pandas as pd

# Utils

In [2]:
def merge_sets(sets):
  """Return the union of all collections in ``sets`` as a new set.

  Args:
    sets: iterable of sets (any iterables of hashable items are accepted).

  Returns:
    A new ``set`` holding every distinct element; an empty set when ``sets``
    is empty. None of the inputs is modified.
  """
  # A single variadic union avoids rebuilding the accumulated set on every
  # iteration, which is what the previous `merged = merged.union(s)` loop did.
  return set().union(*sets)

## Load datasets

In [3]:
# Corpus names to analyse. Each name is substituted into the
# ./metrics/text/{name}_metrics.csv and ./metrics/text/{name}_raw_data.json
# paths read when the datasets are loaded. The 'text' entry is the one that
# the comparison cells skip when they build their 'original vs ...' rows.
TEXTS = ['text']

In [4]:
# Per-corpus inputs, keyed by the names in TEXTS:
#   metrics_dfs[name] -> DataFrame read from ./metrics/text/{name}_metrics.csv
#   raw_data[name]    -> parsed content of ./metrics/text/{name}_raw_data.json
metrics_dfs = {}
raw_data = {}

for TEXT in TEXTS:
    metrics_dfs[TEXT] = pd.read_csv(f'./metrics/text/{TEXT}_metrics.csv')
    # Use a context manager so the JSON file handle is closed right after
    # parsing instead of being left open until garbage collection.
    with open(f'./metrics/text/{TEXT}_raw_data.json') as raw_file:
        raw_data[TEXT] = json.load(raw_file)

# Basic

In [5]:
def _basic_stats_row(corpus, metrics, records):
  """Build the summary row of size counts and averages for one corpus.

  corpus  -- value stored under 'Corpus'
  metrics -- metrics DataFrame of the corpus; the "per paragrafo" averages
             divide by its number of rows
  records -- raw-data entries of the corpus, each providing 'tokens' and 'lemmas'
  """
  n_tokens = metrics['n_tokens'].sum()
  n_sentences = metrics['n_sentences'].sum()
  n_rows = metrics.shape[0]
  return {
    'Corpus':               corpus,
    'Tokens':               n_tokens,
    'Tokens (con punteg.)': metrics['n_tokens_all'].sum(),
    'Caratteri':            metrics['n_chars'].sum(),
    'Caratteri (con punt)': metrics['n_chars_all'].sum(),
    'Sillabe':              metrics['n_syllables'].sum(),
    'Frasi':                n_sentences,
    # Distinct tokens / lemmas across all entries of the corpus.
    'Types':                len(merge_sets([set(entry['tokens']) for entry in records])),
    'Lemmi':                len(merge_sets([set(entry['lemmas']) for entry in records])),
    'Tokens per frase':     n_tokens / n_sentences,
    'Tokens per paragrafo': n_tokens / n_rows,
    'Frasi per paragrafo':  n_sentences / n_rows,
  }


d = []
for TEXT, df in metrics_dfs.items():
  d.append(_basic_stats_row(TEXT, df, raw_data[TEXT]))

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi,Tokens per frase,Tokens per paragrafo,Frasi per paragrafo
0,text,2697,3042,14587,14932,6041,89,915,752,30.303371,158.647059,5.235294


# POS (parts of speech)

In [6]:
# Output label -> count column of the metrics frame, in display order.
_POS_TOTAL_COLUMNS = {
  'Altro':         'n_other',
  'Nomi':          'n_nouns',
  'Verbi':         'n_verbs',
  'Numeri':        'n_number',
  'Simboli':       'n_symbols',
  'Avverbi':       'n_adverbs',
  'Articoli':      'n_articles',
  'Pronomi':       'n_pronouns',
  'Particelle':    'n_particles',
  'Agettivi':      'n_adjectives',
  'Preposizioni':  'n_prepositions',
  'Nomi propri':   'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni':  'n_interjections',
  'Cong. coord.':  'n_coordinating_conjunctions',
  'Cong. sub.':    'n_subordinating_conjunctions',
}

# One row per corpus: the column-wise total of every count listed above.
d = []
for TEXT, df in metrics_dfs.items():
  totals = {label: df[column].sum() for label, column in _POS_TOTAL_COLUMNS.items()}
  d.append({'Corpus': TEXT, **totals})

pd.DataFrame(d).head(10)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,text,2,761,263,89,2,57,257,50,0,248,577,169,342,0,80,14


In [7]:
# Output label -> count column of the metrics frame, in display order.
_POS_SHARE_COLUMNS = {
  'Altro':               'n_other',
  'Nomi':                'n_nouns',
  'Verbi':               'n_verbs',
  'Numeri':              'n_number',
  'Simboli':             'n_symbols',
  'Avverbi':             'n_adverbs',
  'Articoli':            'n_articles',
  'Pronomi':             'n_pronouns',
  'Particelle':          'n_particles',
  'Agettivi':            'n_adjectives',
  'Preposizioni':        'n_prepositions',
  'Nomi propri':         'n_proper_nouns',
  'Punteggiatura':       'n_punctuations',
  'Interiezioni':        'n_interjections',
  'Cong. coordinati':    'n_coordinating_conjunctions',
  'Cong. subordiante':   'n_subordinating_conjunctions',
}


def _mean_share_of_all_tokens(frame, column):
  """Average over rows of column / n_tokens_all, as a percentage with 2 decimals."""
  per_row_share = frame[column] / frame['n_tokens_all']
  return round(per_row_share.mean() * 100, 2)


d = []
for TEXT, df in metrics_dfs.items():
  shares = {
    label: _mean_share_of_all_tokens(df, column)
    for label, column in _POS_SHARE_COLUMNS.items()
  }
  d.append({'Corpus': TEXT, **shares})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coordinati,Cong. subordiante
0,text,0.05,25.25,8.57,3.04,0.04,1.94,8.95,1.81,0.0,8.06,18.95,5.33,10.72,0.0,2.83,0.67


# Passive

In [8]:
# Total number of active and passive verbs per corpus.
d = []
for TEXT, df in metrics_dfs.items():
  row = {'Corpus': TEXT}
  for label, column in (('Verbi attivi', 'n_active_verbs'),
                        ('Verbi passivi', 'n_passive_verbs')):
    row[label] = df[column].sum()
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,178,19


In [9]:
def _mean_share_of_verbs(frame, column):
  """Average over rows of column / n_verbs as a percentage with 2 decimals.

  Rows whose ratio is NaN are counted as 0 before averaging.
  """
  per_row_share = (frame[column] / frame['n_verbs']).fillna(0)
  return round(per_row_share.mean() * 100, 2)


d = []
for TEXT, df in metrics_dfs.items():
  d.append({
    'Corpus':         TEXT,
    'Verbi attivi':   _mean_share_of_verbs(df, 'n_active_verbs'),
    'Verbi passivi':  _mean_share_of_verbs(df, 'n_passive_verbs'),
  })

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,Verbi attivi,Verbi passivi
0,text,67.77,7.15


# NVdB

In [10]:
# Output label -> n_vdb* count column of the metrics frame, in display order.
_VDB_TOTAL_COLUMNS = {
  'ALL': 'n_vdb',
  'FO':  'n_vdb_fo',
  'AU':  'n_vdb_au',
  'AD':  'n_vdb_ad',
}

# One row per corpus with the total of each n_vdb* column.
d = []
for TEXT, df in metrics_dfs.items():
  vdb_totals = {label: df[column].sum() for label, column in _VDB_TOTAL_COLUMNS.items()}
  d.append({'Corpus': TEXT, **vdb_totals})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,1790,1524,261,262


In [11]:
def _mean_share_of_tokens(frame, column):
  """Average over rows of column / n_tokens as a percentage with 2 decimals.

  Rows whose ratio is NaN are counted as 0 before averaging.
  """
  per_row_share = (frame[column] / frame['n_tokens']).fillna(0)
  return round(per_row_share.mean() * 100, 2)


d = []
for TEXT, df in metrics_dfs.items():
  row = {'Corpus': TEXT}
  for label, column in (('ALL', 'n_vdb'), ('FO', 'n_vdb_fo'),
                        ('AU', 'n_vdb_au'), ('AD', 'n_vdb_ad')):
    row[label] = _mean_share_of_tokens(df, column)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ALL,FO,AU,AD
0,text,67.15,57.24,9.69,9.89


## Expressions

In [12]:
# Total occurrences per corpus; each output key maps to the 'n_<key>' column.
d = []
for TEXT, df in metrics_dfs.items():
  occurrences = {
    key: df[f'n_{key}'].sum()
    for key in ('difficult_connectives', 'latinisms')
  }
  d.append({'Corpus': TEXT, **occurrences})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,difficult_connectives,latinisms
0,text,19,0


In [13]:
# Number of distinct values per corpus, pooled over every raw-data entry.
d = []
for TEXT, df in metrics_dfs.items():
  entries = raw_data[TEXT]
  row = {'Corpus': TEXT}
  for key in ('difficult_connectives', 'latinisms'):
    distinct = merge_sets([set(entry[key]) for entry in entries])
    row[f'unique_{key}'] = len(distinct)
  d.append(row)

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,unique_difficult_connectives,unique_latinisms
0,text,15,0


# Readability

In [14]:
# Output label -> metrics column whose row-wise mean is reported.
_READABILITY_COLUMNS = {
  'ttr':             'ttr',
  'gulpease_index':  'gulpease',
  'flesch_vacca':    'flesch_vacca',
  'lexical_density': 'lexical_density',
}

d = []
for TEXT, df in metrics_dfs.items():
  means = {
    label: round(df[column].mean(), 2)
    for label, column in _READABILITY_COLUMNS.items()
  }
  d.append({'Corpus': TEXT, **means})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,ttr,gulpease_index,flesch_vacca,lexical_density
0,text,69.26,44.41,24.15,49.08


# Semantic similarity

In [15]:
# Mean semantic similarity for every corpus except 'text', which is skipped.
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    mean_similarity = df['semantic_similarity'].mean()
    d.append({
      'Corpus':               f'original vs {TEXT}',
      'semantic_similarity':  round(mean_similarity, 2),
    })

pd.DataFrame(d).head(20)

# Distance

In [16]:
def _share_pct(part, whole):
  """Return part / whole as a percentage rounded to 2 decimals."""
  return round(part / whole * 100, 2)


# Totals of edit distance and added/deleted tokens for every corpus except 'text'.
d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    added = df['n_added_tokens'].sum()
    added_vdb = df['n_added_vdb_tokens'].sum()
    deleted = df['n_deleted_tokens'].sum()
    deleted_vdb = df['n_deleted_vdb_tokens'].sum()
    d.append({
      'Corpus':                 f'original vs {TEXT}',
      'editdistance':           df['editdistance'].sum(),
      'added_tokens':           added,
      'added_vdb_tokens':       added_vdb,
      '%_added_vdb_tokens':     _share_pct(added_vdb, added),
      'deleted_tokens':         deleted,
      'deleted_vdb_tokens':     deleted_vdb,
      '%_deleted_vdb_tokens':   _share_pct(deleted_vdb, deleted),
    })

pd.DataFrame(d).head(20)

In [17]:
def _mean_ratio_pct(ratios):
  """Mean of a Series of ratios, as a percentage rounded to 2 decimals."""
  return round(ratios.mean() * 100, 2)


d = []
for TEXT, df in metrics_dfs.items():
  if TEXT != 'text':
    # Row-wise larger n_chars between the 'text' corpus and this corpus,
    # used as the denominator of the edit distance.
    longest_chars = pd.concat(
      [metrics_dfs['text']['n_chars'], df['n_chars']], axis=1
    ).max(axis=1)
    row = {
      'Corpus':       f'original vs {TEXT}',
      'editdistance': _mean_ratio_pct(df['editdistance'] / longest_chars),
    }
    # Each remaining key is the 'n_<key>' column divided by n_tokens.
    for key in ('added_tokens', 'added_vdb_tokens',
                'deleted_tokens', 'deleted_vdb_tokens'):
      row[key] = _mean_ratio_pct(df[f'n_{key}'] / df['n_tokens'])
    d.append(row)

pd.DataFrame(d).head(20)

# Slide

In [18]:
# Compact per-corpus summary combining a selection of the metrics computed in
# the earlier sections. The accumulator is deliberately NOT called `all`:
# binding that name at notebook level would shadow the builtin all() for every
# cell executed afterwards in the same kernel.
slide_rows = []
for TEXT, df in metrics_dfs.items():
  d = {
    'Corpus':             f'{TEXT}',
    'Tokens':               df['n_tokens'].sum(),
    'Types':                len(merge_sets([set(j['tokens']) for j in raw_data[TEXT]])),
    'Frasi':                df['n_sentences'].sum(),
    # Mean per-row share (in %); NaN ratios count as 0.
    'Verbi attivi':   round((df['n_active_verbs'] / df['n_verbs']).fillna(0).mean() * 100, 2),
    'ALL':    round((df['n_vdb'] / df['n_tokens']).fillna(0).mean() * 100, 2),
    'difficult_connectives':  df['n_difficult_connectives'].sum(),
    'gulpease_index':     round(df['gulpease'].mean(), 2),
    'flesch_vacca':       round(df['flesch_vacca'].mean(), 2)
  }
  # Comparison metrics are only added for corpora other than 'text'.
  if TEXT != 'text':
    d['semantic_similarity'] = round(df['semantic_similarity'].mean(), 2)
    # Edit distance divided by the row-wise larger n_chars of the two corpora.
    d['editdistance'] = round((df['editdistance'] / pd.concat([metrics_dfs['text']['n_chars'], df['n_chars']], axis=1).max(axis=1)).mean() * 100, 2)
  slide_rows.append(d)

pd.DataFrame(slide_rows).head(10)

Unnamed: 0,Corpus,Tokens,Types,Frasi,Verbi attivi,ALL,difficult_connectives,gulpease_index,flesch_vacca
0,text,2697,915,89,67.77,67.15,19,44.41,24.15
