# Dependencies and imports

In [1]:
import json

from tqdm import tqdm
import pandas as pd

In [2]:
# Identifiers of the four text versions compared in this notebook. The same
# strings serve as keys of the loaded data maps and as column-name prefixes.
TEXTS_PREFIXES = ['original', 'reviewer1', 'reviewer2', 'gpt4']

# Utils

In [3]:
def merge_sets(sets):
  """Return the union of all the given sets as one new set.

  Args:
    sets: Iterable of sets (any iterables of hashable items are accepted).

  Returns:
    A new set holding every element found in at least one input; an empty
    set when ``sets`` yields nothing. The inputs are never modified.
  """
  # One variadic union builds the result in a single pass instead of
  # re-copying the accumulated set for every input.
  return set().union(*sets)

# Load Data

In [4]:
def _load_json(path):
  """Parse the JSON file at ``path`` and return the decoded object.

  The file is opened in a ``with`` block so the handle is closed as soon as
  parsing finishes, and it is decoded explicitly as UTF-8 (the same encoding
  used for the CSV files) rather than with the platform default.
  """
  with open(path, 'r', encoding='utf-8') as fp:
    return json.load(fp)


# One metrics table (CSV) and one JSON document per text version, keyed by the
# version prefix. Insertion order follows TEXTS_PREFIXES.
dfs_maps = {
  prefix: pd.read_csv(f"./corpora_with_metrics/{prefix}.csv", encoding='utf-8')
  for prefix in TEXTS_PREFIXES
}
jsons_maps = {
  prefix: _load_json(f"./corpora_with_metrics/{prefix}.json")
  for prefix in TEXTS_PREFIXES
}

# Basic

In [5]:
# Corpus-level size statistics: one record per text version.
# Display label -> suffix of the per-row column that is summed.
size_columns = {
  'Tokens': 'n_tokens',
  'Tokens (con punteg.)': 'n_tokens_all',
  'Caratteri': 'n_chars',
  'Caratteri (con punt)': 'n_chars_all',
  'Sillabe': 'n_syllables',
  'Frasi': 'n_sentences',
}

rows = []
for prefix, frame in dfs_maps.items():
  record = {'Reviewer': prefix}
  for label, suffix in size_columns.items():
    record[label] = frame[f'{prefix}_{suffix}'].sum()

  # Distinct tokens / lemmas across all JSON entries of this version.
  entries = jsons_maps[prefix]
  token_sets = [set(entry[f'{prefix}_tokens']) for entry in entries]
  lemma_sets = [set(entry[f'{prefix}_lemmas']) for entry in entries]
  record['Types'] = len(merge_sets(token_sets))
  record['Lemmi'] = len(merge_sets(lemma_sets))
  rows.append(record)

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi
0,original,33295,37429,191925,196071,79438,1314,5622,4096
1,reviewer1,34135,38193,181872,185945,76008,1506,5270,3640
2,reviewer2,29755,33933,166464,170654,69169,1744,5143,3693
3,gpt4,31722,36017,175147,179442,73110,1840,4930,3376


# POS (parts of speech)

In [6]:
# Absolute count of each part-of-speech class, summed over all rows of a
# version's table. Display label -> column suffix, in display order.
pos_count_columns = {
  'Altro': 'n_other',
  'Nomi': 'n_nouns',
  'Verbi': 'n_verbs',
  'Numeri': 'n_number',
  'Simboli': 'n_symbols',
  'Avverbi': 'n_adverbs',
  'Articoli': 'n_articles',
  'Pronomi': 'n_pronouns',
  'Particelle': 'n_particles',
  'Aggettivi': 'n_adjectives',
  'Preposizioni': 'n_prepositions',
  'Nomi propri': 'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni': 'n_interjections',
  'Cong. coord.': 'n_coordinating_conjunctions',
  'Cong. sub.': 'n_subordinating_conjunctions',
}

d = pd.DataFrame([
  {
    'Reviewer': prefix,
    **{
      label: frame[f'{prefix}_{suffix}'].sum()
      for label, suffix in pos_count_columns.items()
    },
  }
  for prefix, frame in dfs_maps.items()
])
d.head(10)

Unnamed: 0,Reviewer,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Aggettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,original,26,10206,3496,739,32,786,3313,658,0,3630,7786,1055,4089,0,1445,168
1,reviewer1,22,9493,5047,763,34,944,4039,1266,0,2858,6584,1163,4013,0,1543,424
2,reviewer2,22,8891,3975,707,33,796,3838,650,0,2740,5620,1021,4135,0,1196,309
3,gpt4,9,8484,5142,780,36,1035,4400,917,0,2794,5327,954,4264,0,1378,497


In [7]:
# Share of each part-of-speech class, as a percentage of the version's
# `n_tokens_all` total.
# Display label -> column suffix. The two conjunction labels previously
# contained typos; they now use the same labels as the absolute-count table.
pos_share_columns = {
  'Altro': 'n_other',
  'Nomi': 'n_nouns',
  'Verbi': 'n_verbs',
  'Numeri': 'n_number',
  'Simboli': 'n_symbols',
  'Avverbi': 'n_adverbs',
  'Articoli': 'n_articles',
  'Pronomi': 'n_pronouns',
  'Particelle': 'n_particles',
  'Aggettivi': 'n_adjectives',
  'Preposizioni': 'n_prepositions',
  'Nomi propri': 'n_proper_nouns',
  'Punteggiatura': 'n_punctuations',
  'Interiezioni': 'n_interjections',
  'Cong. coord.': 'n_coordinating_conjunctions',
  'Cong. sub.': 'n_subordinating_conjunctions',
}

d = []
for TEXT_PREFIX, df in dfs_maps.items():
  # The denominator is the same for every class, so compute it once.
  n_tokens_all = df[f'{TEXT_PREFIX}_n_tokens_all'].sum()
  row = {'Reviewer': TEXT_PREFIX}
  for label, suffix in pos_share_columns.items():
    row[label] = df[f'{TEXT_PREFIX}_{suffix}'].sum() / n_tokens_all * 100
  d.append(row)

d = pd.DataFrame(d)
d.head(10)

Unnamed: 0,Reviewer,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Aggettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Congiunguonizi coordinati,Cong. subordiante
0,original,0.069465,27.267627,9.340351,1.974405,0.085495,2.099976,8.851425,1.757995,0.0,9.698362,20.802052,2.81867,10.924684,0.0,3.860643,0.44885
1,reviewer1,0.057602,24.85534,13.214463,1.997748,0.089022,2.471657,10.575236,3.314744,0.0,7.483047,17.238761,3.045061,10.507161,0.0,4.040007,1.110151
2,reviewer2,0.064834,26.201633,11.71426,2.083518,0.09725,2.345799,11.310524,1.915539,0.0,8.074736,16.562049,3.00887,12.185778,0.0,3.524593,0.910618
3,gpt4,0.024988,23.555543,14.276592,2.165644,0.099953,2.873643,12.216453,2.54602,0.0,7.757448,14.790238,2.648749,11.838854,0.0,3.825971,1.379904


In [8]:
# Share of each selected part-of-speech class, as a percentage of the combined
# count of these nine classes only (not of all tokens).
selected_classes = (
  'nouns',
  'verbs',
  'adjectives',
  'pronouns',
  'articles',
  'adverbs',
  'prepositions',
  'coordinating_conjunctions',
  'subordinating_conjunctions',
)

rows = []
for prefix, frame in dfs_maps.items():
  counts = {name: frame[f'{prefix}_n_{name}'].sum() for name in selected_classes}
  t = sum(counts.values())
  # Both conjunction kinds are reported together as a single column.
  conjunctions = counts['coordinating_conjunctions'] + counts['subordinating_conjunctions']
  rows.append({
    'Reviewer': prefix,
    'Nomi': counts['nouns'] / t * 100,
    'Verbi': counts['verbs'] / t * 100,
    'Aggettivi': counts['adjectives'] / t * 100,
    'Pronomi': counts['pronouns'] / t * 100,
    'Articoli': counts['articles'] / t * 100,
    'Avverbi': counts['adverbs'] / t * 100,
    'Preposizioni': counts['prepositions'] / t * 100,
    'Cong.': conjunctions / t * 100,
  })

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,Nomi,Verbi,Aggettivi,Pronomi,Articoli,Avverbi,Preposizioni,Cong.
0,original,32.412348,11.102642,11.528201,2.089685,10.521468,2.496189,24.72688,5.122586
1,reviewer1,29.483198,15.674887,8.876328,3.931921,12.544257,2.931859,20.448475,6.109075
2,reviewer2,31.73657,14.188827,9.780475,2.320186,13.699804,2.841335,20.060682,5.372122
3,gpt4,28.304531,17.154868,9.321412,3.059318,14.679389,3.452993,17.772069,6.255421


# Verbs

In [9]:
# Total number of active and passive verbs per text version.
d = pd.DataFrame([
  {
    'Reviewer': prefix,
    'Verbi attivi': frame[f'{prefix}_n_active_verbs'].sum(),
    'Verbi passivi': frame[f'{prefix}_n_passive_verbs'].sum(),
  }
  for prefix, frame in dfs_maps.items()
])
d.head(10)

Unnamed: 0,Reviewer,Verbi attivi,Verbi passivi
0,original,2684,812
1,reviewer1,4124,923
2,reviewer2,3184,791
3,gpt4,4450,692


In [10]:
# Active / passive verbs as a percentage of all verbs in each text version.
rows = []
for prefix, frame in dfs_maps.items():
  n_verbs = frame[f'{prefix}_n_verbs'].sum()
  n_active = frame[f'{prefix}_n_active_verbs'].sum()
  n_passive = frame[f'{prefix}_n_passive_verbs'].sum()
  rows.append({
    'Reviewer': prefix,
    'Verbi attivi': n_active / n_verbs * 100,
    'Verbi passivi': n_passive / n_verbs * 100,
  })

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,Verbi attivi,Verbi passivi
0,original,76.773455,23.226545
1,reviewer1,81.711908,18.288092
2,reviewer2,80.100629,19.899371
3,gpt4,86.542201,13.457799


# VdB

In [11]:
# Summed VdB counters per text version. Display label -> column suffix.
vdb_count_columns = {
  'ALL': 'n_vdb',
  'FO': 'n_vdb_fo',
  'AU': 'n_vdb_au',
  'AD': 'n_vdb_ad',
}

rows = []
for prefix, frame in dfs_maps.items():
  record = {'Reviewer': prefix}
  record.update(
    (label, frame[f'{prefix}_{suffix}'].sum())
    for label, suffix in vdb_count_columns.items()
  )
  rows.append(record)

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,ALL,FO,AU,AD
0,original,24185,20113,4205,3823
1,reviewer1,27155,24097,3174,3553
2,reviewer2,22780,19561,3196,2794
3,gpt4,25513,22549,2987,2931


In [12]:
# VdB counters as a percentage of the version's `n_tokens` total.
vdb_share_columns = (
  ('ALL', 'n_vdb'),
  ('FO', 'n_vdb_fo'),
  ('AU', 'n_vdb_au'),
  ('AD', 'n_vdb_ad'),
)

rows = []
for prefix, frame in dfs_maps.items():
  n_tokens = frame[f'{prefix}_n_tokens'].sum()
  record = {'Reviewer': prefix}
  for label, suffix in vdb_share_columns:
    record[label] = frame[f'{prefix}_{suffix}'].sum() / n_tokens * 100
  rows.append(record)

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,ALL,FO,AU,AD
0,original,72.638534,60.40847,12.629524,11.482205
1,reviewer1,79.55178,70.593233,9.298374,10.408671
2,reviewer2,76.558562,65.740212,10.741052,9.390018
3,gpt4,80.426833,71.08316,9.416178,9.239644


# Readability

In [13]:
# Mean of each readability-related column per text version, rounded to two
# decimals. Output column name -> suffix of the source column.
readability_columns = {
  'ttr': 'ttr',
  'gulpease_index': 'gulpease',
  'flesch_vacca': 'flesch_vacca',
  'lexical_density': 'lexical_density',
}

d = pd.DataFrame([
  {
    'Reviewer': prefix,
    **{
      name: round(frame[f'{prefix}_{suffix}'].mean(), 2)
      for name, suffix in readability_columns.items()
    },
  }
  for prefix, frame in dfs_maps.items()
])
d.head(10)

Unnamed: 0,Reviewer,ttr,gulpease_index,flesch_vacca,lexical_density
0,original,86.15,44.31,19.97,0.55
1,reviewer1,84.56,49.72,34.23,0.54
2,reviewer2,86.3,50.64,33.63,0.56
3,gpt4,86.64,51.34,36.75,0.56


# Similarity

In [14]:
# Mean semantic-similarity score of every non-original version, rounded to two
# decimals. The 'original' entry is skipped.
d = pd.DataFrame([
  {
    'Reviewer': f'Original vs {prefix}',
    'semantic_similarity': round(frame[f'{prefix}_semantic_similarity'].mean(), 2),
  }
  for prefix, frame in dfs_maps.items()
  if prefix != 'original'
])
d.head(10)

Unnamed: 0,Reviewer,semantic_similarity
0,Original vs reviewer1,83.91
1,Original vs reviewer2,87.87
2,Original vs gpt4,80.02


# Diff

In [15]:
# Edit-distance and added/deleted token totals for every non-original version,
# plus the percentage of added (resp. deleted) tokens counted as VdB tokens.
rows = []
for prefix, frame in dfs_maps.items():
  if prefix != 'original':
    added = frame[f'{prefix}_n_added_tokens'].sum()
    added_vdb = frame[f'{prefix}_n_added_vdb_tokens'].sum()
    deleted = frame[f'{prefix}_n_deleted_tokens'].sum()
    deleted_vdb = frame[f'{prefix}_n_deleted_vdb_tokens'].sum()
    rows.append({
      'Reviewer': f'Original vs {prefix}',
      'editdistance': frame[f'{prefix}_editdistance'].sum(),
      'added_tokens': added,
      'added_vdb_tokens': added_vdb,
      '%_added_vdb_tokens': round(added_vdb / added * 100, 2),
      'deleted_tokens': deleted,
      'deleted_vdb_tokens': deleted_vdb,
      '%_deleted_vdb_tokens': round(deleted_vdb / deleted * 100, 2),
    })

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,Original vs reviewer1,67468,10256,9044,88.18,9838,6629,67.38
1,Original vs reviewer2,63642,6627,5776,87.16,9552,6778,70.96
2,Original vs gpt4,105741,13788,11952,86.68,14864,10456,70.34


In [16]:
# Diff totals of every non-original version as percentages: edit distance over
# the version's `n_chars` total, token counters over its `n_tokens` total.
# NOTE(review): deleted-token counts are normalised by this version's own
# token total, not by the original's -- confirm that this is intended.
token_diff_columns = (
  ('added_tokens', 'n_added_tokens'),
  ('added_vdb_tokens', 'n_added_vdb_tokens'),
  ('deleted_tokens', 'n_deleted_tokens'),
  ('deleted_vdb_tokens', 'n_deleted_vdb_tokens'),
)

rows = []
for prefix, frame in dfs_maps.items():
  if prefix == 'original':
    continue
  n_chars = frame[f'{prefix}_n_chars'].sum()
  n_tokens = frame[f'{prefix}_n_tokens'].sum()
  record = {
    'Reviewer': f'Original vs {prefix}',
    'editdistance': round(frame[f'{prefix}_editdistance'].sum() / n_chars * 100, 2),
  }
  for name, suffix in token_diff_columns:
    record[name] = round(frame[f'{prefix}_{suffix}'].sum() / n_tokens * 100, 2)
  rows.append(record)

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,Original vs reviewer1,37.1,30.05,26.49,28.82,19.42
1,Original vs reviewer2,38.23,22.27,19.41,32.1,22.78
2,Original vs gpt4,60.37,43.47,37.68,46.86,32.96


# Time & Efficiency

In [17]:
# Total reading time per text version, shown in seconds, minutes and hours,
# together with the mean of the per-row reading-time column.
rows = []
for prefix, frame in dfs_maps.items():
  reading_time = frame[f'{prefix}_reading_time']
  total_s = reading_time.sum()
  rows.append({
    'Reviewer': f'{prefix}',
    'reading_time_s': total_s,
    'reading_time_m': total_s / 60,
    'reading_time_h': total_s / 60 / 60,
    'reading_time_mean_s': reading_time.mean(),
  })

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,reading_time_s,reading_time_m,reading_time_h,reading_time_mean_s
0,original,19977.0,332.95,5.549167,32.273021
1,reviewer1,20481.0,341.35,5.689167,33.087237
2,reviewer2,17853.0,297.55,4.959167,28.84168
3,gpt4,19033.2,317.22,5.287,30.748304


In [18]:
# Total `time` column per version (seconds, minutes, hours) and two efficiency
# figures. The 'original' and 'gpt4' entries are skipped.
original_reading_total = dfs_maps['original']['original_reading_time'].sum()

rows = []
for prefix, frame in dfs_maps.items():
  if prefix in ('original', 'gpt4'):
    continue
  total_time = frame[f'{prefix}_time'].sum()
  own_reading_total = frame[f'{prefix}_reading_time'].sum()
  rows.append({
    'Reviewer': f'{prefix}',
    'time_s': total_time,
    'time_m': total_time / 60,
    'time_h': total_time / 60 / 60,
    # Combined reading time of the original and this version, per unit of time.
    'efficiency': (original_reading_total + own_reading_total) / total_time,
    # Mean of the precomputed per-row efficiency column.
    'efficiency_mean': frame[f'{prefix}_efficiency'].mean(),
  })

d = pd.DataFrame(rows)
d.head(10)

Unnamed: 0,Reviewer,time_s,time_m,time_h,efficiency,efficiency_mean
0,reviewer1,55927.0,932.116667,15.535278,0.723407,3.442681
1,reviewer2,82869.0,1381.15,23.019167,0.456504,0.988165
