## Setup

In [1]:
import json

from tqdm import tqdm
import pandas as pd

from italian_ats_evaluator import TextAnalyzer, SimplificationAnalyzer

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Names of the simplified-text sources. Each name selects the input file
# "./texts/<name>.csv" and prefixes the "<name>_*" columns produced below.
TEXTS = [
  'human1',
  'human2',
  'gpt3_5',
  'gpt4',
  'gemini',
  'llama3',
  'phi3',
  'semplit_mt5',
  'semplit_umt5',
  'semplit_gpt2_small_italian',
]

## Load datasets

In [3]:
# Read the original paragraphs, keep only the text and its two locator
# columns, and order the rows by document and then by paragraph index.
original_df = (
  pd.read_csv("./texts/original.csv", encoding="utf-8")
    .loc[:, ['original_text', 'document', 'paragraph_index']]
    .sort_values(by=['document', 'paragraph_index'])
)
# Last expression of the cell: shown as the cell output.
original_df.shape

(619, 3)

In [4]:
# Load "./texts/<name>.csv" for every name in TEXTS and store the prepared
# frame in dfs_map under that name.
dfs_map = {}
for TEXT in TEXTS:
  print(TEXT)
  loaded = pd.read_csv(f"./texts/{TEXT}.csv", encoding="utf-8")
  selected = loaded[['original_text', 'document', 'paragraph_index', 'simplified_text']]
  # Rows of this one document are kept only when paragraph_index <= 80;
  # rows of every other document are always kept.
  is_other_document = selected['document'] != '99bdc9fdd8097f067f77cb220074b1b5'
  is_within_cutoff = selected['paragraph_index'] <= 80
  dfs_map[TEXT] = (
    selected[is_other_document | is_within_cutoff]
      # Give the simplified text a source-specific column name.
      .rename(columns={"simplified_text": f"{TEXT}_text"})
      .sort_values(by=['document', 'paragraph_index'])
  )
  print(dfs_map[TEXT].shape)

human1
(619, 4)
human2
(619, 4)
gpt3_5
(619, 4)
gpt4
(619, 4)
gemini
(619, 4)
llama3
(619, 4)
phi3
(619, 4)
semplit_mt5
(619, 4)
semplit_umt5
(619, 4)
semplit_gpt2_small_italian
(619, 4)


## Metrics

In [5]:
# Attributes copied from each TextAnalyzer result into the metrics table,
# grouped by the result section they are read from. The order of the entries
# is the order of the resulting "original_<attribute>" columns.
ORIGINAL_METRIC_SECTIONS = (
  ('basic', (
    'n_tokens', 'n_tokens_all', 'n_chars', 'n_chars_all',
    'n_syllables', 'n_words', 'n_unique_lemmas', 'n_sentences',
  )),
  ('pos', (
    'n_other', 'n_nouns', 'n_verbs', 'n_number', 'n_symbols', 'n_adverbs',
    'n_articles', 'n_pronouns', 'n_particles', 'n_adjectives',
    'n_prepositions', 'n_proper_nouns', 'n_punctuations', 'n_interjections',
    'n_coordinating_conjunctions', 'n_subordinating_conjunctions',
  )),
  ('verbs', ('n_active_verbs', 'n_passive_verbs')),
  ('readability', ('ttr', 'gulpease', 'flesch_vacca', 'lexical_density')),
)
# The vdb counters are stored under column suffixes that differ from their
# attribute names: (column suffix, attribute on result.vdb).
ORIGINAL_VDB_COLUMNS = (
  ('n_vdb', 'n_vdb_tokens'),
  ('n_vdb_fo', 'n_vdb_fo_tokens'),
  ('n_vdb_au', 'n_vdb_au_tokens'),
  ('n_vdb_ad', 'n_vdb_ad_tokens'),
)

original_metrics = []   # one dict of scalar metrics per paragraph
original_raw_data = []  # one dict with tokens and lemmas per paragraph
for row in tqdm(original_df.to_dict(orient="records")):
  result = TextAnalyzer(row["original_text"])

  # Columns shared by both output records; they identify the paragraph.
  identity = {
    'document': row['document'],
    'paragraph_index': row['paragraph_index'],
    'original_text': row['original_text'],
  }

  metrics_row = dict(identity)
  for section_name, attribute_names in ORIGINAL_METRIC_SECTIONS:
    section = getattr(result, section_name)
    for attribute_name in attribute_names:
      metrics_row[f'original_{attribute_name}'] = getattr(section, attribute_name)
  for column_suffix, attribute_name in ORIGINAL_VDB_COLUMNS:
    metrics_row[f'original_{column_suffix}'] = getattr(result.vdb, attribute_name)
  original_metrics.append(metrics_row)

  original_raw_data.append({
    **identity,
    'original_tokens': result.basic.tokens,
    'original_lemmas': result.basic.lemmas,
  })

100%|██████████| 619/619 [00:05<00:00, 108.52it/s]


In [6]:
# Attributes read from result.simplified, keyed by section. Dict order is the
# order of the resulting "<name>_<attribute>" columns.
SIMPLIFIED_METRIC_SECTIONS = {
  'basic': [
    'n_tokens', 'n_tokens_all', 'n_chars', 'n_chars_all',
    'n_syllables', 'n_words', 'n_unique_lemmas', 'n_sentences',
  ],
  'pos': [
    'n_other', 'n_nouns', 'n_verbs', 'n_number', 'n_symbols', 'n_adverbs',
    'n_articles', 'n_pronouns', 'n_particles', 'n_adjectives',
    'n_prepositions', 'n_proper_nouns', 'n_punctuations', 'n_interjections',
    'n_coordinating_conjunctions', 'n_subordinating_conjunctions',
  ],
  'verbs': ['n_active_verbs', 'n_passive_verbs'],
  'readability': ['ttr', 'gulpease', 'flesch_vacca', 'lexical_density'],
}
# Column suffix -> attribute on result.simplified.vdb (the names differ).
SIMPLIFIED_VDB_COLUMNS = {
  'n_vdb': 'n_vdb_tokens',
  'n_vdb_fo': 'n_vdb_fo_tokens',
  'n_vdb_au': 'n_vdb_au_tokens',
  'n_vdb_ad': 'n_vdb_ad_tokens',
}
# Attributes read from result.diff, which is computed from the text pair.
SIMPLIFIED_DIFF_ATTRIBUTES = [
  'editdistance',
  'n_added_tokens',
  'n_deleted_tokens',
  'n_added_vdb_tokens',
  'n_deleted_vdb_tokens',
]

# One list of records per source name, created up front for every name.
simplified_metrics = {name: [] for name in TEXTS}
simplified_raw_data = {name: [] for name in TEXTS}
for TEXT in TEXTS:
  print(TEXT)
  text_column = f'{TEXT}_text'
  for row in tqdm(dfs_map[TEXT].to_dict(orient="records")):
    # Analyze the (original, simplified) pair of the current paragraph.
    result = SimplificationAnalyzer(row["original_text"], row[text_column])
    simplified = result.simplified

    # Columns shared by both output records; they identify the paragraph.
    identity = {
      'document': row['document'],
      'paragraph_index': row['paragraph_index'],
      'original_text': row['original_text'],
      text_column: row[text_column],
    }

    metrics_row = dict(identity)
    # Metrics of the simplified text alone.
    for section_name, attribute_names in SIMPLIFIED_METRIC_SECTIONS.items():
      section = getattr(simplified, section_name)
      metrics_row.update(
        (f'{TEXT}_{attribute_name}', getattr(section, attribute_name))
        for attribute_name in attribute_names
      )
    metrics_row.update(
      (f'{TEXT}_{column_suffix}', getattr(simplified.vdb, attribute_name))
      for column_suffix, attribute_name in SIMPLIFIED_VDB_COLUMNS.items()
    )
    # Similarity between the original and the simplified text.
    metrics_row[f'{TEXT}_semantic_similarity'] = result.similarity.semantic_similarity
    # Differences between the original and the simplified text.
    metrics_row.update(
      (f'{TEXT}_{attribute_name}', getattr(result.diff, attribute_name))
      for attribute_name in SIMPLIFIED_DIFF_ATTRIBUTES
    )
    simplified_metrics[TEXT].append(metrics_row)

    simplified_raw_data[TEXT].append({
      **identity,
      f'{TEXT}_tokens': simplified.basic.tokens,
      f'{TEXT}_lemmas': simplified.basic.lemmas,
    })


human1


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
100%|██████████| 619/619 [00:26<00:00, 23.54it/s]


human2


100%|██████████| 619/619 [00:22<00:00, 27.83it/s]


gpt3_5


100%|██████████| 619/619 [00:24<00:00, 25.16it/s]


gpt4


100%|██████████| 619/619 [00:23<00:00, 26.22it/s]


gemini


100%|██████████| 619/619 [00:24<00:00, 24.82it/s]


llama3


100%|██████████| 619/619 [00:24<00:00, 25.79it/s]


phi3


100%|██████████| 619/619 [00:23<00:00, 26.08it/s]


semplit_mt5


100%|██████████| 619/619 [00:22<00:00, 27.62it/s]


semplit_umt5


100%|██████████| 619/619 [00:22<00:00, 27.68it/s]


semplit_gpt2_small_italian


100%|██████████| 619/619 [00:23<00:00, 26.84it/s]


## Save

In [7]:
# Write the metrics of the original texts as CSV, without the index column.
pd.DataFrame(original_metrics).to_csv("./texts_with_metrics/original.csv", index=False)
# Write the tokens and lemmas of the original texts as JSON. The context
# manager flushes and closes the file handle as soon as the dump finishes,
# instead of leaving an anonymous open() handle to be reclaimed later.
with open("./texts_with_metrics/original.json", 'w', encoding="utf-8") as raw_data_file:
  json.dump(original_raw_data, raw_data_file)

In [8]:
# For every name in TEXTS, write its metrics as CSV (without the index
# column) and its tokens and lemmas as JSON.
for TEXT in TEXTS:
  print(TEXT)
  pd.DataFrame(simplified_metrics[TEXT]).to_csv(f"./texts_with_metrics/{TEXT}.csv", index=False)
  # The context manager flushes and closes each file handle before the next
  # iteration, so no open() handle is left dangling per written file.
  with open(f"./texts_with_metrics/{TEXT}.json", 'w', encoding="utf-8") as raw_data_file:
    json.dump(simplified_raw_data[TEXT], raw_data_file)

human1
human2
gpt3_5
gpt4
gemini
llama3
phi3
semplit_mt5
semplit_umt5
semplit_gpt2_small_italian
