# Merge

## Dependencies and imports

In [1]:
import json

import pandas as pd
from tqdm import tqdm

from italian_ats_evaluator import TextAnalyzer, SimplificationAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


## Utils

In [2]:
def reading_time(n_tokens, words_per_minute=100.0):
  """Estimate how long it takes to read a text, in seconds.

  Args:
    n_tokens: Number of tokens (treated as words) in the text.
    words_per_minute: Assumed reading speed. Defaults to 100.0, the value
      that used to be hard-coded, so existing calls give the same result.

  Returns:
    The reading time in seconds, rounded to 2 decimal places.

  Raises:
    ZeroDivisionError: If `words_per_minute` is 0.
  """
  words_per_second = words_per_minute / 60.0
  return round(n_tokens / words_per_second, 2)

def efficiency(original_reading_time, simplified_reading_time, simplification_time):
  """Ratio between the total reading time and the time spent simplifying.

  Args:
    original_reading_time: Reading time of the original text.
    simplified_reading_time: Reading time of the simplified text.
    simplification_time: Time spent producing the simplification.

  Returns:
    (original_reading_time + simplified_reading_time) / simplification_time,
    rounded to 6 decimal places, or the sentinel 999 when
    `simplification_time` is 0 (the ratio would be undefined).
  """
  # Guard first: a zero simplification time cannot be used as a divisor.
  if simplification_time == 0:
    return 999

  total_reading_time = original_reading_time + simplified_reading_time
  return round(total_reading_time / simplification_time, 6)

## Load Data

In [3]:
# Read the four corpora from the "corpora" folder as UTF-8 CSV files,
# in the order: original paragraphs, reviewer 1, reviewer 2, gpt4.
paragraphs_df, reviewer_1_df, reviewer_2_df, gpt4_df = (
  pd.read_csv(f"corpora/{corpus}.csv", encoding='utf-8')
  for corpus in ("original", "reviewer1", "reviewer2", "gpt4")
)

In [4]:
# (rows, columns) of each freshly loaded table, in load order.
tuple(frame.shape for frame in (paragraphs_df, reviewer_1_df, reviewer_2_df, gpt4_df))

((878, 3), (878, 8), (878, 8), (619, 4))

## Clean Data

In [5]:
# For this one document only paragraphs up to index 80 are kept;
# every other document is kept in full.
_FILTERED_DOCUMENT = '99bdc9fdd8097f067f77cb220074b1b5'
_MAX_PARAGRAPH_INDEX = 80

def _keep_rows_in_scope(frame):
  """Return the rows of `frame` that belong to another document or have paragraph_index <= 80."""
  other_document = frame['document'] != _FILTERED_DOCUMENT
  within_limit = frame['paragraph_index'] <= _MAX_PARAGRAPH_INDEX
  return frame[other_document | within_limit]

paragraphs_df = _keep_rows_in_scope(paragraphs_df)
reviewer_1_df = _keep_rows_in_scope(reviewer_1_df)
reviewer_2_df = _keep_rows_in_scope(reviewer_2_df)
gpt4_df = _keep_rows_in_scope(gpt4_df)

In [6]:
# Remove the 'tagging_date' column from both reviewer tables
# (no later cell shown in this notebook reads it).
reviewer_1_df, reviewer_2_df = (
  frame.drop(columns=['tagging_date'])
  for frame in (reviewer_1_df, reviewer_2_df)
)

In [7]:
# Prefix the reviewer-specific columns with the reviewer's name so that the
# tables can be told apart; the source column names are kept exactly as they
# appear in the CSV files (including the 'isssues' spelling).
reviewer_1_df, reviewer_2_df = (
  frame.rename(columns={
    "simplified_text": f"{prefix}_text",
    'original_text_isssues_detected': f"{prefix}_issues",
    'simplification_rules_applied': f"{prefix}_rules",
    'tagging_elapsed_time': f"{prefix}_time",
  })
  for prefix, frame in (("reviewer1", reviewer_1_df), ("reviewer2", reviewer_2_df))
)
gpt4_df = gpt4_df.rename(columns={"simplified_text": "gpt4_text"})

In [8]:
# (rows, columns) of the reviewer and gpt4 tables after cleaning.
tuple(frame.shape for frame in (reviewer_1_df, reviewer_2_df, gpt4_df))

((619, 7), (619, 7), (619, 4))

## Original

In [9]:
# Analyzer sections and the attributes copied from each one, listed in the
# order in which they become columns of the metrics table.
_SECTION_FIELDS = {
  'basic': (
    'n_tokens', 'n_tokens_all', 'n_chars', 'n_chars_all',
    'n_syllables', 'n_words', 'n_unique_lemmas', 'n_sentences',
  ),
  'pos': (
    'n_other', 'n_nouns', 'n_verbs', 'n_number', 'n_symbols', 'n_adverbs',
    'n_articles', 'n_pronouns', 'n_particles', 'n_adjectives', 'n_prepositions',
    'n_proper_nouns', 'n_punctuations', 'n_interjections',
    'n_coordinating_conjunctions', 'n_subordinating_conjunctions',
  ),
  'verbs': ('n_active_verbs', 'n_passive_verbs'),
  'readability': ('ttr', 'gulpease', 'flesch_vacca', 'lexical_density'),
}
# VdB columns are named without the "_tokens" suffix of the analyzer attribute.
_VDB_FIELDS = {
  'n_vdb': 'n_vdb_tokens',
  'n_vdb_fo': 'n_vdb_fo_tokens',
  'n_vdb_au': 'n_vdb_au_tokens',
  'n_vdb_ad': 'n_vdb_ad_tokens',
}

original_metrics = []
original_raw_data = []
for row in tqdm(paragraphs_df.to_dict(orient="records")):
  result = TextAnalyzer(row["original_text"])
  # Identifying columns shared by the metrics table and the raw-data dump.
  identity = {key: row[key] for key in ('document', 'paragraph_index', 'original_text')}

  # One flat record per paragraph: identity columns, then every metric
  # prefixed with "original_".
  metrics = dict(identity)
  for section, fields in _SECTION_FIELDS.items():
    analysis = getattr(result, section)
    for field in fields:
      metrics[f'original_{field}'] = getattr(analysis, field)
  for column, attribute in _VDB_FIELDS.items():
    metrics[f'original_{column}'] = getattr(result.vdb, attribute)
  metrics['original_reading_time'] = reading_time(result.basic.n_tokens)
  original_metrics.append(metrics)

  # Token and lemma lists are kept apart from the flat metrics table.
  original_raw_data.append({
    **identity,
    'original_tokens': result.basic.tokens,
    'original_lemmas': result.basic.lemmas,
  })

  0%|          | 0/619 [00:00<?, ?it/s]

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


100%|██████████| 619/619 [00:27<00:00, 22.52it/s] 


In [10]:
# Persist the per-paragraph metrics as CSV and the token/lemma lists as JSON.
pd.DataFrame(original_metrics).to_csv("./corpora_with_metrics/original.csv", encoding="utf-8", index=False)
# The context manager flushes and closes the JSON file deterministically
# instead of leaving the handle open until garbage collection.
with open("./corpora_with_metrics/original.json", 'w', encoding="utf-8") as raw_data_file:
  json.dump(original_raw_data, raw_data_file)

## Reviewer1

In [11]:
# Analyzer sections and the attributes copied from each one, listed in the
# order in which they become columns of the metrics table.
_SECTION_FIELDS = {
  'basic': (
    'n_tokens', 'n_tokens_all', 'n_chars', 'n_chars_all',
    'n_syllables', 'n_words', 'n_unique_lemmas', 'n_sentences',
  ),
  'pos': (
    'n_other', 'n_nouns', 'n_verbs', 'n_number', 'n_symbols', 'n_adverbs',
    'n_articles', 'n_pronouns', 'n_particles', 'n_adjectives', 'n_prepositions',
    'n_proper_nouns', 'n_punctuations', 'n_interjections',
    'n_coordinating_conjunctions', 'n_subordinating_conjunctions',
  ),
  'verbs': ('n_active_verbs', 'n_passive_verbs'),
  'readability': ('ttr', 'gulpease', 'flesch_vacca', 'lexical_density'),
}
# VdB columns are named without the "_tokens" suffix of the analyzer attribute.
_VDB_FIELDS = {
  'n_vdb': 'n_vdb_tokens',
  'n_vdb_fo': 'n_vdb_fo_tokens',
  'n_vdb_au': 'n_vdb_au_tokens',
  'n_vdb_ad': 'n_vdb_ad_tokens',
}
# Attributes read from the original-vs-simplified diff.
_DIFF_FIELDS = (
  'editdistance', 'n_added_tokens', 'n_deleted_tokens',
  'n_added_vdb_tokens', 'n_deleted_vdb_tokens',
)

reviewer1_metrics = []
reviewer1_raw_data = []
for row in tqdm(reviewer_1_df.to_dict(orient="records")):
  result = SimplificationAnalyzer(row["original_text"], row["reviewer1_text"])
  simplified = result.simplified
  # Identifying columns shared by the metrics table and the raw-data dump.
  identity = {
    key: row[key]
    for key in ('document', 'paragraph_index', 'original_text', 'reviewer1_text')
  }

  # One flat record per paragraph: identity columns, then every metric of the
  # simplified text prefixed with "reviewer1_".
  metrics = dict(identity)
  for section, fields in _SECTION_FIELDS.items():
    analysis = getattr(simplified, section)
    for field in fields:
      metrics[f'reviewer1_{field}'] = getattr(analysis, field)
  for column, attribute in _VDB_FIELDS.items():
    metrics[f'reviewer1_{column}'] = getattr(simplified.vdb, attribute)
  # Similarity
  metrics['reviewer1_semantic_similarity'] = result.similarity.semantic_similarity
  # Diff
  for field in _DIFF_FIELDS:
    metrics[f'reviewer1_{field}'] = getattr(result.diff, field)
  # Time & efficiency (the simplified reading time is computed once and reused)
  simplified_reading_time = reading_time(simplified.basic.n_tokens)
  metrics['reviewer1_reading_time'] = simplified_reading_time
  metrics['reviewer1_time'] = row['reviewer1_time']
  metrics['reviewer1_efficiency'] = efficiency(
    original_reading_time=reading_time(result.reference.basic.n_tokens),
    simplified_reading_time=simplified_reading_time,
    simplification_time=row['reviewer1_time'],
  )
  reviewer1_metrics.append(metrics)

  # A missing annotation is a NaN/None value, not a string: detect it with
  # pd.isna instead of comparing its string form to 'nan', and leave `row`
  # itself untouched.
  issues = '' if pd.isna(row['reviewer1_issues']) else row['reviewer1_issues']
  rules = '' if pd.isna(row['reviewer1_rules']) else row['reviewer1_rules']
  reviewer1_raw_data.append({
    **identity,
    'reviewer1_tokens': simplified.basic.tokens,
    'reviewer1_lemmas': simplified.basic.lemmas,
    # Annotations are '|'-separated; lower-case them and drop empty entries.
    'reviewer1_issues': [issue for issue in issues.lower().split('|') if issue != ""],
    'reviewer1_rules': [rule for rule in rules.lower().split('|') if rule != ""],
  })

100%|██████████| 619/619 [00:26<00:00, 23.49it/s]


In [12]:
# Persist the per-paragraph metrics as CSV and the token/lemma/annotation lists as JSON.
pd.DataFrame(reviewer1_metrics).to_csv("./corpora_with_metrics/reviewer1.csv", encoding="utf-8", index=False)
# The context manager flushes and closes the JSON file deterministically
# instead of leaving the handle open until garbage collection.
with open("./corpora_with_metrics/reviewer1.json", 'w', encoding="utf-8") as raw_data_file:
  json.dump(reviewer1_raw_data, raw_data_file)

## Reviewer2

In [13]:
# Analyzer sections and the attributes copied from each one, listed in the
# order in which they become columns of the metrics table.
_SECTION_FIELDS = {
  'basic': (
    'n_tokens', 'n_tokens_all', 'n_chars', 'n_chars_all',
    'n_syllables', 'n_words', 'n_unique_lemmas', 'n_sentences',
  ),
  'pos': (
    'n_other', 'n_nouns', 'n_verbs', 'n_number', 'n_symbols', 'n_adverbs',
    'n_articles', 'n_pronouns', 'n_particles', 'n_adjectives', 'n_prepositions',
    'n_proper_nouns', 'n_punctuations', 'n_interjections',
    'n_coordinating_conjunctions', 'n_subordinating_conjunctions',
  ),
  'verbs': ('n_active_verbs', 'n_passive_verbs'),
  'readability': ('ttr', 'gulpease', 'flesch_vacca', 'lexical_density'),
}
# VdB columns are named without the "_tokens" suffix of the analyzer attribute.
_VDB_FIELDS = {
  'n_vdb': 'n_vdb_tokens',
  'n_vdb_fo': 'n_vdb_fo_tokens',
  'n_vdb_au': 'n_vdb_au_tokens',
  'n_vdb_ad': 'n_vdb_ad_tokens',
}
# Attributes read from the original-vs-simplified diff.
_DIFF_FIELDS = (
  'editdistance', 'n_added_tokens', 'n_deleted_tokens',
  'n_added_vdb_tokens', 'n_deleted_vdb_tokens',
)

reviewer2_metrics = []
reviewer2_raw_data = []
for row in tqdm(reviewer_2_df.to_dict(orient="records")):
  result = SimplificationAnalyzer(row["original_text"], row["reviewer2_text"])
  simplified = result.simplified
  # Identifying columns shared by the metrics table and the raw-data dump.
  identity = {
    key: row[key]
    for key in ('document', 'paragraph_index', 'original_text', 'reviewer2_text')
  }

  # One flat record per paragraph: identity columns, then every metric of the
  # simplified text prefixed with "reviewer2_".
  metrics = dict(identity)
  for section, fields in _SECTION_FIELDS.items():
    analysis = getattr(simplified, section)
    for field in fields:
      metrics[f'reviewer2_{field}'] = getattr(analysis, field)
  for column, attribute in _VDB_FIELDS.items():
    metrics[f'reviewer2_{column}'] = getattr(simplified.vdb, attribute)
  # Similarity
  metrics['reviewer2_semantic_similarity'] = result.similarity.semantic_similarity
  # Diff
  for field in _DIFF_FIELDS:
    metrics[f'reviewer2_{field}'] = getattr(result.diff, field)
  # Time & efficiency (the simplified reading time is computed once and reused)
  simplified_reading_time = reading_time(simplified.basic.n_tokens)
  metrics['reviewer2_reading_time'] = simplified_reading_time
  metrics['reviewer2_time'] = row['reviewer2_time']
  metrics['reviewer2_efficiency'] = efficiency(
    original_reading_time=reading_time(result.reference.basic.n_tokens),
    simplified_reading_time=simplified_reading_time,
    simplification_time=row['reviewer2_time'],
  )
  reviewer2_metrics.append(metrics)

  # A missing annotation is a NaN/None value, not a string: detect it with
  # pd.isna instead of comparing its string form to 'nan', and leave `row`
  # itself untouched.
  issues = '' if pd.isna(row['reviewer2_issues']) else row['reviewer2_issues']
  rules = '' if pd.isna(row['reviewer2_rules']) else row['reviewer2_rules']
  reviewer2_raw_data.append({
    **identity,
    'reviewer2_tokens': simplified.basic.tokens,
    'reviewer2_lemmas': simplified.basic.lemmas,
    # Annotations are '|'-separated; lower-case them and drop empty entries.
    'reviewer2_issues': [issue for issue in issues.lower().split('|') if issue != ""],
    'reviewer2_rules': [rule for rule in rules.lower().split('|') if rule != ""],
  })

100%|██████████| 619/619 [00:20<00:00, 30.44it/s]


In [14]:
# Persist the per-paragraph metrics as CSV and the token/lemma/annotation lists as JSON.
pd.DataFrame(reviewer2_metrics).to_csv("./corpora_with_metrics/reviewer2.csv", encoding="utf-8", index=False)
# The context manager flushes and closes the JSON file deterministically
# instead of leaving the handle open until garbage collection.
with open("./corpora_with_metrics/reviewer2.json", 'w', encoding="utf-8") as raw_data_file:
  json.dump(reviewer2_raw_data, raw_data_file)

## Gpt4

In [15]:
# Analyzer sections and the attributes copied from each one, listed in the
# order in which they become columns of the metrics table.
_SECTION_FIELDS = {
  'basic': (
    'n_tokens', 'n_tokens_all', 'n_chars', 'n_chars_all',
    'n_syllables', 'n_words', 'n_unique_lemmas', 'n_sentences',
  ),
  'pos': (
    'n_other', 'n_nouns', 'n_verbs', 'n_number', 'n_symbols', 'n_adverbs',
    'n_articles', 'n_pronouns', 'n_particles', 'n_adjectives', 'n_prepositions',
    'n_proper_nouns', 'n_punctuations', 'n_interjections',
    'n_coordinating_conjunctions', 'n_subordinating_conjunctions',
  ),
  'verbs': ('n_active_verbs', 'n_passive_verbs'),
  'readability': ('ttr', 'gulpease', 'flesch_vacca', 'lexical_density'),
}
# VdB columns are named without the "_tokens" suffix of the analyzer attribute.
_VDB_FIELDS = {
  'n_vdb': 'n_vdb_tokens',
  'n_vdb_fo': 'n_vdb_fo_tokens',
  'n_vdb_au': 'n_vdb_au_tokens',
  'n_vdb_ad': 'n_vdb_ad_tokens',
}
# Attributes read from the original-vs-simplified diff.
_DIFF_FIELDS = (
  'editdistance', 'n_added_tokens', 'n_deleted_tokens',
  'n_added_vdb_tokens', 'n_deleted_vdb_tokens',
)

gpt4_metrics = []
gpt4_raw_data = []
for row in tqdm(gpt4_df.to_dict(orient="records")):
  result = SimplificationAnalyzer(row["original_text"], row["gpt4_text"])
  # Identifying columns shared by the metrics table and the raw-data dump.
  identity = {
    key: row[key]
    for key in ('document', 'paragraph_index', 'original_text', 'gpt4_text')
  }

  # One flat record per paragraph: identity columns, then every metric of the
  # simplified text prefixed with "gpt4_".
  metrics = dict(identity)
  for section, fields in _SECTION_FIELDS.items():
    analysis = getattr(result.simplified, section)
    for field in fields:
      metrics[f'gpt4_{field}'] = getattr(analysis, field)
  for column, attribute in _VDB_FIELDS.items():
    metrics[f'gpt4_{column}'] = getattr(result.simplified.vdb, attribute)
  # Similarity
  metrics['gpt4_semantic_similarity'] = result.similarity.semantic_similarity
  # Diff
  for field in _DIFF_FIELDS:
    metrics[f'gpt4_{field}'] = getattr(result.diff, field)
  # Time (no simplification time is recorded here, so no efficiency column)
  metrics['gpt4_reading_time'] = reading_time(result.simplified.basic.n_tokens)
  gpt4_metrics.append(metrics)

  # Token and lemma lists are kept apart from the flat metrics table.
  gpt4_raw_data.append({
    **identity,
    'gpt4_tokens': result.simplified.basic.tokens,
    'gpt4_lemmas': result.simplified.basic.lemmas,
  })

100%|██████████| 619/619 [00:21<00:00, 29.23it/s]


In [16]:
# Persist the per-paragraph metrics as CSV and the token/lemma lists as JSON.
pd.DataFrame(gpt4_metrics).to_csv("./corpora_with_metrics/gpt4.csv", encoding="utf-8", index=False)
# The context manager flushes and closes the JSON file deterministically
# instead of leaving the handle open until garbage collection.
with open("./corpora_with_metrics/gpt4.json", 'w', encoding="utf-8") as raw_data_file:
  json.dump(gpt4_raw_data, raw_data_file)