In [5]:
# Mount Google Drive so the notebook can use files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


 ## Metrics for text similarity

 **BERTSimilarity** from https://github.com/abhilash1910/BERTSimilarity measures the cosine distance between word vectors (using BERT embeddings). 
* It is helpful when evaluating the changes in meaning.
* Range: 0 (very close) to 1 (very far)


 **BLEU** from *sumeval.metrics.bleu* relies on 1- to 4-gram overlaps and penalizes large differences in length (we do not want to lose too much information). 
 * The score is originally used for evaluation of translations; abstractive sentence simplification can be considered a translation from complex language to a simpler one.
 * Range: 0 (bad) to 1 (unrealistically perfect)
 * The interpretation scale:

![bleu_score_range.png](https://cloud.google.com/translate/automl/docs/images/bleu_score_range.png)


 **ROUGE** from *sumeval.metrics.rouge* measures unigram (ROUGE-1), bigram (ROUGE-2), and longest common subsequence (ROUGE-L) overlaps.
* It is similar to BLEU, but helpful for evaluating how much of the original sentence's content is kept.
* Range: 0 (no overlap) to 1 (complete overlap of n-grams)

**QuestEval** from https://github.com/ThomasScialom/QuestEval assesses if two different inputs contain the same information.
* Said to perform better than BLEU and SARI (https://github.com/ThomasScialom/QuestEval#text-simplification)
* Range: 0 (no similarities) to 1 (contains the same information)

In [None]:
# installing QuestEval
# Work from the Tools folder on the mounted Drive.
%cd /content/drive/MyDrive/Tools
# Presumably run once to fetch the sources; kept commented out.
# !git clone https://github.com/ThomasScialom/QuestEval.git
%cd QuestEval
# Editable install of the checkout in the current directory.
!pip install -e .

In [None]:
# libraries for calculating scores
# NOTE(review): BERTSimilarity and sumeval are not version-pinned, so a fresh
# run may pick up different releases -- consider pinning them.
!pip install BERTSimilarity
!pip install sumeval
# NOTE(review): the EASSE section further down installs sacrebleu==2.0, which
# replaces this pin once that cell has run.
!pip install sacrebleu==1.3.2

# libraries for data manipulation
!pip install pandas==1.3.0
!pip install tables==3.5.1

In [7]:
import os
import pandas as pd


def sentences_to_dataframe(original_doc, simplified_doc, label):
    """Return a dataframe pairing original and simplified sentences.

    Both files are read as UTF-8 and must hold one sentence per line; line i
    of ``original_doc`` is paired with line i of ``simplified_doc``.  If the
    files differ in length, the extra lines of the longer one are dropped.

    original_doc -- path to the file with the original sentences
    simplified_doc -- path to the file with the simplified sentences
    label -- a string to identify the source (e.g., a filename)

    Columns of the result: 'label', 'original', 'simplified' and 'split'
    (the simplified text cut into pieces at every '. ').
    """

    # Explicit encoding so the result does not depend on the platform default.
    with open(original_doc, 'r', encoding='utf-8') as f:
        original = f.read().splitlines()
    with open(simplified_doc, 'r', encoding='utf-8') as f:
        simplified = f.read().splitlines()

    df = pd.DataFrame(data = list(zip(original, simplified)),
                      columns = ['original', 'simplified'])
    # Raw string: '\. ' in a plain literal is an invalid escape sequence.
    # pandas treats this multi-character pattern as a regular expression, so
    # the text is split on a literal period followed by a space.
    df['split'] = df['simplified'].str.split(r'\. ')
    df.insert(loc = 0,
              column = 'label',
              value = label)

    return df


def docs_to_dataframe(original_dir, simplified_dir):
    """Return one dataframe with the sentence pairs of all documents.

    Every entry of ``original_dir`` is paired with the entry of the same
    name in ``simplified_dir`` and loaded with ``sentences_to_dataframe``,
    using the file name as label.  Documents are processed in sorted name
    order so the row order is reproducible between runs.  The per-document
    row indices are kept as they are (the index is not renumbered).

    Returns an empty dataframe if ``original_dir`` has no entries.
    """

    # Collect the per-document frames first and concatenate once:
    # DataFrame.append copied the accumulated frame on every iteration and is
    # no longer available in pandas 2.x.
    doc_dfs = [
        sentences_to_dataframe(original_doc = os.path.join(original_dir, doc),
                               simplified_doc = os.path.join(simplified_dir, doc),
                               label = doc)
        for doc in sorted(os.listdir(original_dir))
    ]

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not doc_dfs:
        return pd.DataFrame()

    return pd.concat(doc_dfs)


# Input folders on the mounted Drive; docs_to_dataframe pairs the files of
# the two folders by file name.
original_dir = '/content/drive/MyDrive/Tools/muss/data/original'
simplified_dir = '/content/drive/MyDrive/Tools/muss/data/simplified'

# Sentence pairs of all documents in a single dataframe.
df_docs = docs_to_dataframe(original_dir, simplified_dir)

df_docs.head()

Unnamed: 0,label,original,simplified,split
0,wiki-clock-en.txt,A clock or a timepiece is a device used to mea...,A clock or timepiece is a piece of equipment t...,[A clock or timepiece is a piece of equipment ...
1,wiki-clock-en.txt,The clock is one of the oldest human invention...,The clock is one of the oldest human invention...,[The clock is one of the oldest human inventio...
2,wiki-clock-en.txt,Devices operating on several physical processe...,Several different physical processes have been...,[Several different physical processes have bee...
3,wiki-clock-en.txt,Some predecessors to the modern clock may be c...,Some predecessors to the modern clock may be c...,[Some predecessors to the modern clock may be ...
4,wiki-clock-en.txt,"There is a range of duration timers, a well-kn...",There are different types of timers. An exampl...,"[There are different types of timers, An examp..."


In [None]:
import BERTSimilarity.BERTSimilarity as bertsimilarity
from sumeval.metrics.bleu import BLEUCalculator
from sumeval.metrics.rouge import RougeCalculator
from questeval.questeval_metric import QuestEval


def calculate_bertsimilarity(df):
    """Add BERTSimilarity columns to ``df`` (modified in place, returns None).

    'BERTSimilarity' holds the distance between the original and the whole
    simplified text; 'split BERTSimilarity' holds a list with one distance
    per entry of the 'split' column.
    """

    model = bertsimilarity.BERTSimilarity()

    def whole_distance(row):
        return model.calculate_distance(row['original'], row['simplified'])

    def piece_distances(row):
        source_text = row['original']
        return [model.calculate_distance(source_text, piece)
                for piece in row['split']]

    df['BERTSimilarity'] = df.apply(whole_distance, axis=1)
    df['split BERTSimilarity'] = df.apply(piece_distances, axis=1)


def calculate_bleu(df):
    """Add BLEU columns to ``df`` (modified in place, returns None).

    sumeval's ``BLEUCalculator.bleu`` takes the text being scored first and
    the reference second.  The simplified text is the one being evaluated, so
    it is passed as the candidate and the original sentence as the reference;
    this way the brevity penalty applies to simplifications that are much
    shorter than the original.

    'BLEU' scores the whole simplified text; 'split BLEU' is a list with one
    score per entry of the 'split' column.
    """

    bleu = BLEUCalculator()

    def score(candidate, reference):
        # Positional order expected by sumeval: (summary, references).
        return bleu.bleu(candidate, reference)

    df['BLEU'] = df.apply(lambda row :
                        score(row['simplified'], row['original']),
                        axis=1)
    df['split BLEU'] = df.apply(lambda row :
                            [score(sent, row['original'])
                            for sent in row['split']
                            ], axis=1)


def calculate_rouge(df):
    """Add ROUGE columns to ``df`` (modified in place, returns None).

    ROUGE-1 and ROUGE-2 score unigram and bigram overlap respectively;
    ROUGE-L scores the longest common subsequence.  Each variant gets a
    column for the whole simplified text and a 'split ...' column holding one
    score per entry of the 'split' column.
    """

    scorer = RougeCalculator(stopwords=True, lang="en")

    def unigram_overlap(first, second):
        return scorer.rouge_n(first, second, n=1)

    def bigram_overlap(first, second):
        return scorer.rouge_n(first, second, n=2)

    def longest_subsequence(first, second):
        return scorer.rouge_l(first, second)

    # (whole-text column, per-piece column, scoring function), listed in the
    # order in which the columns are appended to the frame.
    variants = (
        ('ROUGE 1', 'split ROUGE 1', unigram_overlap),
        ('ROUGE 2', 'split ROUGE 2', bigram_overlap),
        ('ROUGE L', 'split ROUGE L', longest_subsequence),
    )

    for whole_column, split_column, measure in variants:
        df[whole_column] = df.apply(
            lambda row: measure(row['original'], row['simplified']),
            axis=1)
        df[split_column] = df.apply(
            lambda row: [measure(row['original'], piece)
                         for piece in row['split']],
            axis=1)


def calculate_questeval(df):
    """Add QuestEval columns to ``df`` (modified in place, returns None).

    No reference texts are supplied.  'QuestEval' holds the corpus score for
    the original paired with the whole simplified text; 'split QuestEval'
    holds a list with one corpus score per entry of the 'split' column.
    """

    scorer = QuestEval()

    def corpus_score(first_text, second_text):
        # Each call scores a one-element "corpus".
        outcome = scorer.corpus_questeval([first_text], [second_text])
        return outcome['corpus_score']

    def whole_score(row):
        return corpus_score(row['original'], row['simplified'])

    def piece_scores(row):
        return [corpus_score(row['original'], piece) for piece in row['split']]

    df['QuestEval'] = df.apply(whole_score, axis=1)
    df['split QuestEval'] = df.apply(piece_scores, axis=1)


def calculate_scores(df):
    """Return ``df`` expanded with similarity scores:
    BERTSimilarity, BLEU, ROUGE-1, ROUGE-2, ROUGE-L and QuestEval.

    Every sentence is compared with its simplified version and with each
    piece listed in the 'split' column (the "split" scores).  The columns are
    added to the frame that was passed in, and that same frame is returned.
    """

    metric_steps = (
        calculate_bertsimilarity,
        calculate_bleu,
        calculate_rouge,
        calculate_questeval,
    )
    for add_columns in metric_steps:
        add_columns(df)

    return df


# Add every score column to df_docs, then save the result to Drive as CSV.
df_docs = calculate_scores(df_docs)
df_docs.to_csv('/content/drive/MyDrive/Colab Notebooks/data/simplification_scores.csv')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.de

# EASSE
https://github.com/feralvam/easse/

In [None]:
# Packages installed for the EASSE section.
# NOTE(review): sacrebleu==2.0 replaces the sacrebleu==1.3.2 installed by the
# earlier scoring-libraries cell; re-running that section afterwards may need
# a reinstall.
# NOTE(review): tupa and stanza are not version-pinned.
!pip install imgaug==0.2.5
!pip install sacrebleu==2.0
!pip install tupa
!pip install nltk==3.6.5
!pip install spacy==2.2.2
!pip install stanza

In [None]:
import nltk
# NLTK resources: the 'punkt' tokenizer models and the stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

# spaCy English model en_core_web_md.
!python -m spacy download en_core_web_md

In [None]:
# Work from the Tools folder on the mounted Drive.
%cd /content/drive/MyDrive/Tools
# Presumably run once to fetch the sources; kept commented out.
# !git clone https://github.com/feralvam/easse.git
%cd easse
# Editable install of the checkout in the current directory.
!pip install -e .

In [None]:
# EASSE report (SARI, FKGL, sentence BLEU, BERTScore) on the turkcorpus_test
# set for the ACCESS system output; the input path is relative, so this must
# run from the easse checkout directory.
!sudo easse report -m 'sari,fkgl,sent_bleu,bertscore' -t turkcorpus_test < easse/resources/data/system_outputs/turkcorpus/test/ACCESS

In [None]:
import pandas as pd

# Export the sentence pairs as plain-text files (one sentence per line) so
# the EASSE command-line tool can read them.
# The pairs come from df_docs, the frame built by docs_to_dataframe; no frame
# named `df` exists at notebook level, so using that name fails on a fresh kernel.
# NOTE(review): [:-2] drops the last two rows -- the reason is not recorded
# in this notebook; confirm it is still wanted.
original = df_docs['original'].to_list()[:-2]
simplified = df_docs['simplified'].to_list()[:-2]

# Explicit encoding so the files do not depend on the platform default.
with open('/content/drive/MyDrive/Colab Notebooks/data/original.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(original) + '\n')
with open('/content/drive/MyDrive/Colab Notebooks/data/simplified.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(simplified) + '\n')

In [None]:
# Score the exported simplifications with EASSE on a custom test set.
# NOTE(review): the reference path is the same file as the original-sentence
# path, so the originals double as references -- confirm this is intended.
!easse evaluate -m 'sari,fkgl,sent_bleu,bertscore' -t custom \
--orig_sents_path '/content/drive/MyDrive/Colab Notebooks/data/original.txt' \
--sys_sents_path '/content/drive/MyDrive/Colab Notebooks/data/simplified.txt' \
--refs_sents_paths '/content/drive/MyDrive/Colab Notebooks/data/original.txt' 

Downloading: 100% 482/482 [00:00<00:00, 393kB/s]
Downloading: 100% 878k/878k [00:00<00:00, 7.86MB/s]
Downloading: 100% 446k/446k [00:00<00:00, 4.49MB/s]
Downloading: 100% 1.29M/1.29M [00:00<00:00, 9.76MB/s]
Downloading: 100% 1.33G/1.33G [00:26<00:00, 53.2MB/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
{'sent_bleu': 27.908, 'sari': 17.45