In [3]:
from easse.sari import corpus_sari, get_corpus_sari_operation_scores
from sacrebleu import corpus_bleu
import pandas as pd
import numpy as np
from datasets import load_dataset

In [21]:
def calc_bleu_sari(original, sentences, references, tokenizer='13a'):
    """Compute corpus-level BLEU and SARI when examples have differing numbers of references.

    Examples are grouped by how many references they carry, each group is
    scored on its own with ``corpus_bleu`` / ``corpus_sari``, and the group
    scores are averaged with weights proportional to group size.

    Args:
        original: Source sentences, one per example.
        sentences: System outputs, aligned with ``original``.
        references: For each example, a non-empty list of reference strings.
            The list length may vary between examples.
        tokenizer: Tokenizer name, forwarded to ``corpus_bleu`` as
            ``tokenize`` and to ``corpus_sari`` as ``tokenizer``.

    Returns:
        Tuple ``(bleu, sari)`` of weighted-average scores.

    Raises:
        AssertionError: If the three inputs do not have the same length.
        ValueError: If there are no examples, or an example has no references.
    """
    # Same exception type the previous max() over an empty list produced,
    # but with a message that says what is wrong.
    if len(references) == 0:
        raise ValueError("calc_bleu_sari needs at least one example")

    assert len(original) == len(sentences)
    assert len(sentences) == len(references)

    ref_counts = [len(refs) for refs in references]
    if min(ref_counts) == 0:
        # A count of 0 would index bucket -1, i.e. silently join the
        # largest-reference bucket and make it ragged.
        raise ValueError(
            "example %d has no references; every example needs at least one"
            % ref_counts.index(0)
        )
    num_refs = max(ref_counts)

    bleu_scores = np.zeros(num_refs)
    sari_scores = np.zeros(num_refs)

    # examples[k] collects the examples that have exactly k + 1 references.
    examples = [{"original": [], "sentences": [], "references": []} for _ in range(num_refs)]

    # Distinct loop names: the parameters are no longer shadowed mid-iteration.
    for source, refs, sentence in zip(original, references, sentences):
        bucket = examples[len(refs) - 1]
        bucket['original'].append(source)
        bucket['sentences'].append(sentence)
        bucket['references'].append(refs)

    counts = np.array([len(bucket['original']) for bucket in examples])
    weights = counts / counts.sum()

    for i, bucket in enumerate(examples):
        if counts[i] > 0:
            # Transpose per-example reference lists into per-reference streams
            # (stream j holds the j-th reference of every example in the bucket).
            ref_streams = [list(stream) for stream in zip(*bucket['references'])]
            bleu_scores[i] = corpus_bleu(
                                bucket['sentences'],
                                ref_streams,
                                force = True,
                                tokenize = tokenizer,
                                lowercase = True
                            ).score
            sari_scores[i] = corpus_sari(
                                orig_sents = bucket['original'],
                                sys_sents = bucket['sentences'],
                                refs_sents = ref_streams,
                                tokenizer=tokenizer
                            )

    # Empty buckets keep score 0 and weight 0, so they do not contribute.
    bleu = np.dot(bleu_scores, weights)
    sari = np.dot(sari_scores, weights)

    return bleu, sari

In [5]:
def to_words(text):
    """Split ``text`` into words on runs of whitespace."""
    return text.split()

def truncate(sentence, ratio=0.8, terminator='.'):
    """Truncation baseline: keep the leading share of words and close with a terminator.

    Args:
        sentence: Input sentence; words are whatever ``to_words`` returns.
        ratio: Fraction of words to keep, in ``[0, 1]``. The number kept is
            ``int(len(words) * ratio)``, i.e. rounded down.
        terminator: String appended after the kept words.

    Returns:
        The kept words joined by single spaces, followed by ``terminator``.
        When no words are kept the result is just ``terminator``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        # A negative ratio would turn the slice below into "drop from the end".
        raise ValueError("ratio must be between 0 and 1, got %r" % (ratio,))
    words = to_words(sentence)
    keep = int(len(words) * ratio)
    return ' '.join(words[:keep]) + terminator

In [4]:
## MODIFY THIS: name of the dataset configuration to evaluate (passed as `name=` to load_dataset)
dataset = "TSSlovene"

In [5]:
# Load the configuration selected by `dataset` through the local loading script.
raw_datasets = load_dataset("./MultilingualSimplification.py", name=dataset)

# Keep a handle on the test split; the baseline cells in this notebook score it.
test_set = raw_datasets["test"]

Downloading and preparing dataset multilingual_simplification/TSSlovene to /Users/michaelryan/.cache/huggingface/datasets/multilingual_simplification/TSSlovene/1.0.0/1ff38e5f95caa94278642e100d418265055dbe25b26e4bff4eb2be51de59a924...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset multilingual_simplification downloaded and prepared to /Users/michaelryan/.cache/huggingface/datasets/multilingual_simplification/TSSlovene/1.0.0/1ff38e5f95caa94278642e100d418265055dbe25b26e4bff4eb2be51de59a924. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Score two trivial baselines on the loaded test split:
#   IDENTITY - the source sentence is returned unchanged
#   TRUNCATE - the source sentence is shortened by truncate()
originals = test_set["original"]
references = [ex['simplifications'] for ex in test_set["simple"]]

for position, (label, simplify) in enumerate([("IDENTITY", lambda text: text), ("TRUNCATE", truncate)]):
    if position > 0:
        # Blank line between the two reports, none after the last one.
        print()
    print(label)
    outputs = [simplify(sentence) for sentence in originals]
    bleu, sari = calc_bleu_sari(originals, outputs, references)
    print("BLEU", bleu)
    print("SARI", sari)

IDENTITY
BLEU 7.761789172194769
SARI 5.934345427797628

TRUNCATE
BLEU 6.085824291507021
SARI 19.027115202574883


In [25]:
dataset = "EasyJapaneseExtended"

raw_datasets = load_dataset("./MultilingualSimplification.py", name=dataset)

test_set = raw_datasets["test"]

# Same two baselines as for the other corpora, but TRUNCATE cuts on characters
# (first 80% of the string) and closes with the Japanese full stop; both
# metrics use the 'intl' tokenizer.
originals = test_set["original"]
references = [ex['simplifications'] for ex in test_set["simple"]]

def _clip_characters(text):
    return text[: int(len(text) * 0.8)] + "。"

for position, (label, simplify) in enumerate([("IDENTITY", lambda text: text), ("TRUNCATE", _clip_characters)]):
    if position > 0:
        # Blank line between the two reports, none after the last one.
        print()
    print(label)
    outputs = [simplify(sentence) for sentence in originals]
    bleu, sari = calc_bleu_sari(originals, outputs, references, tokenizer='intl')
    print("BLEU", bleu)
    print("SARI", sari)

Downloading and preparing dataset multilingual_simplification/EasyJapaneseExtended to /Users/michaelryan/.cache/huggingface/datasets/multilingual_simplification/EasyJapaneseExtended/1.0.0/1ff38e5f95caa94278642e100d418265055dbe25b26e4bff4eb2be51de59a924...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset multilingual_simplification downloaded and prepared to /Users/michaelryan/.cache/huggingface/datasets/multilingual_simplification/EasyJapaneseExtended/1.0.0/1ff38e5f95caa94278642e100d418265055dbe25b26e4bff4eb2be51de59a924. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

IDENTITY
BLEU 20.230626941116604
SARI 9.000209549720857

TRUNCATE
BLEU 8.812058613684762
SARI 43.8435071245527


In [6]:
# Dataset names, one per line so additions and removals diff cleanly.
datasets = [
    "NewselaEN",
    "WikiAutoEN",
    "ASSET",
    "Simplext",
    "NewselaES",
    "Terence",
    "Teacher",
    "SimpitikiWiki",
    "AdminIt",
    "PaCCSS-IT",
    "CLEAR",
    "WikiLargeFR",
    "EasyJapanese",
    "EasyJapaneseExtended",
    "PorSimples",
    "TextComplexityDE",
    "GEOLinoTest",
    "GermanNews",
    "CBST",
    "DSim",
    "SimplifyUR",
    "RuWikiLarge",
    "RSSE",
    "RuAdaptLit",
    "RuAdaptFairytales",
    "RuAdaptEncy",
]

In [7]:
def calc_bleu_sari_parts(df_ref, sentences, tokenizer='13a'):
    """Compute BLEU and the three SARI operation scores against a reference table.

    Rows are grouped by how many references they actually have, each group is
    scored on its own, and the group scores are averaged with weights
    proportional to group size.

    Args:
        df_ref: DataFrame with an ``'original'`` column holding the source
            sentence; every other column is treated as a reference column.
            Float cells (the NaN placeholder pandas uses for missing text)
            are skipped, so rows may have fewer references than columns.
        sentences: System outputs, one per row of ``df_ref``. Entries that
            are not strings are scored as the empty string.
        tokenizer: Tokenizer name, forwarded to ``corpus_bleu`` as
            ``tokenize`` and to ``get_corpus_sari_operation_scores`` as
            ``tokenizer``.

    Returns:
        Tuple ``(bleu, sari_add, sari_keep, sari_del)`` of weighted averages.

    Raises:
        AssertionError: If ``sentences`` and ``df_ref`` differ in length.
        ValueError: If ``df_ref`` has no rows, or a row has no usable reference.
    """
    # Upper bound on references per row: all columns except 'original'.
    num_refs = df_ref.shape[1]-1

    bleu_scores = np.zeros(num_refs)
    sari_keep_scores = np.zeros(num_refs)
    sari_add_scores = np.zeros(num_refs)
    sari_del_scores = np.zeros(num_refs)

    # Missing outputs (e.g. NaN read from a CSV) are scored as empty strings.
    sentences = [sent if isinstance(sent, str) else "" for sent in sentences]

    # examples[k] collects the rows that have exactly k + 1 references.
    examples = [{"original": [], "sentences": [], "references": []} for _ in range(num_refs)]

    assert df_ref.shape[0] == len(sentences)

    if df_ref.shape[0] == 0:
        # Without this the group weights below would be 0/0.
        raise ValueError("calc_bleu_sari_parts needs at least one row")

    for (index, row), sentence in zip(df_ref.iterrows(), sentences):
        ref_list = [
            value for col, value in row.items()
            if col != 'original' and not isinstance(value, float)
        ]
        if not ref_list:
            # A count of 0 would index bucket -1, i.e. silently join the
            # largest-reference bucket and make it ragged.
            raise ValueError("row %r has no references" % (index,))
        bucket = examples[len(ref_list) - 1]
        bucket['original'].append(row['original'])
        bucket['sentences'].append(sentence)
        bucket['references'].append(ref_list)

    counts = np.array([len(bucket['original']) for bucket in examples])
    weights = counts / counts.sum()

    for i, bucket in enumerate(examples):
        if counts[i] > 0:
            # Transpose per-row reference lists into per-reference streams
            # (stream j holds the j-th reference of every row in the bucket).
            ref_streams = [list(stream) for stream in zip(*bucket['references'])]
            bleu_scores[i] = corpus_bleu(
                                bucket['sentences'],
                                ref_streams,
                                force = True,
                                tokenize = tokenizer,
                                lowercase = True
                            ).score
            sari_add_scores[i], sari_keep_scores[i], sari_del_scores[i] = get_corpus_sari_operation_scores(
                                orig_sents = bucket['original'],
                                sys_sents = bucket['sentences'],
                                refs_sents = ref_streams,
                                tokenizer = tokenizer
                            )

    # Empty buckets keep score 0 and weight 0, so they do not contribute.
    bleu = np.dot(bleu_scores, weights)
    sari_add = np.dot(sari_add_scores, weights)
    sari_keep = np.dot(sari_keep_scores, weights)
    sari_del = np.dot(sari_del_scores, weights)

    return bleu, sari_add, sari_keep, sari_del

In [12]:
# Registry of subcorpora: name -> {"path": CSV path prefix, "language": code}.
# Kept as a compact (name, path, language) table; insertion order is the
# iteration order of the resulting dict.
subcorpora = {
    name: {"path": path, "language": language}
    for name, path, language in (
        ("NewselaEN", "./data/English/Newsela EN", "en"),
        ("WikiAutoEN", "./data/English/WikiAuto", "en"),
        ("ASSET", "./data/English/ASSET", "en"),
        ("Simplext", "./data/Spanish/Simplext", "es"),
        ("NewselaES", "./data/Spanish/Newsela ES", "es"),
        ("Terence", "./data/Italian/Terence", "it"),
        ("Teacher", "./data/Italian/Teacher", "it"),
        ("SimpitikiWiki", "./data/Italian/Simpitiki Italian Wikipedia", "it"),
        ("AdminIt", "./data/Italian/AdminIT", "it"),
        ("PaCCSS-IT", "./data/Italian/PaCCSS-IT Corpus", "it"),
        ("CLEAR", "./data/French/CLEAR Corpus", "fr"),
        ("WikiLargeFR", "./data/French/WikiLargeFR Corpus", "fr"),
        ("EasyJapanese", "./data/Japanese/Easy Japanese Corpus", "ja"),
        ("EasyJapaneseExtended", "./data/Japanese/Easy Japanese Extended", "ja"),
        ("PorSimples", "./data/Brazilian Portuguese/PorSimples", "pt-br"),
        ("TextComplexityDE", "./data/German/TextComplexityDE Parallel Corpus", "de"),
        ("GEOLinoTest", "./data/German/GEOLino Corpus", "de"),
        ("GermanNews", "./data/German/German News", "de"),
        ("CBST", "./data/Basque/CBST", "eu"),
        ("DSim", "./data/Danish/DSim Corpus", "da"),
        ("SimplifyUR", "./data/Urdu/SimplifyUR", "ur"),
        ("RuWikiLarge", "./data/Russian/RuWikiLarge", "ru"),
        ("RSSE", "./data/Russian/RSSE Corpus", "ru"),
        ("RuAdaptLit", "./data/Russian/RuAdapt Literature", "ru"),
        ("RuAdaptFairytales", "./data/Russian/RuAdapt Fairytales", "ru"),
        ("RuAdaptEncy", "./data/Russian/RuAdapt Ency", "ru"),
        ("TSSlovene", "./data/Slovene/Text Simplification Slovene", "sl"),
    )
}

In [14]:
# Score the IDENTITY and TRUNCATE baselines on every subcorpus' test CSV and
# print the SARI add / keep / delete components as comma-separated rows.
for dataset in subcorpora:
    ref_df = pd.read_csv(subcorpora[dataset]["path"] + "_test.csv")
    originals = ref_df['original'].to_list()

    is_japanese = "Japanese" in dataset
    tokenizer = 'intl' if is_japanese else '13a'

    if is_japanese:
        # truncate() cuts on whitespace-separated words; text without spaces
        # is a single "word" there and would collapse to just ".". Cut on
        # characters instead and close with the Japanese full stop, the same
        # rule this notebook applies to the Japanese corpora elsewhere.
        truncated = [sentence[: int(len(sentence) * 0.8)] + "。" for sentence in originals]
    else:
        truncated = [truncate(sentence) for sentence in originals]

    for label, candidates in (("IDENTITY", originals), ("TRUNCATE", truncated)):
        print(dataset)
        print(label)
        scores = calc_bleu_sari_parts(ref_df, candidates, tokenizer=tokenizer)
        # scores is (bleu, sari_add, sari_keep, sari_del); BLEU is not printed here.
        print(dataset + "," + str(scores[1]) + "," + str(scores[2]) + "," + str(scores[3]))
        print()

NewselaEN
IDENTITY
NewselaEN,0.0,78.511591740056,0.0

NewselaEN
TRUNCATE
NewselaEN,0.3890134752767037,68.63954316734663,29.658434033531325

WikiAutoEN
IDENTITY
WikiAutoEN,0.0,62.78409703329144,0.0

WikiAutoEN
TRUNCATE
WikiAutoEN,0.17481053218016177,57.866935305343645,36.32123901972068

ASSET
IDENTITY
ASSET,0.0,62.201479040615006,0.0

ASSET
TRUNCATE
ASSET,0.21134294555943267,55.04221791958657,33.735078737222885

Simplext
IDENTITY
Simplext,0.0,23.83033499752721,0.0

Simplext
TRUNCATE
Simplext,0.5757725280948548,23.986912417790066,36.261247733672576

NewselaES
IDENTITY
NewselaES,0.0,72.63455558970311,0.0

NewselaES
TRUNCATE
NewselaES,0.12594589496927475,64.07410927209675,30.716482047510794

Terence
IDENTITY
Terence,0.0,80.50374388524759,0.0

Terence
TRUNCATE
Terence,0.40816326530612246,69.31341402250749,28.741627391748743

Teacher
IDENTITY
Teacher,0.0,52.22341206564713,0.0

Teacher
TRUNCATE
Teacher,0.48543689320388345,47.46083296920002,35.29770927364417

SimpitikiWiki
IDENTITY
SimpitikiWi

In [18]:
# TRUNCATE baseline for the Japanese corpora, cutting on characters (first 80%
# of the string) and closing with the Japanese full stop.
for dataset in ["EasyJapanese", "EasyJapaneseExtended"]:
    csv_path = subcorpora[dataset]["path"] + "_test.csv"
    ref_df = pd.read_csv(csv_path)
    print(dataset)
    print("TRUNCATE")
    clipped = [text[: int(len(text) * 0.8)] + "。" for text in ref_df['original'].to_list()]
    tok = 'intl' if "Japanese" in dataset else '13a'
    scores = calc_bleu_sari_parts(ref_df, clipped, tokenizer=tok)
    # scores is (bleu, sari_add, sari_keep, sari_del); only the SARI parts are printed.
    print(",".join([dataset, str(scores[1]), str(scores[2]), str(scores[3])]))
    print()

EasyJapanese
TRUNCATE
EasyJapanese,0.0,31.123073415945424,64.40100348204848

EasyJapaneseExtended
TRUNCATE
EasyJapaneseExtended,0.0,35.93346590149011,95.597055472168

