# Benchmark classical approaches

In [1]:
import os
import sys
import nltk
from tqdm import trange
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
sys.path.append(os.path.join(os.getcwd(), "..", ".."))
from src.preprocessing.corpus_preprocessor import CorpusPreprocessor
from src.preprocessing.functions import *
from src.preprocessing.consts import EMAIL_REGEX
from src.ds_loaders.xsum import XSumLoader
from src.metrics import BLEU, ROUGE
from src.model import *

[nltk_data] Downloading package punkt to /home/pasha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pasha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pasha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/pasha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
2022-05-26 15:45:11.862237: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


DEBUG: Loaded GloVe embedings from /home/pasha/Documents/ML/ML_project_summarization/src/embedings/../../data/glove/glove.6B.100d.txt


## Load dataset

In [2]:
# Build the XSum loader and have it load the dataset; the following cell
# reads the documents and summaries off this object (X_val / y_val).
loader = XSumLoader()
loader.load()

Using custom data configuration default
Reusing dataset xsum (/home/pasha/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Raw texts used for the benchmark (presumably the validation split, going by
# the X_val / y_val attribute names -- confirm against XSumLoader).
orig_documents = loader.X_val
orig_summaries = loader.y_val

## Preprocess dataset

In [4]:
# preprocessor = CorpusPreprocessor(verbose=False).\
#     add(to_lower()).\
#     add(expand_contractions()).\
#     add(replace_by_regex([(EMAIL_REGEX, "EMAIL")])).\
#     add(remove_by_regex([r"\d"])).\
#     add(remove_symbols(string.punctuation.replace('.', ''))).\
#     add(lemmatize())
#     # add(remove_words(set(stopwords.words('english'))))


In [5]:
# documents = []
# summaries = []
# for idx in trange(len(orig_documents)):
#     doc = orig_documents[idx]
#     summ = orig_summaries[idx]
#     if len(doc) == 0 or len(summ) == 0:
#         continue
#     documents.append(preprocessor.transform(doc))
#     summaries.append(preprocessor.transform(summ))

## Create models, metrics, and preprocessors

In [21]:
# 1. models
# Every summarizer receives the same `summary_size` argument.
summary_size = 1

# Instantiate in a fixed order, then expose each instance under its own name
# so later cells can pick individual models.
models = [
    model_cls(summary_size)
    for model_cls in (
        TextRankModel,
        LexRankModel,
        LsaModel,
        LuhnModel,
        KLModel,
        MyTextRankModel,
    )
]
TextRank, LexRank, Lsa, Luhn, KL, MyTextRank = models

DEBUG: Loaded GloVe embedings from /home/pasha/Documents/ML/ML_project_summarization/src/embedings/../../data/glove/glove.6B.100d.txt


In [22]:
# 2. metrics
# BLEU(n) for n = 1..4 (n is presumably the n-gram order -- confirm in src.metrics).
bleu1, bleu2, bleu3, bleu4 = (BLEU(n) for n in (1, 2, 3, 4))

# ROUGE variants, first scored by "fmeasure", then by "recall".
rouge1_f1, rouge2_f1, rougeL_f1 = (
    ROUGE(variant, "fmeasure") for variant in ("rouge1", "rouge2", "rougeL")
)
rouge1_r, rouge2_r, rougeL_r = (
    ROUGE(variant, "recall") for variant in ("rouge1", "rouge2", "rougeL")
)

# Evaluation order used by the benchmark loop.
metrics = [
    bleu1, bleu2, bleu3, bleu4,
    rouge1_f1, rouge2_f1, rougeL_f1,
    rouge1_r, rouge2_r, rougeL_r,
]

In [23]:
# 3. preprocessors
# Four pipelines, from the longest chain of steps ("strong") down to an empty
# one ("nothing"). Each step is a callable produced by a helper from
# src.preprocessing.functions; '.' is kept out of the removed symbols.
# NOTE(review): `string` is never imported explicitly in this notebook;
# presumably it arrives through one of the star imports -- confirm.
preprocessor_strong = (
    CorpusPreprocessor(name="strong")
    .add(to_lower())
    .add(expand_contractions())
    .add(replace_by_regex([(EMAIL_REGEX, "EMAIL")]))
    .add(remove_by_regex([r"\d"]))
    .add(remove_symbols(string.punctuation.replace('.', '')))
    .add(lemmatize())
    .add(remove_words(set(stopwords.words('english'))))
)

# Same as "strong" minus the last two steps (lemmatize, remove_words).
preprocessor_mid = (
    CorpusPreprocessor(name="mid")
    .add(to_lower())
    .add(expand_contractions())
    .add(replace_by_regex([(EMAIL_REGEX, "EMAIL")]))
    .add(remove_by_regex([r"\d"]))
    .add(remove_symbols(string.punctuation.replace('.', '')))
)

# Only to_lower, the EMAIL replacement and symbol removal.
preprocessor_simple = (
    CorpusPreprocessor(name="simple")
    .add(to_lower())
    .add(replace_by_regex([(EMAIL_REGEX, "EMAIL")]))
    .add(remove_symbols(string.punctuation.replace('.', '')))
)

# No steps added at all.
preprocessor_no = CorpusPreprocessor(name="nothing")

preprocessors = [
    preprocessor_strong,
    preprocessor_mid,
    preprocessor_simple,
    preprocessor_no,
]

In [24]:
# Shared accumulator for benchmark scores, nested as
# result[str(preprocessor)][str(model)][str(metric)] -> value.
# Later cells read this dict directly, so it stays at module level.
result = {}
def evaluate_models(preprocessors,
                    models,
                    metrics,
                    documents,
                    summaries):
    """Score every model under every preprocessor with every metric.

    For each preprocessor, ``documents`` and ``summaries`` are transformed
    once; each model is then fitted on the transformed documents and asked
    to predict from those same documents. Each metric is reset, updated with
    ``(predictions, transformed summaries)`` and its result stored.

    Scores are written into the module-level ``result`` dict. Entries from
    earlier calls are kept: evaluating a different set of models later adds
    to the existing preprocessor entry instead of discarding it, while
    re-evaluating the same model replaces only that model's scores. Call
    ``result.clear()`` first to start from scratch.

    Args:
        preprocessors: objects providing ``transform(texts)``; ``str()`` of
            each is used as the first-level key.
        models: objects providing ``fit(docs)`` and ``predict(docs)``;
            ``str()`` of each is used as the second-level key.
        metrics: objects providing ``reset_state()``,
            ``update_state(pred, reference)`` and ``result()``; ``str()`` of
            each is used as the third-level key.
        documents: source texts handed to ``transform``.
        summaries: reference summaries handed to ``transform``.

    Returns:
        The module-level ``result`` dict (the same object on every call).
    """
    n_jobs = len(models) * len(preprocessors)
    n_done = 0
    for prep in preprocessors:
        # setdefault (rather than assigning a fresh dict) keeps scores that a
        # previous call stored for other models under this preprocessor.
        prep_scores = result.setdefault(str(prep), {})
        print(f"Starting with preprocessor {prep}")
        prep_docs = prep.transform(documents)
        prep_sum = prep.transform(summaries)
        for model in models:
            # A model's own scores are always rebuilt from scratch so a
            # re-run never mixes old and new metric values.
            model_scores = {}
            prep_scores[str(model)] = model_scores
            print(f"    Model: {model}")
            model.fit(prep_docs)
            pred = model.predict(prep_docs)
            for m_idx, metric in enumerate(metrics, start=1):
                print(f"      {metric}... ", end="")
                # Metric objects are reused across models, so clear any state
                # left from the previous model before updating.
                metric.reset_state()
                metric.update_state(pred, prep_sum)
                value = metric.result()
                model_scores[str(metric)] = value
                print(f"{value} - {m_idx} / {len(metrics)}")
            print()
            n_done += 1
            print(f"Done: {n_done}/{n_jobs}")
    return result

In [26]:
# This run leaves MyTextRank out of the model list that was built earlier.
models = [TextRank, LexRank, Lsa, Luhn, KL]

# The call is the cell's last expression, so the returned dict is displayed.
evaluate_models(
    preprocessors=preprocessors,
    models=models,
    metrics=metrics,
    documents=orig_documents,
    summaries=orig_summaries,
)

Starting with preprocessor CourpusPreprocessor(strong)
    Model: TextRank


100%|██████████| 11332/11332 [01:11<00:00, 159.57it/s]


      BLEU-1... 0.09680720881076871 - 1 / 10
      BLEU-2... 0.03845039955288756 - 2 / 10
      BLEU-3... 0.015479914618198075 - 3 / 10
      BLEU-4... 0.006538746856412794 - 4 / 10
      ROUGE-1-fmeasure... 0.13007354789886896 - 5 / 10
      ROUGE-2-fmeasure... 0.0208842889198962 - 6 / 10
      ROUGE-L-fmeasure... 0.10526554573311184 - 7 / 10
      ROUGE-1-recall... 0.16140673311994297 - 8 / 10
      ROUGE-2-recall... 0.02659906348806116 - 9 / 10
      ROUGE-L-recall... 0.1306985926412772 - 10 / 10

Done: 1/20
    Model: LexRank


100%|██████████| 11332/11332 [01:15<00:00, 149.35it/s]


      BLEU-1... 0.10481592471646123 - 1 / 10
      BLEU-2... 0.039742980396033 - 2 / 10
      BLEU-3... 0.015753875527716815 - 3 / 10
      BLEU-4... 0.0065695067986235355 - 4 / 10
      ROUGE-1-fmeasure... 0.11485304673723976 - 5 / 10
      ROUGE-2-fmeasure... 0.01652320625328094 - 6 / 10
      ROUGE-L-fmeasure... 0.09676674416946662 - 7 / 10
      ROUGE-1-recall... 0.1174883190936935 - 8 / 10
      ROUGE-2-recall... 0.016932073694746697 - 9 / 10
      ROUGE-L-recall... 0.09850349639780925 - 10 / 10

Done: 2/20
    Model: Lsa


100%|██████████| 11332/11332 [01:08<00:00, 164.28it/s]


      BLEU-1... 0.0803511984021305 - 1 / 10
      BLEU-2... 0.028651601505534207 - 2 / 10
      BLEU-3... 0.010071892381747327 - 3 / 10
      BLEU-4... 0.0037947476650204777 - 4 / 10
      ROUGE-1-fmeasure... 0.10476471593510817 - 5 / 10
      ROUGE-2-fmeasure... 0.013693276602089124 - 6 / 10
      ROUGE-L-fmeasure... 0.08563711713453832 - 7 / 10
      ROUGE-1-recall... 0.12911907538120887 - 8 / 10
      ROUGE-2-recall... 0.01728573506100953 - 9 / 10
      ROUGE-L-recall... 0.10572051131163052 - 10 / 10

Done: 3/20
    Model: Luhn


100%|██████████| 11332/11332 [00:33<00:00, 335.49it/s]


      BLEU-1... 0.0997217117353636 - 1 / 10
      BLEU-2... 0.038942402099813006 - 2 / 10
      BLEU-3... 0.01567320532235933 - 3 / 10
      BLEU-4... 0.006576033162646193 - 4 / 10
      ROUGE-1-fmeasure... 0.12341915771513523 - 5 / 10
      ROUGE-2-fmeasure... 0.018879409441528702 - 6 / 10
      ROUGE-L-fmeasure... 0.10148815146153342 - 7 / 10
      ROUGE-1-recall... 0.14095261885928628 - 8 / 10
      ROUGE-2-recall... 0.022271970583523995 - 9 / 10
      ROUGE-L-recall... 0.11565436624043832 - 10 / 10

Done: 4/20
    Model: KL


100%|██████████| 11332/11332 [04:02<00:00, 46.64it/s] 


      BLEU-1... 0.09326026604450956 - 1 / 10
      BLEU-2... 0.036853767867407 - 2 / 10
      BLEU-3... 0.014707626668989598 - 3 / 10
      BLEU-4... 0.006128522855635166 - 4 / 10
      ROUGE-1-fmeasure... 0.1240853831154804 - 5 / 10
      ROUGE-2-fmeasure... 0.01995849666908742 - 6 / 10
      ROUGE-L-fmeasure... 0.101089304103051 - 7 / 10
      ROUGE-1-recall... 0.15153945577721745 - 8 / 10
      ROUGE-2-recall... 0.024687617587305297 - 9 / 10
      ROUGE-L-recall... 0.1236166937011883 - 10 / 10

Done: 5/20
Starting with preprocessor CourpusPreprocessor(mid)
    Model: TextRank


100%|██████████| 11332/11332 [02:03<00:00, 91.99it/s] 


      BLEU-1... 0.13328274214271224 - 1 / 10
      BLEU-2... 0.048656300728191486 - 2 / 10
      BLEU-3... 0.021247500803551932 - 3 / 10
      BLEU-4... 0.010825413380285477 - 4 / 10
      ROUGE-1-fmeasure... 0.1737770173102477 - 5 / 10
      ROUGE-2-fmeasure... 0.02392013734959183 - 6 / 10
      ROUGE-L-fmeasure... 0.12700778032626237 - 7 / 10
      ROUGE-1-recall... 0.23227950561239766 - 8 / 10
      ROUGE-2-recall... 0.032485066915468486 - 9 / 10
      ROUGE-L-recall... 0.16973331484158827 - 10 / 10

Done: 6/20
    Model: LexRank


100%|██████████| 11332/11332 [01:50<00:00, 102.28it/s]


      BLEU-1... 0.15236908696095924 - 1 / 10
      BLEU-2... 0.05438131186722801 - 2 / 10
      BLEU-3... 0.023584992852121775 - 3 / 10
      BLEU-4... 0.011834011833242775 - 4 / 10
      ROUGE-1-fmeasure... 0.16992625547062468 - 5 / 10
      ROUGE-2-fmeasure... 0.02231125079419743 - 6 / 10
      ROUGE-L-fmeasure... 0.12551699124799914 - 7 / 10
      ROUGE-1-recall... 0.19111664879870754 - 8 / 10
      ROUGE-2-recall... 0.02541395858745056 - 9 / 10
      ROUGE-L-recall... 0.14060801185951188 - 10 / 10

Done: 7/20
    Model: Lsa


100%|██████████| 11332/11332 [01:14<00:00, 151.31it/s]


      BLEU-1... 0.14039013378059348 - 1 / 10
      BLEU-2... 0.04377713219596531 - 2 / 10
      BLEU-3... 0.017570133446291658 - 3 / 10
      BLEU-4... 0.008184124459291519 - 4 / 10
      ROUGE-1-fmeasure... 0.15006566579382366 - 5 / 10
      ROUGE-2-fmeasure... 0.015368106633386376 - 6 / 10
      ROUGE-L-fmeasure... 0.1095604887508266 - 7 / 10
      ROUGE-1-recall... 0.15805597935565485 - 8 / 10
      ROUGE-2-recall... 0.01632486938573582 - 9 / 10
      ROUGE-L-recall... 0.11519459613044042 - 10 / 10

Done: 8/20
    Model: Luhn


100%|██████████| 11332/11332 [00:40<00:00, 278.28it/s]


      BLEU-1... 0.13778710699619526 - 1 / 10
      BLEU-2... 0.050415753104553325 - 2 / 10
      BLEU-3... 0.0221507169304994 - 3 / 10
      BLEU-4... 0.011323545356783584 - 4 / 10
      ROUGE-1-fmeasure... 0.1748245382899577 - 5 / 10
      ROUGE-2-fmeasure... 0.024308206532682262 - 6 / 10
      ROUGE-L-fmeasure... 0.12669470576389383 - 7 / 10
      ROUGE-1-recall... 0.22384845187665797 - 8 / 10
      ROUGE-2-recall... 0.031609504519659284 - 9 / 10
      ROUGE-L-recall... 0.16215418239666754 - 10 / 10

Done: 9/20
    Model: KL


100%|██████████| 11332/11332 [05:13<00:00, 36.12it/s] 


      BLEU-1... 0.14889495406736786 - 1 / 10
      BLEU-2... 0.05524890100262196 - 2 / 10
      BLEU-3... 0.024375345550722648 - 3 / 10
      BLEU-4... 0.012407283399724722 - 4 / 10
      ROUGE-1-fmeasure... 0.1732041720417614 - 5 / 10
      ROUGE-2-fmeasure... 0.02486516104461113 - 6 / 10
      ROUGE-L-fmeasure... 0.1304430548450467 - 7 / 10
      ROUGE-1-recall... 0.19718918428801246 - 8 / 10
      ROUGE-2-recall... 0.028462736257747704 - 9 / 10
      ROUGE-L-recall... 0.14876444060374563 - 10 / 10

Done: 10/20
Starting with preprocessor CourpusPreprocessor(simple)
    Model: TextRank


100%|██████████| 11332/11332 [01:56<00:00, 97.43it/s] 


      BLEU-1... 0.12713525496893147 - 1 / 10
      BLEU-2... 0.046807890995236065 - 2 / 10
      BLEU-3... 0.020437044028562175 - 3 / 10
      BLEU-4... 0.010410460704587378 - 4 / 10
      ROUGE-1-fmeasure... 0.1718748901520785 - 5 / 10
      ROUGE-2-fmeasure... 0.024049386921585918 - 6 / 10
      ROUGE-L-fmeasure... 0.12429937512675897 - 7 / 10
      ROUGE-1-recall... 0.24443967980315942 - 8 / 10
      ROUGE-2-recall... 0.03491945725217724 - 9 / 10
      ROUGE-L-recall... 0.17655753528189244 - 10 / 10

Done: 11/20
    Model: LexRank


100%|██████████| 11332/11332 [01:46<00:00, 106.74it/s]


      BLEU-1... 0.1446459123885262 - 1 / 10
      BLEU-2... 0.051181758416581576 - 2 / 10
      BLEU-3... 0.02175756883922734 - 3 / 10
      BLEU-4... 0.010758155732988413 - 4 / 10
      ROUGE-1-fmeasure... 0.16775743787487649 - 5 / 10
      ROUGE-2-fmeasure... 0.02155647351941953 - 6 / 10
      ROUGE-L-fmeasure... 0.12319120247233245 - 7 / 10
      ROUGE-1-recall... 0.1987319579968772 - 8 / 10
      ROUGE-2-recall... 0.026145080225413786 - 9 / 10
      ROUGE-L-recall... 0.14514186373492843 - 10 / 10

Done: 12/20
    Model: Lsa


100%|██████████| 11332/11332 [01:09<00:00, 162.13it/s]


      BLEU-1... 0.13650967256818647 - 1 / 10
      BLEU-2... 0.042758517501609104 - 2 / 10
      BLEU-3... 0.01721009033327554 - 3 / 10
      BLEU-4... 0.008113824126423549 - 4 / 10
      ROUGE-1-fmeasure... 0.1480195374627037 - 5 / 10
      ROUGE-2-fmeasure... 0.015266478905297552 - 6 / 10
      ROUGE-L-fmeasure... 0.10780956572757572 - 7 / 10
      ROUGE-1-recall... 0.15933852233660672 - 8 / 10
      ROUGE-2-recall... 0.016605423235127027 - 9 / 10
      ROUGE-L-recall... 0.11568566884625367 - 10 / 10

Done: 13/20
    Model: Luhn


100%|██████████| 11332/11332 [00:39<00:00, 288.81it/s]


      BLEU-1... 0.13150909165976526 - 1 / 10
      BLEU-2... 0.04849586120455783 - 2 / 10
      BLEU-3... 0.021305438845896273 - 3 / 10
      BLEU-4... 0.010893104475261386 - 4 / 10
      ROUGE-1-fmeasure... 0.1730207103891796 - 5 / 10
      ROUGE-2-fmeasure... 0.02424853559157407 - 6 / 10
      ROUGE-L-fmeasure... 0.12427738350170178 - 7 / 10
      ROUGE-1-recall... 0.23493828030994746 - 8 / 10
      ROUGE-2-recall... 0.03367059997950754 - 9 / 10
      ROUGE-L-recall... 0.16850376864381789 - 10 / 10

Done: 14/20
    Model: KL


100%|██████████| 11332/11332 [04:45<00:00, 39.66it/s] 


      BLEU-1... 0.14271443420241867 - 1 / 10
      BLEU-2... 0.052853163379008236 - 2 / 10
      BLEU-3... 0.023187574706823977 - 3 / 10
      BLEU-4... 0.011820621501152343 - 4 / 10
      ROUGE-1-fmeasure... 0.17030092522439033 - 5 / 10
      ROUGE-2-fmeasure... 0.024364579257659554 - 6 / 10
      ROUGE-L-fmeasure... 0.1278864343152024 - 7 / 10
      ROUGE-1-recall... 0.20010863222693276 - 8 / 10
      ROUGE-2-recall... 0.02882939738489885 - 9 / 10
      ROUGE-L-recall... 0.1503124088613697 - 10 / 10

Done: 15/20
Starting with preprocessor CourpusPreprocessor(nothing)
    Model: TextRank


100%|██████████| 11332/11332 [02:04<00:00, 91.36it/s] 


      BLEU-1... 0.1183291279179867 - 1 / 10
      BLEU-2... 0.041309305416122856 - 2 / 10
      BLEU-3... 0.017233029358749624 - 3 / 10
      BLEU-4... 0.008541085147977232 - 4 / 10
      ROUGE-1-fmeasure... 0.1749891868903905 - 5 / 10
      ROUGE-2-fmeasure... 0.024777554163219917 - 6 / 10
      ROUGE-L-fmeasure... 0.12691535508665053 - 7 / 10
      ROUGE-1-recall... 0.22900056012281328 - 8 / 10
      ROUGE-2-recall... 0.032915738919335955 - 9 / 10
      ROUGE-L-recall... 0.16612957796185182 - 10 / 10

Done: 16/20
    Model: LexRank


100%|██████████| 11332/11332 [01:53<00:00, 100.04it/s]


      BLEU-1... 0.13190202451151645 - 1 / 10
      BLEU-2... 0.04551639021408223 - 2 / 10
      BLEU-3... 0.019049838085488227 - 3 / 10
      BLEU-4... 0.00930966524469624 - 4 / 10
      ROUGE-1-fmeasure... 0.17133295852565505 - 5 / 10
      ROUGE-2-fmeasure... 0.02340992135567297 - 6 / 10
      ROUGE-L-fmeasure... 0.12538805889306803 - 7 / 10
      ROUGE-1-recall... 0.19127778683648197 - 8 / 10
      ROUGE-2-recall... 0.026266602597093347 - 9 / 10
      ROUGE-L-recall... 0.1394771515949717 - 10 / 10

Done: 17/20
    Model: Lsa


100%|██████████| 11332/11332 [01:21<00:00, 139.09it/s]


      BLEU-1... 0.12036051155592326 - 1 / 10
      BLEU-2... 0.035969610832210966 - 2 / 10
      BLEU-3... 0.013778662027315135 - 3 / 10
      BLEU-4... 0.006142563999480615 - 4 / 10
      ROUGE-1-fmeasure... 0.15267969386617536 - 5 / 10
      ROUGE-2-fmeasure... 0.016359332554800356 - 6 / 10
      ROUGE-L-fmeasure... 0.11017996576851596 - 7 / 10
      ROUGE-1-recall... 0.1603366004374084 - 8 / 10
      ROUGE-2-recall... 0.017324588468086828 - 9 / 10
      ROUGE-L-recall... 0.11556900257099736 - 10 / 10

Done: 18/20
    Model: Luhn


100%|██████████| 11332/11332 [00:43<00:00, 259.44it/s]


      BLEU-1... 0.12112938852227359 - 1 / 10
      BLEU-2... 0.042327717238092644 - 2 / 10
      BLEU-3... 0.017866493364940974 - 3 / 10
      BLEU-4... 0.008896621397643781 - 4 / 10
      ROUGE-1-fmeasure... 0.17573443884667378 - 5 / 10
      ROUGE-2-fmeasure... 0.024960913672791008 - 6 / 10
      ROUGE-L-fmeasure... 0.12622903652363654 - 7 / 10
      ROUGE-1-recall... 0.22102281050076938 - 8 / 10
      ROUGE-2-recall... 0.031862008359402325 - 9 / 10
      ROUGE-L-recall... 0.15876822221719286 - 10 / 10

Done: 19/20
    Model: KL


100%|██████████| 11332/11332 [05:42<00:00, 33.06it/s] 


      BLEU-1... 0.1298369096999311 - 1 / 10
      BLEU-2... 0.046572576360524295 - 2 / 10
      BLEU-3... 0.019946079180631883 - 3 / 10
      BLEU-4... 0.009792576427229386 - 4 / 10
      ROUGE-1-fmeasure... 0.17444191867975511 - 5 / 10
      ROUGE-2-fmeasure... 0.02601001773707392 - 6 / 10
      ROUGE-L-fmeasure... 0.13029782298150355 - 7 / 10
      ROUGE-1-recall... 0.197104334621429 - 8 / 10
      ROUGE-2-recall... 0.02948479834138394 - 9 / 10
      ROUGE-L-recall... 0.1474663058305305 - 10 / 10

Done: 20/20


{'CourpusPreprocessor(strong)': {'TextRank': {'BLEU-1': 0.09680720881076871,
   'BLEU-2': 0.03845039955288756,
   'BLEU-3': 0.015479914618198075,
   'BLEU-4': 0.006538746856412794,
   'ROUGE-1-fmeasure': 0.13007354789886896,
   'ROUGE-2-fmeasure': 0.0208842889198962,
   'ROUGE-L-fmeasure': 0.10526554573311184,
   'ROUGE-1-recall': 0.16140673311994297,
   'ROUGE-2-recall': 0.02659906348806116,
   'ROUGE-L-recall': 0.1306985926412772},
  'LexRank': {'BLEU-1': 0.10481592471646123,
   'BLEU-2': 0.039742980396033,
   'BLEU-3': 0.015753875527716815,
   'BLEU-4': 0.0065695067986235355,
   'ROUGE-1-fmeasure': 0.11485304673723976,
   'ROUGE-2-fmeasure': 0.01652320625328094,
   'ROUGE-L-fmeasure': 0.09676674416946662,
   'ROUGE-1-recall': 0.1174883190936935,
   'ROUGE-2-recall': 0.016932073694746697,
   'ROUGE-L-recall': 0.09850349639780925},
  'Lsa': {'BLEU-1': 0.0803511984021305,
   'BLEU-2': 0.028651601505534207,
   'BLEU-3': 0.010071892381747327,
   'BLEU-4': 0.0037947476650204777,
   'ROUGE

In [28]:
import json
from pprint import pprint

# Show the collected scores, then write them to a JSON file in the current
# working directory.
pprint(result)
with open("bench_results.json", "w") as results_file:
    json.dump(result, results_file, indent=4)

{'CourpusPreprocessor(mid)': {'KL': {'BLEU-1': 0.14889495406736786,
                                     'BLEU-2': 0.05524890100262196,
                                     'BLEU-3': 0.024375345550722648,
                                     'BLEU-4': 0.012407283399724722,
                                     'ROUGE-1-fmeasure': 0.1732041720417614,
                                     'ROUGE-1-recall': 0.19718918428801246,
                                     'ROUGE-2-fmeasure': 0.02486516104461113,
                                     'ROUGE-2-recall': 0.028462736257747704,
                                     'ROUGE-L-fmeasure': 0.1304430548450467,
                                     'ROUGE-L-recall': 0.14876444060374563},
                              'LexRank': {'BLEU-1': 0.15236908696095924,
                                          'BLEU-2': 0.05438131186722801,
                                          'BLEU-3': 0.023584992852121775,
                                          'B

In [13]:
# small_ds = [orig_documents[0], orig_documents[1]]
# small_ds_prep = preprocessor_simple.transform(small_ds)
# # print(small_ds_prep)
#
# # bads = string.punctuation.replace('.', '')
# # print(bads)
# #
# # tmp = remove_symbols(bads)(orig_documents[1])
# MyTextRank.fit(small_ds_prep)
# res = MyTextRank.predict(small_ds_prep)
# # print(tmp)
# # print("=" * 30)
# # print(small_ds_prep[1])
# # print("=" * 30)
# print(len(orig_documents[1].split(".")), len(nltk.sent_tokenize(orig_documents[1])))
# print("=" * 30)
# print(res[1])

100%|██████████| 2/2 [00:00<00:00, 103.39it/s]

12 8
the 37yearold has scored 230 runs in four firstclass games this season at an average of 57.50.
losing adam is naturally a blow as he contributes significantly to everything we do director of cricket angus fraser said.



