In [3]:
import Generate_summary_NLP as simplifier
import os
from tqdm import tqdm
import pickle
import time
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

In [4]:
num_test = 3
name_to_save = "evaluate_result_" + str(num_test) + "news.pkl"

In [5]:
# load model
model_name = "google/pegasus-cnn_dailymail"
model = PegasusForConditionalGeneration.from_pretrained(model_name).to("cpu")
tokenizer = PegasusTokenizer.from_pretrained(model_name)
print("model and tokenizer loaded")

model and tokenizer loaded


In [6]:
# load original dataset (pickle)
with open("../coreference_resolution/data/sample_news.pkl", "rb") as f:
    news_list = pickle.load(f)

news_list = news_list[:100]
len(news_list)

3

In [7]:
!pip install tdqm



In [None]:
# generate and simplify

print("number of articles to process is ", num_test, '\n')

total_news_list = []

ts = time.time()
for idx, news in tqdm(enumerate(news_list)):

    # stop at chosen num_test
    if idx == num_test:
        break

    # progress printing
    print("Start ", idx)
    if (idx != 0) and (idx % 100 == 0):
        print("Writing story {} of {}; {:.2f} percent done. Time spent: {:.2f}".format(
            idx, num_test, float(idx) * 100.0 / float(num_test), time.time() - ts))


    print("Start 2", idx)
    news_dict = dict()
    content = news['content']
    ref_summary = news['summary']

    # generate summary for original input (post-process)
    print("Start 3", idx)
    inputs = tokenizer(content, max_length=1024, return_tensors="pt", truncation=True).to("cpu")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, max_length=50)
    gen_summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # simplify the summary (post-process)
    gen_sim_summary = simplifier.simplify(gen_summary)

    # simplify the input (pre-process)
    sim_content = simplifier.simplify(content)

    # generate summary for simplified input (pre-process)
    print("Start 4", idx)

    inputs = tokenizer(sim_content, max_length=1024, return_tensors="pt", truncation=True, padding=True).to("cpu")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, max_length=50)
    sim_gen_summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[
        0]

    # combine all into dict
    print("Start 5", idx)
    news_dict["ori_content"] = content
    news_dict["ref_summary"] = ref_summary
    news_dict["gen_summary"] = gen_summary
    news_dict["gen_sim_summary"] = gen_sim_summary
    news_dict["sim_content"] = sim_content
    news_dict["sim_gen_summary"] = sim_gen_summary

    # append to a global list
    total_news_list.append(news_dict)

print("DONE! generate and simplify")

number of articles to process is  3 



0it [00:00, ?it/s]

Start  0
Start 2 0
Start 3 0
Start 4 0


In [5]:
from utils.eval_simplicity import cal_fkgl, cal_wordfreq, cal_wordfreq2, cal_BERTscore, cal_word_count, cal_SBERT_cosine_score

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 473kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 181kB/s]
Downloading: 100%|██████████|

In [6]:
import pickle
with open("../coreference_resolution/data/news_100.pkl", "rb") as f:
    news_list = pickle.load(f)
news_list

[{'ori_content': "Marseille , France -LRB- CNN -RRB- The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane . Marseille prosecutor Brice Robin told CNN that `` so far no videos were used in the crash investigation . '' He added , `` A person who has such a video needs to immediately give it to the investigators . '' Robin 's comments follow claims by two magazines , German daily Bild and French Paris Match , of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps . All 150 on board were killed . Paris Match and Bild reported that the video was recovered from a phone at the wreckage site . The two publications described the supposed video , but did not post it on their websites . The publications said that they watched the video , which was found by a source close to the investigation . `` One c

In [7]:
ref_list = []
#ori_list = []
sim_list = []

for i in range(len(news_list)):
    ref_sum = news_list[i]["ref_summary"]
    #ori_sum = news_list[i]["ori_summary"]
    sim_sum = news_list[i]["sim_gen_summary"]
    ref_list.append(ref_sum)
    #ori_list.append(ori_sum)
    sim_list.append(sim_sum)

In [8]:
# BERT Score
score = cal_SBERT_cosine_score(ref_text=ref_list, pred_text=sim_list)

100it [00:02, 35.63it/s]
100it [00:00, 30185.71it/s]

S-BERT cosine similarity Score
the higher, the better 

S-BERT cosine similarity Score between text = 0.589

time spent in calculation:0:00:02.812824





In [34]:
cal_wf = cal_wordfreq()
wordfreq_score3 = cal_wf.score(text_list=sim_list, title="phase 2 - output 2 (simplified)", printtop=True)
wordfreq_score3

Word Frequency -- the higher, the easier 

score of phase 2 - output 2 (simplified) = 4.679


(4.678875268096466,
 {'reference': array([4.86142857, 4.59666667, 4.96277778, 4.92238095, 4.55375   ,
         4.900625  , 4.83470588, 4.57388889, 4.75705882, 4.63428571,
         4.94857143, 4.5655    , 4.36      , 4.55047619, 5.11470588,
         5.01588235, 4.73590909, 4.25285714, 4.83666667, 4.84944444,
         4.73434783, 4.46214286, 4.4925    , 4.80882353, 4.652     ,
         4.6065    , 5.05043478, 4.63478261, 4.29952381, 4.96913043,
         4.69      , 5.05125   , 5.10388889, 4.96705882, 4.51857143,
         4.37529412, 4.76777778, 4.31058824, 4.92571429, 5.035     ,
         4.353     , 4.76388889, 4.60823529, 4.6965    , 4.67714286,
         4.101     , 4.737     , 4.47952381, 4.48434783, 5.175     ,
         4.81210526, 4.63571429, 4.70583333, 4.5648    , 4.45294118,
         4.52      , 4.89105263, 4.57125   , 4.79952381, 4.50444444,
         4.73047619, 4.55333333, 5.1775    , 4.65235294, 4.86375   ,
         4.06538462, 4.43095238, 4.94421053, 4.6335    , 4.85380952,
 

In [10]:
fkgl_score3 = cal_fkgl(text_list=sim_list, title="phase 2 - output 2 (simplified)", printtop=True)

Flesch–Kincaid Grade Level -- the lower, the easier (output is a grade level) 

score of phase 2 - output 2 (simplified) = 6.180


In [26]:
import eval_rouge
rouge_score = eval_rouge.cal_rouge(gold_list=ref_list, cand_list=sim_list)
print(len(rouge_score['rouge1']), len(rouge_score['rouge2']), len(rouge_score['rougeL']))
rouge_score['rouge1'].mean(), rouge_score['rouge2'].mean(), rouge_score['rougeL'].mean()

100it [00:00, 1451.00it/s]

time spend cal_rouge:0.07
100 100 100





(0.26616276905428116, 0.09049775813006818, 0.24353089106757111)