In [279]:
import torch
import spacy
import pandas as pd

from tqdm import tqdm
from summarizer import Summarizer
from transformers import BertTokenizer

In [220]:
import re
from IPython.display import display, HTML, Markdown

def display_diff(long_article, short_article, regex=r'[",.!?]', font_size=10, markdown=False, space=False):
    """Show ``long_article`` in a bordered box with summary fragments coloured blue.

    ``short_article`` is split on ``regex``; every non-blank fragment is
    highlighted at its first occurrence in the article text.

    Args:
        long_article: Full text to display.
        short_article: Summary whose fragments are searched for in ``long_article``.
        regex: Pattern used to split ``short_article`` into fragments.
        font_size: Font size in points; the line height is ``font_size + 2``.
        markdown: When true, lines that start with ``#`` markers are rendered as
            ``<h1>``..``<h6>`` headings and each run of newlines becomes a single
            ``<br>``. When false, every newline becomes ``<br>``.
        space: When true, a fragment is only highlighted where it appears
            wrapped in one extra space on each side.

    Returns:
        None. The markup is rendered through ``display(HTML(...))``.
    """
    # NOTE(review): the search runs on the already-annotated string, so a fragment
    # such as "font" or "blue" can match inside earlier markup; the text is also
    # not HTML-escaped. Both are unchanged from the previous behaviour.
    highlighted = long_article
    for fragment in re.split(regex, short_article):
        if not fragment.strip():
            continue
        if space:
            highlighted = highlighted.replace(f" {fragment} ", f"<font color='blue'> {fragment} </font>", 1)
        else:
            highlighted = highlighted.replace(fragment, f"<font color='blue'>{fragment}</font>", 1)

    if markdown:
        def replace_header(match):
            leading_tags, hashes, title = match.groups()
            # HTML defines heading levels 1-6 only.
            header_level = min(len(hashes), 6)
            return f'{leading_tags}<h{header_level}>{title}</h{header_level}>'

        # A heading line may begin with highlight tags inserted above, so allow
        # (and keep) any tags that precede the '#' markers.
        body = re.sub(r'^((?:<[^>\n]*>)*)(#+)[ \t]+(.*)$', replace_header, highlighted, flags=re.MULTILINE)
        # str.replace would look for the literal text "\n+"; a regex substitution
        # is needed to collapse runs of newlines into one line break.
        body = re.sub(r'\n+', '<br>', body)
    else:
        body = highlighted.replace('\n', '<br>')

    display(HTML(
        f"<div style='border:1px solid gray;border-radius:10px;padding:10px;font-size:{font_size}pt;line-height:{font_size+2}pt'>{body}</div>"
    ))

In [222]:
# spaCy pipeline; later cells iterate `nlp(text).sents` to count sentences.
nlp = spacy.load("en_core_web_sm")
# BERT tokenizer for the 'bert-base-uncased' vocabulary (not referenced by any other visible cell).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [223]:
# Summarizer built with default arguments; later cells call it as
# BERT_summarize(text), BERT_summarize(text, ratio=...) or BERT_summarize(text, num_sentences=...).
BERT_summarize = Summarizer()

In [224]:
# Load the CNN/DailyMail CSV; later cells read its 'article' and 'highlights' columns.
df = pd.read_csv('dat/cnn_dailymail_test.csv')

In [225]:
# Sample article at index label 0; `article` is a second name bound to the same value.
test_article = article = df['article'][0]

In [226]:
# Summarize the sample article with the summarizer's default settings.
test_article_bert = BERT_summarize(test_article)

In [227]:
# Show the full article at 6pt with the fragments of the BERT summary highlighted.
display_diff(test_article, test_article_bert, font_size=6)

In [228]:
# for ratio in [0.01, 0.05, 0.1, 0.3, 0.4, 0.5]:
#     print("="*30+f"{ ratio }"+"="*30)
#     test_article_bert = BERT_summarize(test_article, ratio=ratio)
#     display_diff(test_article, test_article_bert, font_size=6)

In [229]:
# for num_sentences in [1,2,3,4,5]:
#     print("="*30+f"{num_sentences}"+"="*30)
#     test_article_bert = BERT_summarize(test_article, num_sentences=num_sentences)
#     display_diff(test_article, test_article_bert, font_size=6)

In [234]:
# Read the sample markdown article; the context manager closes the file handle
# as soon as the text has been read instead of leaving it open.
with open("./test/article.md", "r") as article_file:
    test_arXiv_article = article_file.read()

In [235]:
# Summarize the markdown article with ratio=0.8
# (presumably the fraction of sentences to keep - TODO confirm against the summarizer docs).
test_arXiv_article_bert = BERT_summarize(test_arXiv_article, ratio=0.8)
# Render in markdown mode so '#' lines are shown as headings.
display_diff(test_arXiv_article, test_arXiv_article_bert, font_size=6, markdown=True)

# BERT summarization testing

**The following cells test article summarization with BERT and evaluate the results with ROUGE.**

In [286]:
# Summarize the first `max_num` articles with BERT and print one ROUGE tuple per article.
# NOTE(review): `calculate_rouge_scores` is defined in a cell further down this
# notebook; on a fresh kernel that cell has to run before this one.
cnn_articles = df['article']
cnn_higlights = df['highlights']

# Keep every summary: the later "BERT summarize ROUGE" cell indexes into this list,
# so it must be populated here rather than left commented out.
bert_summarized_articles = []

max_num = 20
for i in range(max_num):
    cnn_article = cnn_articles[i]
    cnn_higlight = cnn_higlights[i]
    # Request as many sentences as spaCy finds in the reference highlight.
    goal_num_sentence = len(list(nlp(cnn_higlight).sents))
    bert_summarized_article = BERT_summarize(cnn_article, num_sentences=goal_num_sentence)
    bert_summarized_articles.append(bert_summarized_article)

    # NOTE(review): this scores the summary against the full source article, not
    # against `cnn_higlight` as the sentence-count sweep cell does - confirm which
    # reference is intended before comparing the numbers.
    rouge = calculate_rouge_scores(cnn_article, bert_summarized_article)
    print(rouge)

(0.17377567140600317, 0.16798732171156897, 0.17377567140600317)
(0.2515463917525773, 0.24430641821946172, 0.2515463917525773)
(0.1760204081632653, 0.1687979539641944, 0.1760204081632653)
(0.5, 0.48905109489051096, 0.5)
(0.2458628841607565, 0.2375296912114014, 0.2458628841607565)
(0.037656903765690385, 0.033613445378151266, 0.037656903765690385)
(0.2013729977116705, 0.194954128440367, 0.2013729977116705)
(0.3511111111111111, 0.34375, 0.3511111111111111)
(0.36712328767123287, 0.3581267217630854, 0.36712328767123287)
(0.5316455696202532, 0.5128205128205128, 0.5316455696202532)
(0.5189873417721518, 0.5, 0.5189873417721518)
(0.15439856373429084, 0.14774774774774777, 0.15439856373429084)
(0.2888086642599278, 0.27536231884057966, 0.2888086642599278)
(0.16542750929368027, 0.16014897579143386, 0.16542750929368027)
(0.24358974358974358, 0.23225806451612907, 0.24358974358974358)
(0.08078994614003591, 0.07733812949640288, 0.08078994614003591)
(0.13366960907944514, 0.13147914032869784, 0.1336696090

In [291]:
# For the first `max_num` articles, sweep the summary length from one sentence up to
# twice the highlight's sentence count and print ROUGE against the highlight each time.
cnn_articles = df['article']
cnn_higlights = df['highlights']

max_num = 3
for i in range(max_num):
    cnn_article, cnn_higlight = cnn_articles[i], cnn_higlights[i]
    # Sentence count of the reference highlight, as segmented by spaCy.
    goal_num_sentence = sum(1 for _ in nlp(cnn_higlight).sents)
    print(f"{'-' * 10}{i}:(sens: {goal_num_sentence}){'-' * 10}")

    for num_sens in range(1, 2 * goal_num_sentence + 1):
        bert_summarized_article = BERT_summarize(cnn_article, num_sentences=num_sens)
        rouge = calculate_rouge_scores(cnn_higlight, bert_summarized_article)
        print(rouge)

----------0:(sens: 2)----------
(0.38095238095238093, 0.19672131147540986, 0.28571428571428575)
(0.33707865168539325, 0.13793103448275862, 0.2696629213483146)
(0.28571428571428564, 0.1090909090909091, 0.23214285714285715)
(0.2923076923076923, 0.109375, 0.23076923076923078)
----------1:(sens: 2)----------
(0.3720930232558139, 0.19047619047619047, 0.32558139534883723)
(0.3846153846153846, 0.1568627450980392, 0.28846153846153844)
(0.375, 0.11267605633802817, 0.29166666666666663)
(0.3612903225806452, 0.11764705882352942, 0.27096774193548384)
----------2:(sens: 3)----------
(0.14545454545454545, 0.07547169811320754, 0.10909090909090909)
(0.21978021978021978, 0.04494382022471911, 0.15384615384615385)
(0.23076923076923075, 0.05882352941176471, 0.17307692307692307)
(0.36065573770491804, 0.18333333333333335, 0.26229508196721313)
(0.29447852760736193, 0.14906832298136646, 0.19631901840490798)
(0.31578947368421056, 0.18934911242603547, 0.1871345029239766)


In [None]:
雖然 BERT summarizer 架構本身沒有文本長度限制，但是我們還是使用多層次的架構處理，並進行比較。
雖然 BART summarizer 架構本身長度限制為1024，但是我們還是使用5

In [290]:
# Inspect the reference highlight at index label 10 (the matching article lookup is disabled).
# cnn_articles[10]
cnn_higlights[10]

"London's Metropolitan Police say the man was arrested at Luton airport after landing on a flight from Istanbul .\nHe's been charged with terror offenses allegedly committed since the start of November ."

In [280]:
from rouge_score import rouge_scorer

In [281]:
def calculate_rouge_scores(reference_text, generated_text):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures of a text pair.

    Args:
        reference_text: Text passed to the scorer as the reference (first argument).
        generated_text: Text passed to the scorer as the candidate (second argument).

    Returns:
        A 3-tuple ``(rouge1, rouge2, rougeL)`` holding each metric's ``fmeasure``.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    # A new scorer (with stemming enabled) is built on every call.
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    results = scorer.score(reference_text, generated_text)
    return tuple(results[name].fmeasure for name in metric_names)

In [282]:
# Score the first `max_num` stored summaries against their source articles.
# Requires `bert_summarized_articles` to already hold at least `max_num` summaries
# produced by the summarization loop cell.
rouge_bert_summarized_articles = []
for i in tqdm(
    range(max_num),
    total=max_num,
    desc="BERT summarize ROUGE",
):
    rouge = calculate_rouge_scores(cnn_articles[i], bert_summarized_articles[i])
    rouge_bert_summarized_articles += [rouge]

BERT summarize ROUGE: 100%|███████████████████████| 3/3 [00:00<00:00, 45.23it/s]


In [283]:
rouge_bert_summarized_articles

[(0.17377567140600317, 0.16798732171156897, 0.17377567140600317),
 (0.2515463917525773, 0.24430641821946172, 0.2515463917525773),
 (0.1760204081632653, 0.1687979539641944, 0.1760204081632653)]