# Automatic Text Summarization Algorithm

##### Sebastianus Radhya, Frederick Gervaise Harianto

In [1]:
#NLTK Term Weights
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq  

def generate_nltk(raw_text):
    stopWords = set(stopwords.words("english"))
    word_frequencies = {}  
    for word in nltk.word_tokenize(raw_text):  
        if word not in stopWords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    
    maximum_frequency = max(word_frequencies.values())
    print(maximum_frequency)
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

    sentence_list = nltk.sent_tokenize(raw_text)
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(3, sentence_scores, key=sentence_scores.get)

    summary = ' '.join(summary_sentences)  
    return summary

In [2]:
#Sumy LexRank
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer 
import heapq

def generate_sumy(raw_text):
    parser = PlaintextParser.from_string(raw_text,Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)
    summary_results = " ".join(map(str, summary))
    return summary_results

In [3]:
#BART
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

if torch.cuda.is_available():
   device = torch.device("cuda")
else:
   device = torch.device("cuda")

def generate_bart(raw_text):
    bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
    bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    
    input_text = ' '.join(raw_text.split())
    input_tokenized = bart_tokenizer.encode(input_text, return_tensors='pt').to(device)
    
    summary_ids = bart_model.generate(input_tokenized,
                                    num_beams = 4,
                                    num_return_sequences = 1,
                                    no_repeat_ngram_size = 2,
                                    length_penalty = 1,
                                    min_length = 0,
                                    max_length = 128,
                                    early_stopping = True)
    
    output = [bart_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    return "".join(output)

  example_input = torch.tensor([[-3, -2, -1], [0, 1, 2]])


In [4]:
from datasets import load_dataset
raw_datasets = load_dataset("cnn_dailymail", '3.0.0')

Reusing dataset cnn_dailymail (C:\Users\sebas\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:01<00:00,  2.99it/s]


#### BART Evaluation

In [5]:
reference = raw_datasets["validation"][10]

In [6]:
reference_text = reference['article']

In [7]:
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_bart(reference_text), reference['highlights'])

In [8]:
print(scores)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-2': {'r': 1.0, 'p': 0.9642857142857143, 'f': 0.9818181768198347}, 'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}}]


In [9]:
generate_bart(reference_text)

'This page includes the show Transcript. Use the Transcript to help students with reading comprehension and vocabulary. At the bottom of the page, comment for a chance to be mentioned on CNN Student News.  You must be a teacher or a student age 13 or older to request a mention on the CNN student News Roll Call.'

In [10]:
reference['highlights']

'This page includes the show Transcript .\nUse the Transcript to help students with reading comprehension and vocabulary .\nAt the bottom of the page, comment for a chance to be mentioned on CNN Student News.  You must be a teacher or a student age 13 or older to request a mention on the CNN Student News Roll Call.'

In [11]:
reference = raw_datasets["validation"][4]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_bart(reference_text), reference['highlights'])
print(scores)

[{'rouge-1': {'r': 0.5333333333333333, 'p': 0.24242424242424243, 'f': 0.33333332903645835}, 'rouge-2': {'r': 0.21428571428571427, 'p': 0.08823529411764706, 'f': 0.1249999958680557}, 'rouge-l': {'r': 0.4666666666666667, 'p': 0.21212121212121213, 'f': 0.29166666236979166}}]


In [12]:
reference = raw_datasets["validation"][32]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_bart(reference_text), reference['highlights'])
print(scores)

[{'rouge-1': {'r': 0.7619047619047619, 'p': 0.48484848484848486, 'f': 0.5925925878395063}, 'rouge-2': {'r': 0.5454545454545454, 'p': 0.35294117647058826, 'f': 0.4285714238010204}, 'rouge-l': {'r': 0.7619047619047619, 'p': 0.48484848484848486, 'f': 0.5925925878395063}}]


### Rouge Score Average for BART
#### Rouge-1
Recall: 0.765
Precision: 0.576
F-Score: 0.642

#### Rouge-2
Recall: 0.5865
Precision: 0.467
F-Score: 0.512

#### Rouge-L
Recall: 0.743
Precision: 0.566
F-Score: 0.628

### NLTK Evaluation

In [13]:
reference = raw_datasets["validation"][10]

In [14]:
reference_text = reference['article']

In [15]:
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_nltk(reference_text), reference['highlights'])
print(scores)

13
[{'rouge-1': {'r': 0.36585365853658536, 'p': 0.28846153846153844, 'f': 0.32258064023124067}, 'rouge-2': {'r': 0.16666666666666666, 'p': 0.14285714285714285, 'f': 0.1538461488757398}, 'rouge-l': {'r': 0.2682926829268293, 'p': 0.21153846153846154, 'f': 0.23655913485489663}}]


In [16]:
generate_nltk(reference_text)

13


"For a chance to be mentioned on the next CNN Student News, comment on the bottom of this page with your school name, mascot, city and state. We're visiting Italy, Russia, the United Arab Emirates, and the Himalayan Mountains. Find out who's attempting to circumnavigate the globe in a plane powered partially by the sun, and explore the mysterious appearance of craters in northern Asia."

In [17]:
reference['highlights']

'This page includes the show Transcript .\nUse the Transcript to help students with reading comprehension and vocabulary .\nAt the bottom of the page, comment for a chance to be mentioned on CNN Student News.  You must be a teacher or a student age 13 or older to request a mention on the CNN Student News Roll Call.'

In [18]:
reference = raw_datasets["validation"][4]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_nltk(reference_text), reference['highlights'])
print(scores)

30
[{'rouge-1': {'r': 0.4666666666666667, 'p': 0.1076923076923077, 'f': 0.17499999695312507}, 'rouge-2': {'r': 0.14285714285714285, 'p': 0.02631578947368421, 'f': 0.04444444181728411}, 'rouge-l': {'r': 0.4666666666666667, 'p': 0.1076923076923077, 'f': 0.17499999695312507}}]


In [19]:
reference = raw_datasets["validation"][32]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_nltk(reference_text), reference['highlights'])
print(scores)

12
[{'rouge-1': {'r': 0.23809523809523808, 'p': 0.08928571428571429, 'f': 0.12987012590318783}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.23809523809523808, 'p': 0.08928571428571429, 'f': 0.12987012590318783}}]


### Rouge Score Average for NLTK
#### Rouge-1
Recall: 0.357
Precision: 0.161
F-Score: 0.209

#### Rouge-2
Recall: 0.103
Precision: 0.056
F-Score: 0.066

#### Rouge-L
Recall: 0.324
Precision: 0.136
F-Score: 0.181

### Sumy Evaluation

In [20]:
reference = raw_datasets["validation"][10]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_sumy(reference_text), reference['highlights'])
print(scores)

[{'rouge-1': {'r': 0.4146341463414634, 'p': 0.3617021276595745, 'f': 0.3863636313868802}, 'rouge-2': {'r': 0.18518518518518517, 'p': 0.1724137931034483, 'f': 0.17857142357780628}, 'rouge-l': {'r': 0.3902439024390244, 'p': 0.3404255319148936, 'f': 0.3636363586596075}}]


In [21]:
generate_sumy(reference_text)

"On this page you will find today's show Transcript and a place for you to request to be on the CNN Student News Roll Call. CNN Student News is created by a team of journalists who consider the Common Core State Standards, national standards in different subject areas, and state standards when producing the show. Thank you for using CNN Student News!"

In [22]:
reference['highlights']

'This page includes the show Transcript .\nUse the Transcript to help students with reading comprehension and vocabulary .\nAt the bottom of the page, comment for a chance to be mentioned on CNN Student News.  You must be a teacher or a student age 13 or older to request a mention on the CNN Student News Roll Call.'

In [23]:
reference = raw_datasets["validation"][4]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_sumy(reference_text), reference['highlights'])
print(scores)

[{'rouge-1': {'r': 0.06666666666666667, 'p': 0.027777777777777776, 'f': 0.039215682122261106}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.06666666666666667, 'p': 0.027777777777777776, 'f': 0.039215682122261106}}]


In [24]:
reference = raw_datasets["validation"][32]
reference_text = reference['article']
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(generate_sumy(reference_text), reference['highlights'])
print(scores)

[{'rouge-1': {'r': 0.5238095238095238, 'p': 0.25, 'f': 0.338461534087574}, 'rouge-2': {'r': 0.36363636363636365, 'p': 0.16, 'f': 0.22222221797839511}, 'rouge-l': {'r': 0.47619047619047616, 'p': 0.22727272727272727, 'f': 0.3076923033183432}}]


### Rouge Score Average for Sumy
#### Rouge-1
Recall: 0.335
Precision: 0.213
F-Score: 0.254

#### Rouge-2
Recall: 0.183
Precision: 0.110
F-Score: 0.134

#### Rouge-L
Recall: 0.311
Precision: 0.198
F-Score: 0.237