# Load and Prepare Dataset

In [1]:
from datasets import load_dataset

# load the dataset from huggingface in streaming mode and shuffle it
cnn_data = load_dataset(
    'cnn_dailymail','3.0.0',
    split='train',
    streaming=True
).shuffle(seed=960)

In [10]:
# show the contents of a single document in the dataset
next(iter(cnn_data))

{'article': "Lewis Hamilton’s victory was his seventh of the season, three more than Nico Rosberg and two more than the Briton secured during his championship year of 2008. He has now led 63 grands prix during his 143-race career, surpassing David Coulthard’s British record. Michael Schumacher’s all-time record is 142. Lewis Hamilton poses on the podium with his trophy and champagne after winning the Singapore Grand Prix . Singapore is one of the toughest tests on the calendar given the searing heat, humidity and the race’s length — yesterday’s reached the two-hour limit before the scheduled number of laps was completed — and it appeared to take its toll on the rookies. Daniil Kyvat, 19, asked to park his Toro Rosso after his drinks bottle failed before the race started, while McLaren’s Kevin Magnussen, 21, required medical attention for burns following an unexplained build-up of heat in his cockpit. He, too, was unable to take on liquids during the race and was seen with his arm raise

In [2]:
from tqdm.auto import tqdm  # progress bar

total_doc_count = 9999

counter = 0
docs = []
# iterate through the dataset and apply our filter
for d in tqdm(cnn_data, total=total_doc_count):
    # extract the fields we need
    doc = {
        'article': d['article'],
        'highlights':d['highlights']
    }
    # add the dict containing fields we need to docs list
    docs.append(doc)

    # stop iteration once we reach 20k
    if counter == total_doc_count:
        break

    # increase the counter on every iteration
    counter += 1

  0%|          | 0/9999 [00:00<?, ?it/s]

In [3]:
import pandas as pd

# create a pandas dataframe with the documents we extracted
df = pd.DataFrame(docs)
df.head()

Unnamed: 0,article,highlights
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...


In [4]:
len(df)

100000

# Load the Model and Tokenizer

In [4]:
# import and initialize the tokenizer and model from the checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [5]:
# Some model statistics

# max tokens including the special tokens
print(tokenizer.model_max_length)

# max tokens excluding the special tokens
print(tokenizer.max_len_single_sentence) 

# number of special tokens
print(tokenizer.num_special_tokens_to_add())

1024
1022
2


# Convert file content to sentences

In [4]:
#importing nltk and downloading punkt for extracting the sentences from the document
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samapti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Then define a function to tokenize the text
def tokenize_sentences(text):
    return nltk.tokenize.sent_tokenize(text)

# Apply the function to the column
df['tokenized_text'] = df['article'].apply(tokenize_sentences)

tokenized_text is a list of sentences.

In [7]:
df.head()

Unnamed: 0,article,highlights,tokenized_text
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...,[Lewis Hamilton’s victory was his seventh of t...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...,[Tigers may be fit and dynamic creatures who c...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...,"[As London Fashion Week gets underway, fashion..."
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...,[Using Botox diminishes the experience of feel...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...,[Former Governor Arnold Schwarzenegger lifted ...


In [11]:
def max_token(sentences):
    return sum([len(tokenizer.tokenize(sentence)) for sentence in sentences])

df['tokenized_text'].apply(max_token).max()

Token indices sequence length is longer than the specified maximum sequence length for this model (1164 > 1024). Running this sequence through the model will result in indexing errors


2617

In [12]:
df['highlights'].apply(len).max()

3384

In [13]:
df['highlights'].apply(len).median()

294.0

In [18]:
df=df.drop(columns=['tokenized_text'])

In [19]:
df.head()

Unnamed: 0,article,highlights
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...


We can't get the summary of articles whose length of tokens is higher than 1024.So for those cases, we will make chunks of articles.

In [8]:
def extract_chunks(text):
    sentences=nltk.tokenize.sent_tokenize(text)
    
    # initialize
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
      count += 1
      combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter

      if combined_length  <= tokenizer.max_len_single_sentence: # if it doesn't exceed
        chunk += sentence + " " # add the sentence to the chunk
        length = combined_length # update the length counter

        # if it is the last sentence
        if count == len(sentences) - 1:
          chunks.append(chunk.strip()) # save the chunk

      else: 
        chunks.append(chunk.strip()) # save the chunk

        # reset 
        length = 0 
        chunk = ""

        # take care of the overflow sentence
        chunk += sentence + " "
        length = len(tokenizer.tokenize(sentence))
    return chunks

In [9]:
df["chunks"]=df['article'].apply(extract_chunks)

Token indices sequence length is longer than the specified maximum sequence length for this model (1164 > 1024). Running this sequence through the model will result in indexing errors


In [10]:
df

Unnamed: 0,article,highlights,tokenized_text,chunks
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...,[Lewis Hamilton’s victory was his seventh of t...,[Lewis Hamilton’s victory was his seventh of t...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...,[Tigers may be fit and dynamic creatures who c...,[Tigers may be fit and dynamic creatures who c...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...,"[As London Fashion Week gets underway, fashion...","[As London Fashion Week gets underway, fashion..."
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...,[Using Botox diminishes the experience of feel...,[Using Botox diminishes the experience of feel...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...,[Former Governor Arnold Schwarzenegger lifted ...,[Former Governor Arnold Schwarzenegger lifted ...
...,...,...,...,...
19995,Nigel Clough's brother Simon has noticed how t...,Nigel Clough has recorded impressive cup recor...,[Nigel Clough's brother Simon has noticed how ...,[Nigel Clough's brother Simon has noticed how ...
19996,The death toll from a powerful cyclone which b...,Cyclone Hudhud had 120mph winds when it made l...,[The death toll from a powerful cyclone which ...,[The death toll from a powerful cyclone which ...
19997,Sotheby's is being sued over claims a painting...,Painting sold in 2006 was attributed to a foll...,[Sotheby's is being sued over claims a paintin...,[Sotheby's is being sued over claims a paintin...
19998,Australian batsman dies aged 25 two days after...,Phillip Hughes dies two days after being hit o...,[Australian batsman dies aged 25 two days afte...,[Australian batsman dies aged 25 two days afte...


In [23]:
df_sample = df.sample(n=10, replace=False)

In [18]:
df_sample

Unnamed: 0,article,highlights,chunks,Summary_Model
7042,Silly stunt: Judy and the rubber chicken. Her ...,Judy Murray's business deals could make her mo...,[Silly stunt: Judy and the rubber chicken. Her...,Judy Murray has taken on a Grand Slam of publ...
14424,Roger Federer put on a show for his fellow spo...,Seven-time champion put on a masterclass on Ce...,[Roger Federer put on a show for his fellow sp...,Roger Federer took just 81 minutes to beat Sa...
18989,A Chinese student has died after being found s...,Body of Jiao Chin was found hanging over the f...,[A Chinese student has died after being found ...,The body of Jiao Chin was found hanging over ...
11243,New radar images have shown that British colon...,A satellite survey of area around Albemarle So...,[New radar images have shown that British colo...,New radar images have shown that British colo...
15834,The future of smoke alarms will not only know ...,AirGuard accurately distinguishes between ciga...,[The future of smoke alarms will not only know...,AirGuard accurately recognises smoke from a f...
8380,Ryan Taylor has spent the past two years on tr...,Newcastle manager Alan Pardew has confirmed Ry...,[Ryan Taylor has spent the past two years on t...,Ryan Taylor could make his first appearance o...
9481,Scobee. Josh Scobee. That’s how the Jacksonvil...,Josh Scobee has played in the NFL for 11 years...,[Scobee. Josh Scobee. That’s how the Jacksonvi...,Jacksonville Jaguars kicker Josh Scobee is th...
8572,Admission: Clive Goodman (pictured today) said...,Clive Goodman tells court he hacked voicemails...,[Admission: Clive Goodman (pictured today) sai...,Former News of the World royal. editor hacked...
12694,A teenager accused of shooting and stabbing an...,Maxwell Winkler has been arrested for the murd...,[A teenager accused of shooting and stabbing a...,"Maxwell Winkler, 17, is accused of shooting a..."
5170,Anthony Joshua believes his monastic lifestyle...,Anthony Joshua will mark his first anniversary...,[Anthony Joshua believes his monastic lifestyl...,Anthony Joshua takes on Denis Bakhtov for his...


In [20]:
df_sample.to_csv("sample_output.csv", index=False)

In [12]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")




In [15]:
df_sample

Unnamed: 0,article,highlights,chunks,Summary_Model,Summary_Model_2
0,Silly stunt: Judy and the rubber chicken. Her ...,Judy Murray's business deals could make her mo...,['Silly stunt: Judy and the rubber chicken. He...,Judy Murray has taken on a Grand Slam of publ...,Judy Murray has taken on a Grand Slam of publ...
1,Roger Federer put on a show for his fellow spo...,Seven-time champion put on a masterclass on Ce...,"[""Roger Federer put on a show for his fellow s...",Roger Federer took just 81 minutes to beat Sa...,Roger Federer took just 81 minutes to beat Sa...
2,A Chinese student has died after being found s...,Body of Jiao Chin was found hanging over the f...,"[""A Chinese student has died after being found...",The body of Jiao Chin was found hanging over ...,The body of Jiao Chin was found hanging over ...
3,New radar images have shown that British colon...,A satellite survey of area around Albemarle So...,"[""New radar images have shown that British col...",New radar images have shown that British colo...,New radar images have shown that British colo...
4,The future of smoke alarms will not only know ...,AirGuard accurately distinguishes between ciga...,"[""The future of smoke alarms will not only kno...",AirGuard accurately recognises smoke from a f...,AirGuard accurately recognises smoke from a f...
5,Ryan Taylor has spent the past two years on tr...,Newcastle manager Alan Pardew has confirmed Ry...,"[""Ryan Taylor has spent the past two years on ...",Ryan Taylor could make his first appearance o...,Ryan Taylor could make his first appearance o...
6,Scobee. Josh Scobee. That’s how the Jacksonvil...,Josh Scobee has played in the NFL for 11 years...,"[""Scobee. Josh Scobee. That’s how the Jacksonv...",Jacksonville Jaguars kicker Josh Scobee is th...,Jacksonville Jaguars kicker Josh Scobee is th...
7,Admission: Clive Goodman (pictured today) said...,Clive Goodman tells court he hacked voicemails...,"[""Admission: Clive Goodman (pictured today) sa...",Former News of the World royal. editor hacked...,Former News of the World royal. editor hacked...
8,A teenager accused of shooting and stabbing an...,Maxwell Winkler has been arrested for the murd...,"[""A teenager accused of shooting and stabbing ...","Maxwell Winkler, 17, is accused of shooting a...","Maxwell Winkler, 17, is accused of shooting a..."
9,Anthony Joshua believes his monastic lifestyle...,Anthony Joshua will mark his first anniversary...,"[""Anthony Joshua believes his monastic lifesty...",Anthony Joshua takes on Denis Bakhtov for his...,Anthony Joshua takes on Denis Bakhtov for his...


In [23]:
import ast

df_sample["chunks"]=df_sample["chunks"].apply(ast.literal_eval)

Now we get summary for the chunks and then merge them to get the summary for large articles.

In [24]:
import time

# Record start time
start_time = time.time()

def gen_sum(chunks):
    for chunk in chunks:
        outputs=""
        output=summarizer(chunk, max_length=250, min_length=30, do_sample=False)[0]['summary_text']
        outputs=outputs+output+" "
    return outputs

df_sample["Summary_Model_2"]=df_sample["chunks"].apply(gen_sum)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print("Execution time:", (elapsed_time/60), "minutes")

Execution time: 9.253791276613871 minutes


In [25]:
df_sample

Unnamed: 0,article,highlights,chunks,Summary_Model,Summary_Model_2
0,Silly stunt: Judy and the rubber chicken. Her ...,Judy Murray's business deals could make her mo...,[Silly stunt: Judy and the rubber chicken. Her...,Judy Murray has taken on a Grand Slam of publ...,Andy Murray's mum Judy has taken on a Grand Sl...
1,Roger Federer put on a show for his fellow spo...,Seven-time champion put on a masterclass on Ce...,[Roger Federer put on a show for his fellow sp...,Roger Federer took just 81 minutes to beat Sa...,Roger Federer beat Santiago Giraldo 6-3 6-1 6-...
2,A Chinese student has died after being found s...,Body of Jiao Chin was found hanging over the f...,[A Chinese student has died after being found ...,The body of Jiao Chin was found hanging over ...,Jiao Chin was found slumped over an electric f...
3,New radar images have shown that British colon...,A satellite survey of area around Albemarle So...,[New radar images have shown that British colo...,New radar images have shown that British colo...,Roanoke Colony was an attempt by Queen Elizabe...
4,The future of smoke alarms will not only know ...,AirGuard accurately distinguishes between ciga...,[The future of smoke alarms will not only know...,AirGuard accurately recognises smoke from a f...,AirGuard accurately recognises smoke from a fi...
5,Ryan Taylor has spent the past two years on tr...,Newcastle manager Alan Pardew has confirmed Ry...,[Ryan Taylor has spent the past two years on t...,Ryan Taylor could make his first appearance o...,Ryan Taylor has not played in the Premier Leag...
6,Scobee. Josh Scobee. That’s how the Jacksonvil...,Josh Scobee has played in the NFL for 11 years...,[Scobee. Josh Scobee. That’s how the Jacksonvi...,Jacksonville Jaguars kicker Josh Scobee is th...,Josh Scobee is Jacksonville Jaguars' all-time ...
7,Admission: Clive Goodman (pictured today) said...,Clive Goodman tells court he hacked voicemails...,[Admission: Clive Goodman (pictured today) sai...,Former News of the World royal. editor hacked...,Former News of the World royal . editor hacked...
8,A teenager accused of shooting and stabbing an...,Maxwell Winkler has been arrested for the murd...,[A teenager accused of shooting and stabbing a...,"Maxwell Winkler, 17, is accused of shooting a...",Henry Kim's body was found by a dogwalker at W...
9,Anthony Joshua believes his monastic lifestyle...,Anthony Joshua will mark his first anniversary...,[Anthony Joshua believes his monastic lifestyl...,Anthony Joshua takes on Denis Bakhtov for his...,Anthony Joshua takes on Denis Bakhtov at the O...


In [26]:
df_sample.to_csv("sample_output.csv", index=False)

In [27]:
model_out_2 = df_sample["Summary_Model_2"].tolist()

reference = df_sample["highlights"].tolist()

from rouge import Rouge

rouge = Rouge()

rouge.get_scores(model_out_2, reference, avg=True)

{'rouge-1': {'r': 0.40394579041426343,
  'p': 0.3624661966084476,
  'f': 0.3773717029035086},
 'rouge-2': {'r': 0.16167649263443787,
  'p': 0.13986373813334135,
  'f': 0.14863700630685708},
 'rouge-l': {'r': 0.3696341646475293,
  'p': 0.33407326833685397,
  'f': 0.34687733838745827}}

Rogue score. We need to check for more efficient metric as well.