# Load and Prepare Dataset

In [2]:
from datasets import load_dataset

# load the dataset from huggingface in streaming mode and shuffle it
cnn_data = load_dataset(
    'cnn_dailymail','3.0.0',
    split='train',
    streaming=True
).shuffle(seed=960)

In [3]:
# show the contents of a single document in the dataset
next(iter(cnn_data))

{'article': "Lewis Hamilton’s victory was his seventh of the season, three more than Nico Rosberg and two more than the Briton secured during his championship year of 2008. He has now led 63 grands prix during his 143-race career, surpassing David Coulthard’s British record. Michael Schumacher’s all-time record is 142. Lewis Hamilton poses on the podium with his trophy and champagne after winning the Singapore Grand Prix . Singapore is one of the toughest tests on the calendar given the searing heat, humidity and the race’s length — yesterday’s reached the two-hour limit before the scheduled number of laps was completed — and it appeared to take its toll on the rookies. Daniil Kyvat, 19, asked to park his Toro Rosso after his drinks bottle failed before the race started, while McLaren’s Kevin Magnussen, 21, required medical attention for burns following an unexplained build-up of heat in his cockpit. He, too, was unable to take on liquids during the race and was seen with his arm raise

In [4]:
from tqdm.auto import tqdm  # progress bar

total_doc_count = 19999

counter = 0
docs = []
# iterate through the dataset and apply our filter
for d in tqdm(cnn_data, total=total_doc_count):
    # extract the fields we need
    doc = {
        'article': d['article'],
        'highlights':d['highlights']
    }
    # add the dict containing fields we need to docs list
    docs.append(doc)

    # stop iteration once we reach 20k
    if counter == total_doc_count:
        break

    # increase the counter on every iteration
    counter += 1

  0%|          | 0/19999 [00:00<?, ?it/s]

In [5]:
import pandas as pd

# create a pandas dataframe with the documents we extracted
df = pd.DataFrame(docs)
df.head()

Unnamed: 0,article,highlights
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...


# Load the Model and Tokenizer

In [6]:
# import and initialize the tokenizer and model from the checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "sshleifer/distilbart-cnn-12-6"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [7]:
# Some model statistics

# max tokens including the special tokens
print(tokenizer.model_max_length)

# max tokens excluding the special tokens
print(tokenizer.max_len_single_sentence) 

# number of special tokens
print(tokenizer.num_special_tokens_to_add())

1024
1022
2


# Convert file content to sentences

In [10]:
#importing nltk and downloading punkt for extracting the sentences from the document
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samapti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Then define a function to tokenize the text
def tokenize_sentences(text):
    return nltk.tokenize.sent_tokenize(text)

# Apply the function to the column
df['tokenized_text'] = df['article'].apply(tokenize_sentences)

In [12]:
df.head()

Unnamed: 0,article,highlights,tokenized_text
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...,[Lewis Hamilton’s victory was his seventh of t...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...,[Tigers may be fit and dynamic creatures who c...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...,"[As London Fashion Week gets underway, fashion..."
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...,[Using Botox diminishes the experience of feel...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...,[Former Governor Arnold Schwarzenegger lifted ...


In [14]:
def max_token(sentences):
    return sum([len(tokenizer.tokenize(sentence)) for sentence in sentences])

df['tokenized_text'].apply(max_token).max()

Token indices sequence length is longer than the specified maximum sequence length for this model (1164 > 1024). Running this sequence through the model will result in indexing errors


2617

In [15]:
df['highlights'].apply(len).max()

3384

In [16]:
df['highlights'].apply(len).median()

294.0

In [18]:
df=df.drop(columns=['tokenized_text'])

In [19]:
df.head()

Unnamed: 0,article,highlights
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...


In [8]:
def tokenize_sentences(text):
    sentences=nltk.tokenize.sent_tokenize(text)
    
    # initialize
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
      count += 1
      combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter

      if combined_length  <= tokenizer.max_len_single_sentence: # if it doesn't exceed
        chunk += sentence + " " # add the sentence to the chunk
        length = combined_length # update the length counter

        # if it is the last sentence
        if count == len(sentences) - 1:
          chunks.append(chunk.strip()) # save the chunk

      else: 
        chunks.append(chunk.strip()) # save the chunk

        # reset 
        length = 0 
        chunk = ""

        # take care of the overflow sentence
        chunk += sentence + " "
        length = len(tokenizer.tokenize(sentence))
    return chunks

In [11]:
df["chunks"]=df['article'].apply(tokenize_sentences)

Token indices sequence length is longer than the specified maximum sequence length for this model (1164 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
df

Unnamed: 0,article,highlights,chunks
0,Lewis Hamilton’s victory was his seventh of th...,Hamilton has surpassed David Coulthard's Briti...,[Lewis Hamilton’s victory was his seventh of t...
1,Tigers may be fit and dynamic creatures who ca...,Tiger takes a ride on safari car in South Kore...,[Tigers may be fit and dynamic creatures who c...
2,"As London Fashion Week gets underway, fashioni...",Christian Cowan-Sanluis collaborated with Acer...,"[As London Fashion Week gets underway, fashion..."
3,Using Botox diminishes the experience of feeli...,Aesthetic nurse Helen Collier says young peopl...,[Using Botox diminishes the experience of feel...
4,Former Governor Arnold Schwarzenegger lifted t...,Page Six reports that a lapel pin that once po...,[Former Governor Arnold Schwarzenegger lifted ...
...,...,...,...
19995,Nigel Clough's brother Simon has noticed how t...,Nigel Clough has recorded impressive cup recor...,[Nigel Clough's brother Simon has noticed how ...
19996,The death toll from a powerful cyclone which b...,Cyclone Hudhud had 120mph winds when it made l...,[The death toll from a powerful cyclone which ...
19997,Sotheby's is being sued over claims a painting...,Painting sold in 2006 was attributed to a foll...,[Sotheby's is being sued over claims a paintin...
19998,Australian batsman dies aged 25 two days after...,Phillip Hughes dies two days after being hit o...,[Australian batsman dies aged 25 two days afte...


In [15]:
df_sample = df.sample(n=10, replace=False)

In [16]:
import time

# Record start time
start_time = time.time()

def generate_summary(chunks):
    tokenized_inputs = [tokenizer(chunk, return_tensors="pt") for chunk in chunks]
    outputs=""
    for tokenized_input in tokenized_inputs:
        output_tn = model.generate(**tokenized_input)
        output=tokenizer.decode(*output_tn, skip_special_tokens=True)
        outputs=outputs+output+" "
    return outputs

df_sample["Summary_Model"]=df_sample["chunks"].apply(generate_summary)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print("Execution time:", elapsed_time, "seconds")

Execution time: 401.97716307640076 seconds


In [17]:
print("Execution time:", (elapsed_time/60), "minutes")

Execution time: 6.699619384606679 minutes


In [18]:
df_sample

Unnamed: 0,article,highlights,chunks,Summary_Model
7042,Silly stunt: Judy and the rubber chicken. Her ...,Judy Murray's business deals could make her mo...,[Silly stunt: Judy and the rubber chicken. Her...,Judy Murray has taken on a Grand Slam of publ...
14424,Roger Federer put on a show for his fellow spo...,Seven-time champion put on a masterclass on Ce...,[Roger Federer put on a show for his fellow sp...,Roger Federer took just 81 minutes to beat Sa...
18989,A Chinese student has died after being found s...,Body of Jiao Chin was found hanging over the f...,[A Chinese student has died after being found ...,The body of Jiao Chin was found hanging over ...
11243,New radar images have shown that British colon...,A satellite survey of area around Albemarle So...,[New radar images have shown that British colo...,New radar images have shown that British colo...
15834,The future of smoke alarms will not only know ...,AirGuard accurately distinguishes between ciga...,[The future of smoke alarms will not only know...,AirGuard accurately recognises smoke from a f...
8380,Ryan Taylor has spent the past two years on tr...,Newcastle manager Alan Pardew has confirmed Ry...,[Ryan Taylor has spent the past two years on t...,Ryan Taylor could make his first appearance o...
9481,Scobee. Josh Scobee. That’s how the Jacksonvil...,Josh Scobee has played in the NFL for 11 years...,[Scobee. Josh Scobee. That’s how the Jacksonvi...,Jacksonville Jaguars kicker Josh Scobee is th...
8572,Admission: Clive Goodman (pictured today) said...,Clive Goodman tells court he hacked voicemails...,[Admission: Clive Goodman (pictured today) sai...,Former News of the World royal. editor hacked...
12694,A teenager accused of shooting and stabbing an...,Maxwell Winkler has been arrested for the murd...,[A teenager accused of shooting and stabbing a...,"Maxwell Winkler, 17, is accused of shooting a..."
5170,Anthony Joshua believes his monastic lifestyle...,Anthony Joshua will mark his first anniversary...,[Anthony Joshua believes his monastic lifestyl...,Anthony Joshua takes on Denis Bakhtov for his...


In [20]:
df_sample.to_csv("sample_output.csv", index=False)

In [29]:
!pip install rouge

Collecting rouge
  Obtaining dependency information for rouge from https://files.pythonhosted.org/packages/32/7c/650ae86f92460e9e8ef969cc5008b24798dcf56a9a8947d04c78f550b3f5/rouge-1.0.1-py3-none-any.whl.metadata
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Using cached rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [24]:
model_out = df_sample["Summary_Model"].tolist()

reference = df_sample["highlights"].tolist()

In [25]:
model_out

[' Judy Murray has taken on a Grand Slam of public appearances and lucrative business commitments that could make her more than £150,000 in a year. Her week started with an unlikely appearance on Radio 4’s Desert Island Discs. Then we witnessed her ‘ambassadorship’ for Lavazza coffee. Judy was seen helping to dispense free coffee to Wimbledon fans, and manning a coffee stall. ',
 ' Roger Federer took just 81 minutes to beat Santiago Giraldo in SW19. David Beckham, Sachin Tendulkar and Sir Bobby Charlton watched in the Royal Box. Federer joined Andy Murray in reaching round four without dropping a set. David Haye and Anthony Joshua were also in the royal box. ',
 " The body of Jiao Chin was found hanging over the fence at the Xishuangbanna Dai Prefecture in south-western China's Yunnan Province. Fellow students discovered the 22-year-old's body in the early hours of the morning. The fence had been erected around the university accommodation after a string of intruders tried to break in.

In [26]:
reference

["Judy Murray's business deals could make her more than £150,000 a year .\nHer deal with Robinsons, the drinks brand, is worth around £50,000 .\nTennis fans have in the past struggled to warm to Andy Murray's mother .",
 'Seven-time champion put on a masterclass on Centre Court .\nFederer took just 81 minutes to beat Giraldo .\nDavid Beckham and Sachin Tendulkar were among guests in Royal Box .',
 'Body of Jiao Chin was found hanging over the fence at a university campus .\nGrim discovery was made by fellow students in early hours of the morning .\nBelieved she tried to climb electric fence to sneak away to see her boyfriend .\nFence was erected to keep intruders out from accommodation .',
 "A satellite survey of area around Albemarle Sound has identified key areas of interest .\nScientists are searching these areas using ground penetrating radar (GPR)\nGPR measures the depth that signals travel before hitting a hidden object .\nSo far, they have found a previously 'undetected pattern'

In [32]:
from rouge import Rouge

rouge = Rouge()

rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.45503933168394833,
  'p': 0.3343866843386429,
  'f': 0.3804206636480796},
 'rouge-2': {'r': 0.19327438415272272,
  'p': 0.13522689616829275,
  'f': 0.1563495894631115},
 'rouge-l': {'r': 0.433472178506888,
  'p': 0.3181305198183274,
  'f': 0.3619058984742755}}

In [33]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")




model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [34]:
import time

# Record start time
start_time = time.time()

def gen_sum(chunks):
    for chunk in chunks:
        outputs=""
        output=summarizer(chunk, max_length=250, min_length=30, do_sample=False)[0]['summary_text']
        outputs=outputs+output+" "
    return outputs

df_sample["Summary_Model_2"]=df_sample["chunks"].apply(generate_summary)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print("Execution time:", (elapsed_time/60), "minutes")

Execution time: 8.022761873404185 minutes


In [35]:
df_sample

Unnamed: 0,article,highlights,chunks,Summary_Model,Summary_Model_2
7042,Silly stunt: Judy and the rubber chicken. Her ...,Judy Murray's business deals could make her mo...,[Silly stunt: Judy and the rubber chicken. Her...,Judy Murray has taken on a Grand Slam of publ...,Judy Murray has taken on a Grand Slam of publ...
14424,Roger Federer put on a show for his fellow spo...,Seven-time champion put on a masterclass on Ce...,[Roger Federer put on a show for his fellow sp...,Roger Federer took just 81 minutes to beat Sa...,Roger Federer took just 81 minutes to beat Sa...
18989,A Chinese student has died after being found s...,Body of Jiao Chin was found hanging over the f...,[A Chinese student has died after being found ...,The body of Jiao Chin was found hanging over ...,The body of Jiao Chin was found hanging over ...
11243,New radar images have shown that British colon...,A satellite survey of area around Albemarle So...,[New radar images have shown that British colo...,New radar images have shown that British colo...,New radar images have shown that British colo...
15834,The future of smoke alarms will not only know ...,AirGuard accurately distinguishes between ciga...,[The future of smoke alarms will not only know...,AirGuard accurately recognises smoke from a f...,AirGuard accurately recognises smoke from a f...
8380,Ryan Taylor has spent the past two years on tr...,Newcastle manager Alan Pardew has confirmed Ry...,[Ryan Taylor has spent the past two years on t...,Ryan Taylor could make his first appearance o...,Ryan Taylor could make his first appearance o...
9481,Scobee. Josh Scobee. That’s how the Jacksonvil...,Josh Scobee has played in the NFL for 11 years...,[Scobee. Josh Scobee. That’s how the Jacksonvi...,Jacksonville Jaguars kicker Josh Scobee is th...,Jacksonville Jaguars kicker Josh Scobee is th...
8572,Admission: Clive Goodman (pictured today) said...,Clive Goodman tells court he hacked voicemails...,[Admission: Clive Goodman (pictured today) sai...,Former News of the World royal. editor hacked...,Former News of the World royal. editor hacked...
12694,A teenager accused of shooting and stabbing an...,Maxwell Winkler has been arrested for the murd...,[A teenager accused of shooting and stabbing a...,"Maxwell Winkler, 17, is accused of shooting a...","Maxwell Winkler, 17, is accused of shooting a..."
5170,Anthony Joshua believes his monastic lifestyle...,Anthony Joshua will mark his first anniversary...,[Anthony Joshua believes his monastic lifestyl...,Anthony Joshua takes on Denis Bakhtov for his...,Anthony Joshua takes on Denis Bakhtov for his...


In [36]:
df_sample.to_csv("sample_output.csv", index=False)

In [38]:
model_out_2 = df_sample["Summary_Model_2"].tolist()

reference = df_sample["highlights"].tolist()

rouge = Rouge()

rouge.get_scores(model_out_2, reference, avg=True)

{'rouge-1': {'r': 0.45503933168394833,
  'p': 0.3343866843386429,
  'f': 0.3804206636480796},
 'rouge-2': {'r': 0.19327438415272272,
  'p': 0.13522689616829275,
  'f': 0.1563495894631115},
 'rouge-l': {'r': 0.433472178506888,
  'p': 0.3181305198183274,
  'f': 0.3619058984742755}}