In [1]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Download the "news-summary" dataset archive from Kaggle into the working directory.
!kaggle datasets download -d sunnysai12345/news-summary

Downloading news-summary.zip to /content
 86% 17.0M/19.8M [00:01<00:00, 20.5MB/s]
100% 19.8M/19.8M [00:01<00:00, 14.4MB/s]


In [3]:
!unzip news-summary.zip

Archive:  news-summary.zip
  inflating: news_summary.csv        
  inflating: news_summary_more.csv   


In [4]:
!pip install -U transformers
!pip install -U sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 58.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)


In [5]:
import torch
import pickle
import pandas as pd
import re
from transformers import pipeline

In [6]:
# Read the news-summary CSV (decoded as latin1), then show (rows, columns).
data = pd.read_csv(
    '/content/news_summary.csv',
    encoding='latin1',
)
data.shape

(4514, 6)

In [7]:
# Preview the first rows to see which columns the dataset provides.
data.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [8]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### BART

In [56]:
# Build the DistilBART (XSum) summarization pipeline on the selected device.
summarizer_bart = pipeline(
    task="summarization",
    model="sshleifer/distilbart-xsum-12-3",
    device=device,
)

Downloading:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/716M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Summarize every row of the `text` column with BART, 32 inputs per batch.
# Sampling is disabled and the generated length is bounded to 30-60.
bart_inputs = data['text'].tolist()
bart_op = summarizer_bart(
    bart_inputs,
    min_length=30,
    max_length=60,
    do_sample=False,
    batch_size=32,
)

In [None]:
# Cache the BART outputs on disk so they can be reloaded without re-running inference.
with open('/content/bart_op.pkl', mode='wb') as cache_file:
  pickle.dump(bart_op, cache_file)

In [24]:
# Restore the cached BART outputs.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open('/content/bart_op.pkl', mode='rb') as cache_file:
  bart_op = pickle.load(cache_file)

### Pegasus

In [55]:
%%time
summarizer_peg = pipeline("summarization",
                      "sshleifer/distill-pegasus-xsum-16-4",
                      device=device)

Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

CPU times: user 31.6 s, sys: 7.01 s, total: 38.6 s
Wall time: 1min 31s


In [None]:
# Summarize every row of the `text` column with Pegasus, 32 inputs per batch.
# Sampling is disabled and the generated length is bounded to 30-60.
peg_inputs = data['text'].tolist()
peg_op = summarizer_peg(
    peg_inputs,
    min_length=30,
    max_length=60,
    do_sample=False,
    batch_size=32,
)

In [None]:
# Cache the Pegasus outputs on disk so they can be reloaded without re-running inference.
with open('/content/pegasus_op.pkl', mode='wb') as cache_file:
  pickle.dump(peg_op, cache_file)

In [10]:
# Restore the cached Pegasus outputs.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open('/content/pegasus_op.pkl', mode='rb') as cache_file:
  peg_op = pickle.load(cache_file)

In [11]:
# Inspect the first Pegasus result (a dict holding the text under 'summary_text').
peg_op[0]

{'summary_text': 'A government in the southern Indian state of Haryana has been forced to withdraw a circular that made it compulsory for women to tie raks to their male colleagues.'}

### T5

In [54]:
# Build the T5-base (CNN/DailyMail) summarization pipeline on the selected device.
summarizer_t5 = pipeline(
    task="summarization",
    model="flax-community/t5-base-cnn-dm",
    device=device,
)

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [None]:
# Summarize every row of the `text` column with T5, 32 inputs per batch.
# Sampling is disabled and the generated length is bounded to 30-60.
t5_inputs = data['text'].tolist()
t5_op = summarizer_t5(
    t5_inputs,
    min_length=30,
    max_length=60,
    do_sample=False,
    batch_size=32,
)

In [None]:
# Cache the T5 outputs on disk so they can be reloaded without re-running inference.
with open('/content/t5_op.pkl', mode='wb') as cache_file:
  pickle.dump(t5_op, cache_file)

In [19]:
# Restore the cached T5 outputs.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open('/content/t5_op.pkl', mode='rb') as cache_file:
  t5_op = pickle.load(cache_file)

In [20]:
# Inspect the first T5 result (a dict holding the text under 'summary_text').
t5_op[0]

{'summary_text': 'Administration of Union Territory Daman and Diu revoked order on August 7 order . Mean Mean for women to tie rakhis to male colleagues on occasion of Rakshabandhan . Issued with the circular after receiving flak from employees .'}

### Load all summaries into the DataFrame

In [61]:
def _unpickle(path):
  """Return the object stored in the pickle file at `path`."""
  # NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
  with open(path, 'rb') as cache_file:
    return pickle.load(cache_file)

# Reload the cached outputs of all three summarizers.
bart_op = _unpickle('/content/bart_op.pkl')
peg_op = _unpickle('/content/pegasus_op.pkl')
t5_op = _unpickle('/content/t5_op.pkl')

In [25]:
# Pull the generated text out of each pipeline result and store it as one
# DataFrame column per model.
for column, outputs in (
    ('bart_summary', bart_op),
    ('peg_summary', peg_op),
    ('t5_summary', t5_op),
):
  data[column] = [item['summary_text'] for item in outputs]

In [26]:
# Preview the frame with the three summary columns added.
data.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext,bart_summary,peg_summary,t5_summary
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,A decision to make it compulsory for women to...,A government in the southern Indian state of H...,Administration of Union Territory Daman and Di...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo...",Bollywood actress Malaika Arora has hit back ...,Bollywood star Malaika Arora has hit back at a...,"Arora slams Instagram user for ""divorcing a ri..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...,A medical institute in the eastern Indian sta...,A medical institute in the eastern Indian stat...,Bihar Health Minister did not consider the ter...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,A suspected militant who was caught by securi...,One of India's most wanted Lashkar-e-Taiba mil...,Kashmir commander Abu Dujana says he won't sur...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...,Hotels in the western Indian state of Maharas...,Hotels in India are to be trained in how to sp...,Hotels will train staff to spot signs of sex t...


### Evaluating model performance using cosine similarity between headline and summary embeddings

In [None]:
!pip install -U sentence-transformers

In [29]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to embed headlines and generated summaries.
sent_model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [30]:
# Embed every headline; the embeddings are requested as a torch tensor.
headline_texts = data['headlines'].tolist()
headlines_emb = sent_model.encode(headline_texts, convert_to_tensor=True)

In [31]:
# Embed every BART summary; the embeddings are requested as a torch tensor.
bart_texts = data['bart_summary'].tolist()
bart_summ_emb = sent_model.encode(bart_texts, convert_to_tensor=True)

In [32]:
# Embed the Pegasus and T5 summaries the same way as the headlines.
peg_texts = data['peg_summary'].tolist()
t5_texts = data['t5_summary'].tolist()
peg_summ_emb = sent_model.encode(peg_texts, convert_to_tensor=True)
t5_summ_emb = sent_model.encode(t5_texts, convert_to_tensor=True)

In [33]:
# CosineSimilarity reduces over dim 0, so both embedding matrices are
# transposed before the call. The scores are later stored as a DataFrame
# column, i.e. one headline-vs-summary score per row.
cos_sim = torch.nn.CosineSimilarity(dim=0)
headlines_t, bart_t = headlines_emb.T, bart_summ_emb.T
output_bart = cos_sim(headlines_t, bart_t)

In [34]:
# Same headline-vs-summary cosine score for the Pegasus and T5 summaries.
headlines_t = headlines_emb.T
output_peg = cos_sim(headlines_t, peg_summ_emb.T)
output_t5 = cos_sim(headlines_t, t5_summ_emb.T)

In [35]:
# Bring the BART scores back to host memory as a NumPy array and store them.
bart_scores = output_bart.cpu().numpy()
data['cosine_bart'] = bart_scores

In [36]:
# Store the Pegasus and T5 scores as NumPy-backed DataFrame columns.
for column, scores in (('cosine_peg', output_peg), ('cosine_t5', output_t5)):
  data[column] = scores.cpu().numpy()

In [37]:
# Spot-check the first two rows with the cosine-similarity columns added.
data.head(2)

Unnamed: 0,author,date,headlines,read_more,text,ctext,bart_summary,peg_summary,t5_summary,cosine_bart,cosine_peg,cosine_t5
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,A decision to make it compulsory for women to...,A government in the southern Indian state of H...,Administration of Union Territory Daman and Di...,0.546421,0.492621,0.737732
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo...",Bollywood actress Malaika Arora has hit back ...,Bollywood star Malaika Arora has hit back at a...,"Arora slams Instagram user for ""divorcing a ri...",0.508101,0.574763,0.633495


In [38]:
# Mean headline-vs-summary cosine similarity for BART.
data['cosine_bart'].mean()

0.54043573

In [39]:
# Mean headline-vs-summary cosine similarity for Pegasus.
data['cosine_peg'].mean()

0.5438341

In [40]:
# Mean headline-vs-summary cosine similarity for T5.
data['cosine_t5'].mean()

0.62276816

### Evaluate using [GLEU](https://www.nltk.org/api/nltk.translate.gleu_score.html) (Google BLEU) score

In [41]:
from nltk.translate.gleu_score import sentence_gleu

In [42]:
# Sentence-level GLEU of each BART summary (hypothesis) against the headline
# (single reference); both are tokenized by whitespace.
data['gleu_bart'] = [
    sentence_gleu([headline.split()], summary.split())
    for headline, summary in zip(data['headlines'], data['bart_summary'])
]

In [43]:
# Sentence-level GLEU of each Pegasus summary (hypothesis) against the headline
# (single reference); both are tokenized by whitespace.
data['gleu_peg'] = [
    sentence_gleu([headline.split()], summary.split())
    for headline, summary in zip(data['headlines'], data['peg_summary'])
]

In [44]:
# Sentence-level GLEU of each T5 summary (hypothesis) against the headline
# (single reference); both are tokenized by whitespace.
data['gleu_t5'] = [
    sentence_gleu([headline.split()], summary.split())
    for headline, summary in zip(data['headlines'], data['t5_summary'])
]

In [45]:
# Spot-check the first two rows with the GLEU columns added.
data.head(2)

Unnamed: 0,author,date,headlines,read_more,text,ctext,bart_summary,peg_summary,t5_summary,cosine_bart,cosine_peg,cosine_t5,gleu_bart,gleu_peg,gleu_t5
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,A decision to make it compulsory for women to...,A government in the southern Indian state of H...,Administration of Union Territory Daman and Di...,0.546421,0.492621,0.737732,0.011628,0.009091,0.026667
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo...",Bollywood actress Malaika Arora has hit back ...,Bollywood star Malaika Arora has hit back at a...,"Arora slams Instagram user for ""divorcing a ri...",0.508101,0.574763,0.633495,0.064103,0.182927,0.067568


In [46]:
# Mean sentence-level GLEU (summary vs. headline) for BART.
data['gleu_bart'].mean()

0.03845916747173347

In [47]:
# Mean sentence-level GLEU (summary vs. headline) for Pegasus.
data['gleu_peg'].mean()

0.036387685234989864

In [48]:
# Mean sentence-level GLEU (summary vs. headline) for T5.
data['gleu_t5'].mean()

0.04850934503082106

### T5 scores highest on both mean cosine similarity (0.62 vs. 0.54 for BART and Pegasus) and mean GLEU (0.049 vs. 0.038 and 0.036) against the headlines, so our winning model is **T5**

### Wiki Search and Summarization

In [49]:
!pip install wikipedia

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11695 sha256=56b4a254f6eac340b04ae720fe47af14ffe78ea4e25be213b358e08b6c60ee92
  Stored in directory: /root/.cache/pip/wheels/07/93/05/72c05349177dca2e0ba31a33ba4f7907606f7ddef303517c6a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [50]:
import wikipedia

In [51]:
# Fetch the Wikipedia page and keep its plain-text summary.
# auto_suggest=False looks up exactly the requested title; with the default
# (True) the library may silently substitute a "suggested" title and return a
# different page than the one asked for.
wikisearch = wikipedia.page("coronavirus", auto_suggest=False)
wikicontent = wikisearch.summary

In [52]:
# Show the fetched summary text that will be passed to the summarizers.
wikicontent

'Coronaviruses are a group of related RNA viruses that cause diseases in mammals and birds. In humans and birds, they cause respiratory tract infections that can range from mild to lethal. Mild illnesses in humans include some cases of the common cold (which is also caused by other viruses, predominantly rhinoviruses), while more lethal varieties can cause SARS, MERS and COVID-19, which is causing the ongoing pandemic. In cows and pigs they cause diarrhea, while in mice they cause hepatitis and encephalomyelitis.\nCoronaviruses constitute the subfamily Orthocoronavirinae, in the family Coronaviridae, order Nidovirales and realm Riboviria. They are enveloped viruses with a positive-sense single-stranded RNA genome and a nucleocapsid of helical symmetry. The genome size of coronaviruses ranges from approximately 26 to 32 kilobases, one of the largest among RNA viruses. They have characteristic club-shaped spikes that project from their surface, which in electron micrographs create an ima

In [57]:
# Summarize the Wikipedia text with T5 (generated length 30-100, no sampling).
# truncation=True clips an input that exceeds the model's maximum input length
# instead of passing an over-long sequence to the model.
summarizer_t5(
    wikicontent,
    max_length=100,
    min_length=30,
    do_sample=False,
    truncation=True,
)

[{'summary_text': 'In humans and birds, they cause respiratory tract infections that can range from mild to lethal . In cows and pigs they cause diarrhea while in mice they cause hepatitis and encephalomyelitis .'}]

In [58]:
# Summarize the Wikipedia text with BART (generated length 30-100, no sampling).
# truncation=True clips an input that exceeds the model's maximum input length
# instead of passing an over-long sequence to the model.
summarizer_bart(
    wikicontent,
    max_length=100,
    min_length=30,
    do_sample=False,
    truncation=True,
)

[{'summary_text': ' Coronaviruses are one of the most common and deadly viruses in humans and birds, according to the Department for Environment, Food and Rural Affairs.'}]

In [59]:
# Summarize the Wikipedia text with Pegasus (generated length 30-100, no sampling).
# truncation=True clips an input that exceeds the model's maximum input length
# instead of passing an over-long sequence to the model.
summarizer_peg(
    wikicontent,
    max_length=100,
    min_length=30,
    do_sample=False,
    truncation=True,
)

[{'summary_text': 'The coronaviruses are a group of viruses that cause diseases in animals and birds, including SARS, MERS, SARS, SARS, SARS and SARS.'}]