In [2]:
# Import Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Mount Google Drive into the Colab filesystem so that files stored there
# (the dataset CSV used below) are reachable under /content/drive/.
from google.colab import drive

drive.mount("/content/drive/")

Mounted at /content/drive/


In [5]:
! pip install simplet5 -q

[K     |████████████████████████████████| 1.2 MB 5.4 MB/s 
[K     |████████████████████████████████| 2.8 MB 33.2 MB/s 
[K     |████████████████████████████████| 919 kB 35.3 MB/s 
[K     |████████████████████████████████| 329 kB 43.9 MB/s 
[K     |████████████████████████████████| 829 kB 40.5 MB/s 
[K     |████████████████████████████████| 596 kB 23.8 MB/s 
[K     |████████████████████████████████| 132 kB 52.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 38.9 MB/s 
[K     |████████████████████████████████| 59 kB 6.2 MB/s 
[K     |████████████████████████████████| 895 kB 31.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 45.0 MB/s 
[K     |████████████████████████████████| 192 kB 44.5 MB/s 
[K     |████████████████████████████████| 271 kB 55.3 MB/s 
[K     |████████████████████████████████| 160 kB 50.5 MB/s 
[?25h  Building wheel for simplet5 (setup.py) ... [?25l[?25hdone
  Building wheel for future (setup.py) ... [?25l[?25hdone


In [6]:
# Location of the news-summary CSV inside the mounted Google Drive.
# NOTE(review): the "AIML " path segment contains a trailing space; presumably
# this matches the actual Drive folder name, so it is kept verbatim.
dataset_path = "/content/drive/MyDrive/AIML /NLP/data/news_summary.csv"

In [7]:
# Read only the two columns needed for summarization: the article body
# ("text") and its headline ("headlines"). The file is decoded as latin-1.
df = pd.read_csv(
    dataset_path,
    usecols=["headlines", "text"],
    encoding="latin-1",
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

In [8]:
# SimpleT5 trains on a dataframe with exactly two columns, "source_text"
# (model input) and "target_text" (expected output): the article body becomes
# the source, the headline becomes the target, in that column order.
df = (
    df.rename(columns={"text": "source_text", "headlines": "target_text"})
      .loc[:, ["source_text", "target_text"]]
)

In [None]:
# Preview the first five rows after renaming/reordering the columns.
df.head(n=5)

In [9]:
# T5 data prep: add the summarization task prefix to every source text.
# The same "summarize: " prefix is used on the inputs passed to the model at
# inference time, so training and inference inputs share one format.
#
# Rows that already start with the prefix are left untouched, which makes this
# cell safe to re-run (the plain concatenation would otherwise produce
# "summarize: summarize: ..."). Missing values stay missing, as before.
has_prefix = df['source_text'].str.startswith("summarize: ", na=False)
df['source_text'] = df['source_text'].where(has_prefix, "summarize: " + df['source_text'])
df

Unnamed: 0,source_text,target_text
0,summarize: The Administration of Union Territo...,Daman & Diu revokes mandatory Rakshabandhan in...
1,summarize: Malaika Arora slammed an Instagram ...,Malaika slams user who trolled her for 'divorc...
2,summarize: The Indira Gandhi Institute of Medi...,'Virgin' now corrected to 'Unmarried' in IGIMS...
3,summarize: Lashkar-e-Taiba's Kashmir commander...,Aaj aapne pakad liya: LeT man Dujana before be...
4,summarize: Hotels in Maharashtra will train th...,Hotel staff to get training to spot signs of s...
...,...,...
4509,summarize: Fruit juice concentrate maker Rasna...,Rasna seeking ?250 cr revenue from snack categ...
4510,summarize: Former Indian cricketer Sachin Tend...,Sachin attends Rajya Sabha after questions on ...
4511,"summarize: Aamir Khan, while talking about rea...",Shouldn't rob their childhood: Aamir on kids r...
4512,summarize: The Maharashtra government has init...,"Asha Bhosle gets ?53,000 power bill for unused..."


In [10]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable. The cell displays the shapes of both partitions.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=22,
)
(train_df.shape, test_df.shape)

((3159, 2), (1355, 2))

In [11]:
# Create the SimpleT5 wrapper and download the pre-trained "t5-base"
# weights that will be fine-tuned in the next cell.
from simplet5 import SimpleT5

model = SimpleT5()
model.from_pretrained(
    model_type="t5",
    model_name="t5-base",
)

Global seed set to 42


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [12]:
# Fine-tune the pre-trained model on the training split.
# - Training rows are capped at 5000 (the split above is smaller than that,
#   so every training row is used).
# - Only the first 100 held-out rows are used for per-epoch validation.
# - Source / target sequences are limited to 128 / 50 tokens respectively.
model.train(
    train_df=train_df.iloc[:5000],
    eval_df=test_df.iloc[:100],
    source_max_token_len=128,
    target_max_token_len=50,
    batch_size=8,
    max_epochs=5,
    use_gpu=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [13]:
#output folder content
! ( cd outputs; ls )

simplet5-epoch-0-train-loss-1.6116  simplet5-epoch-3-train-loss-0.7776
simplet5-epoch-1-train-loss-1.1861  simplet5-epoch-4-train-loss-0.6357
simplet5-epoch-2-train-loss-0.9523


In [14]:
# Load the fine-tuned model from the local outputs/ folder for inference.
#
# Checkpoint folder names embed the epoch number and that epoch's training
# loss (e.g. "simplet5-epoch-4-train-loss-0.6357"), so a hard-coded name stops
# working as soon as a re-run produces a slightly different loss. Instead,
# discover the checkpoints and pick the one from the highest epoch.
from pathlib import Path

checkpoint_dirs = sorted(
    Path("outputs").glob("simplet5-epoch-*-train-loss-*"),
    # Name layout: simplet5-epoch-<N>-train-loss-<loss>; field 2 is the epoch.
    key=lambda ckpt: int(ckpt.name.split("-")[2]),
)
if not checkpoint_dirs:
    raise FileNotFoundError(
        "No SimpleT5 checkpoints found in 'outputs/' - run the training cell first."
    )

latest_checkpoint = str(checkpoint_dirs[-1])
model.load_model("t5", latest_checkpoint, use_gpu=True)

In [20]:
# Use case 1: summarize a product description of the Inshorts news app.
# The input starts with the same "summarize: " task prefix that was prepended
# to the training source texts. The cell displays the list returned by
# model.predict.
text_to_summarize="""summarize: Inshorts is a news app that selects latest and best news from multiple national 
and international sources and summarises them to present in a short and crisp 60 words or less format,
 personalized for you, in both, English or Hindi. All summarised stories contain only headlines 
 and facts, no opinions, to help you stay informed of the current affairs. Whether it’s the 
 latest government policies or shakeups in bollywood, we get them covered and delivered 
 super fast! Get updated with the latest news and current affairs in a jiffy!
"""
model.predict(text_to_summarize)

['Inshorts summarises latest news in 60 words or less']

In [21]:
# Use case 2: summarize a product description of the Shortpedia news app.
# The input starts with the same "summarize: " task prefix that was prepended
# to the training source texts. The cell displays the list returned by
# model.predict.
text_to_summarize="""summarize: Shortpedia is a news app that aggregates latest news and 
other content such as videos & blogs and summarizes in less than 70 words and update 
you with latest news in seconds. We choose the news from multiple national and international 
sources and deliver it in personalized format to you. We have given the option to choose 
the language i.e English & Hindi.
"""
model.predict(text_to_summarize)

['Shortpedia updates you with latest news in seconds']

In [22]:
# Use case 3: summarize a multi-paragraph news article (WHO representative on
# the Delta Plus coronavirus variant). Paragraphs are separated by blank lines
# inside the string. The input starts with the same "summarize: " task prefix
# that was prepended to the training source texts. The cell displays the list
# returned by model.predict.
text_to_summarize="""summarize: Vaccination and safety measures such as wearing face masks are essential when it comes to fighting the Delta Plus coronavirus variant, World Health Organization (WHO) representative to Russia Melita Vujnovic said.

"Vaccination plus masks, because just a vaccine is not enough with 'Delta Plus'. We need to make an effort over a short period of time, otherwise there would be a lockdown," Vujnovic said on the Soloviev Live YouTube show.

She explained that vaccination is essential because it lowers the probability of spreading the virus and lowers the risks of severe disease. However, "additional measures" will probably be required as well, Vujnovic warned.

Earlier in June, the WHO included the Delta variant in its list of coronavirus variants of concern as the strain had become prevalent and has caused a resurgence of COVID-19 cases in some countries, including Russia. India has also reported multiple cases of the Delta Plus strain, which was first discovered in March.
"""
model.predict(text_to_summarize)

['Vaccination is essential to fight Delta Plus: WHO representative']

In [18]:
# Use case 4: summarize a multi-paragraph news article (Twitter's interim
# grievance officer for India stepping down). Paragraphs are separated by
# blank lines inside the string. The input starts with the same "summarize: "
# task prefix that was prepended to the training source texts. The cell
# displays the list returned by model.predict.
text_to_summarize="""summarize: Twitter’s interim resident grievance officer for India has stepped down, leaving the micro-blogging site without a grievance official as mandated by the new IT rules to address complaints from Indian subscribers, according to a source.

The source said that Dharmendra Chatur, who was recently appointed as interim resident grievance officer for India by Twitter, has quit from the post.

The social media company’s website no longer displays his name, as required under Information Technology (Intermediary Guidelines and Digital Media Ethics Code) Rules 2021.

Twitter declined to comment on the development.

The development comes at a time when the micro-blogging platform has been engaged in a tussle with the Indian government over the new social media rules. The government has slammed Twitter for deliberate defiance and failure to comply with the country’s new IT rules.
"""
model.predict(text_to_summarize)

["Twitter's India grievance officer steps down, leaving micro-blogging site"]

In [23]:
# Use case 5: summarize a news paragraph about Covishield and the EU
# 'Green Pass'. The continuation lines of the literal are indented, so that
# leading whitespace is part of the text passed to the model. The input starts
# with the same "summarize: " task prefix that was prepended to the training
# source texts. The cell displays the list returned by model.predict.
text_to_summarize="""summarize: Travellers vaccinated with Covishield may not be eligible for the 
European Union’s ‘Green Pass’ that will be available for use from July 1. Many EU member states
 have started issuing the digital “vaccine passport” that will enable Europeans to move freely
  for work or tourism. The immunity passport will serve as proof that a person has been 
  vaccinated against the coronavirus disease (Covid-19), or recently tested negative for
   the virus, or has the natural immunity built up from earlier infection.Covishield, a
    version of AstraZeneca Covid vaccine manufactured by Pune-based Serum Institute of 
    India (SII), has not been approved by the EMA for the European market. The EU green 
    pass will only recognise the Vaxzervria version of the AstraZeneca vaccine that is 
    manufactured in the UK or other sites around Europe.
"""
model.predict(text_to_summarize)

['Covishield-vaccinated travellers not eligible for EU green pass']