In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [4]:
import pandas as pd

In [6]:
df = pd.read_csv('news_summary.csv', encoding='latin-1', engine='python')

In [7]:
len(df)

4514

In [8]:
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [9]:
df = df[['text', 'ctext']]
df.columns = ['summary', 'text']
df.dropna()
df.head()

Unnamed: 0,summary,text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [10]:
df['text'][0]

'The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ? one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ? were issued by the Dama

In [11]:
len(df['text'][1])

2382

In [12]:
len(df['summary'][1])

361

In [13]:
!pip install -q pytorch_lightning
!pip install -q transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/815.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m809.0/815.2 kB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/926.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
import torch
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader
import re
from transformers import AdamW
from sklearn.model_selection import train_test_split

In [15]:
class NewsDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, source_len, target_len):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.target_len = target_len

    def __len__(self):
        return len(self.target_texts) - 1

    def __getitem__(self, idx):
        whitespace_handler = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
        text = " ".join(str(self.source_texts[idx]).split())
        summary = " ".join(str(self.target_texts[idx]).split())

        source = self.tokenizer.batch_encode_plus([whitespace_handler(text)],
                                                max_length= self.source_len,
                                                padding='max_length',
                                                truncation=True,
                                                return_attention_mask=True,
                                                add_special_tokens=True,
                                                return_tensors='pt')

        target = self.tokenizer.batch_encode_plus([whitespace_handler(summary)],
                                                max_length = self.target_len,
                                                padding='max_length',
                                                truncation=True,
                                                return_attention_mask=True,
                                                add_special_tokens=True,
                                                return_tensors='pt')

        labels = target['input_ids']
        labels[labels == 0] = -100

        return (source['input_ids'].squeeze(),
                source['attention_mask'].squeeze(),
                labels.squeeze(),
                target['attention_mask'].squeeze())


In [16]:
class NewsDataLoader(pl.LightningDataModule):
    def __init__(self, file_path, tokenizer, batch_size, val_split_size,
                 columns_name, source_len=1024, target_len=128, corpus_size=1000):
        super().__init__()
        self.tokenizer = tokenizer
        self.file_path = file_path
        self.batch_size = batch_size
        self.split_size = val_split_size
        self.nrows = corpus_size
        self.columns_name = columns_name
        self.target_len = target_len
        self.source_len = source_len

    def prepare_data(self):
        data = pd.read_csv(self.file_path, nrows=self.nrows, encoding='latin-1')
        data = data[self.columns_name]
        data = data.dropna()
        self.target_text = data.iloc[:,0].values
        self.source_text = data.iloc[:,-1].values

    def setup(self, stage=None):
        X_train, y_train, X_val, y_val = train_test_split(
            self.source_text, self.target_text, test_size=self.split_size
        )

        self.train_dataset = (X_train, y_train)
        self.val_dataset = (X_val, y_val)

    def train_dataloader(self):
        train_data = NewsDataset(source_texts=self.train_dataset[0],
                             target_texts=self.train_dataset[1],
                             tokenizer=self.tokenizer,
                             source_len=self.source_len,
                             target_len=self.target_len
                            )
        return DataLoader(train_data, self.batch_size, num_workers=6, shuffle=True, pin_memory=True)

    def val_dataloader(self):
        val_data = NewsDataset(source_texts=self.val_dataset[0],
                          target_texts=self.val_dataset[1],
                          tokenizer=self.tokenizer,
                          source_len=self.source_len,
                          target_len=self.target_len
                          )
        return DataLoader(val_data, self.batch_size, num_workers=6, pin_memory=True)

In [92]:
class T5smallFinetuner(pl.LightningModule):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer

    def forward(self, input_ids, attention_mask, decoder_attention_mask=None, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )
        return outputs.loss

    def _step(self, batch):
        source_input_ids, source_attention_mask, target_input_ids, target_attention_mask = batch
        loss = self(
            input_ids=source_input_ids,
            attention_mask=source_attention_mask,
            decoder_attention_mask=target_attention_mask,
            labels=target_input_ids
        )
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log("loss", loss, prog_bar=True, on_epoch=True)  # Log the training loss
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)  # Compute validation loss
        self.log("val_loss", loss, prog_bar=True, on_epoch=True)  # Log validation loss
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }



In [93]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [94]:
dataloader = NewsDataLoader(tokenizer=tokenizer,
                            file_path='news_summary.csv',
                            val_split_size=0.3, batch_size=4, columns_name=['text', 'ctext'])

dataloader.prepare_data()

dataloader.setup()

In [95]:
for i in dataloader.train_dataloader():
  print(type(i), len(i), type(i[0]), i[2].shape)
  break



<class 'list'> 4 <class 'torch.Tensor'> torch.Size([4, 128])


In [96]:
model = T5smallFinetuner(model, tokenizer)

In [97]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)
logger = TensorBoardLogger("lightning_logs", name='summary')

In [98]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

early_stop_callback = EarlyStopping(monitor="val_loss", patience=5, verbose=False, mode="min")


In [99]:
trainer = pl.Trainer(check_val_every_n_epoch=1, max_epochs=1, accelerator='gpu',
                     callbacks=[early_stop_callback, checkpoint_callback],
                     logger=logger
                     )

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [100]:
torch.cuda.empty_cache()
trainer.fit(model, dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
0         Modules in train mode
277       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 75: 'val_loss' reached 4.26202 (best 4.26202), saving model to '/content/checkpoints/best-checkpoint.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [101]:
metric = trainer.callback_metrics
loss = metric['val_loss']
float(loss)

4.26202392578125

In [102]:
def summarizeText(text):
    whitespace_handler = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
    text_encoding = tokenizer(
        whitespace_handler(text),
        max_length=400,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    generated_ids = model.model.generate(
        input_ids=text_encoding['input_ids'],
        attention_mask=text_encoding['attention_mask'],
        max_length=100,
        num_beams=4,
        no_repeat_ngram_size=2,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for gen_id in generated_ids
    ]
    return "".join(preds)

In [103]:
text ="""Researchers have found that regular physical activity can significantly improve mental health. Exercise reduces stress, anxiety, and depression by boosting the production of endorphins, the body's natural mood elevators. Additionally, physical activity improves sleep quality, increases energy levels, and enhances overall well-being, leading to a healthier and happier life. """
print(len(text))
summarizeText(text)

376


"mental health. Exercise reduces stress, anxiety, and depression by boosting the production of endorphins, the body's natural mood elevators. Additionally, physical activity improves sleep quality, increases energy levels and enhances overall well-being, leading to a healthier and happier life."

In [104]:
torch.save({'model_state_dict': model.state_dict(),
    'loss': loss}, 't5-small.pt')

In [105]:
base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [106]:
prediction_model = T5smallFinetuner(model=base_model, tokenizer=tokenizer)

In [107]:
state_dict = torch.load('/content/t5-small.pt')

  state_dict = torch.load('/content/t5-small.pt')


In [108]:
prediction_model.load_state_dict(state_dict['model_state_dict'])

<All keys matched successfully>

In [109]:
prediction_model.eval()

T5smallFinetuner(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_fea

In [110]:
def summarizeTextP(text, model):
    whitespace_handler = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
    text_encoding = tokenizer(
        whitespace_handler(text),
        max_length=400,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    generated_ids = model.model.generate(
        input_ids=text_encoding['input_ids'],
        attention_mask=text_encoding['attention_mask'],
        max_length=100,
        num_beams=4,
        no_repeat_ngram_size=2,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for gen_id in generated_ids
    ]
    return "".join(preds)

In [111]:
text ="""Rohit Sharma, the captain of the Indian cricket team, led India to a memorable victory against Australia in the recent Test series. While Jasprit Bumrah captained the team in the first Test, Sharma’s leadership and stellar performances with the bat, including scoring multiple centuries, were key to India’s success."""
print(len(text))
summarizeTextP(text, model=prediction_model)

316


"Rohit Sharma, captain of the Indian cricket team, led India to a memorable victory against Australia in the recent Test series . Sharma’s leadership and stellar performances with the bat, including scoring multiple centuries, were key to India's success."