<a href="https://colab.research.google.com/github/RoyElkabetz/Text-Summarization-with-Deep-Learning/blob/main/T5_Summarizer_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive (Colab only); the dataset and
# checkpoint paths configured further down live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Report the GPU (if any) attached to this runtime.
!nvidia-smi

Sun Jun 27 17:48:10 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Install pinned versions of the two libraries this notebook's model code relies on.
!pip install --quiet transformers==4.5.0
!pip install --quiet pytorch-lightning==1.2.7

[K     |████████████████████████████████| 2.2MB 38.0MB/s 
[K     |████████████████████████████████| 901kB 46.6MB/s 
[K     |████████████████████████████████| 3.3MB 55.2MB/s 
[K     |████████████████████████████████| 839kB 31.0MB/s 
[K     |████████████████████████████████| 829kB 52.6MB/s 
[K     |████████████████████████████████| 122kB 54.3MB/s 
[K     |████████████████████████████████| 276kB 54.7MB/s 
[K     |████████████████████████████████| 276kB 47.4MB/s 
[K     |████████████████████████████████| 1.3MB 50.3MB/s 
[K     |████████████████████████████████| 143kB 60.0MB/s 
[K     |████████████████████████████████| 296kB 53.3MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


In [4]:
import json
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
from torchtext.datasets import AG_NEWS, IMDB 

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)

from tqdm.auto import tqdm

# Fix the global random seed through Lightning so runs are repeatable.
pl.seed_everything(216)

Global seed set to 216


216

In [5]:
# plotting packages 
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# Render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Global seaborn theme and default figure size (width, height) for all plots.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10

In [6]:
# Model identifiers: the pretrained backbone and the name used for saved checkpoints.
MODEL_NAME = 't5-base'
MY_MODEL_NAME = 'Text_Summarizer_T5'

# Locations on the mounted Google Drive.
CHECKPOINTS_PATH = '/content/gdrive/MyDrive/Checkpoints'
SAVE_DATASET_PATH = '/content/gdrive/MyDrive/Datasets/Text/IMDB_train_with_summary_dataset.csv'

# Checkpoint file to restore: <CHECKPOINTS_PATH>/<MY_MODEL_NAME>-v1.ckpt
PATH_TO_LAST_CHECKPOINT = f'{CHECKPOINTS_PATH}/{MY_MODEL_NAME}-v1.ckpt'

In [7]:
# Load the pretrained tokenizer matching MODEL_NAME
# (T5Tokenizer is the import alias for T5TokenizerFast).
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




In [8]:
# A class for the model
class NewsSummaryModel(pl.LightningModule):
  """LightningModule wrapping a pretrained T5 conditional-generation model.

  Every step expects ``batch`` to be a mapping with the keys
  ``'text_input_ids'``, ``'text_attention_mask'``, ``'labels'`` and
  ``'labels_attention_mask'``.
  """

  def __init__(self):
    super().__init__()
    # Pretrained weights are selected by the notebook-level MODEL_NAME constant.
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
    """Run the wrapped T5 model and return ``(loss, logits)`` from its output."""
    output = self.model(
        input_ids,
        attention_mask=attention_mask,
        labels=labels,
        decoder_attention_mask=decoder_attention_mask
    )

    return output.loss, output.logits

  def _shared_step(self, batch, metric_name):
    """Compute the loss for one batch and log it under ``metric_name``.

    The train/validation/test steps are identical apart from the name of the
    logged metric, so they all delegate here.
    """
    loss, _ = self(
        input_ids=batch['text_input_ids'],
        attention_mask=batch['text_attention_mask'],
        decoder_attention_mask=batch['labels_attention_mask'],
        labels=batch['labels']
    )

    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss

  def training_step(self, batch, batch_idx):
    """Return the batch loss, logged as ``'train_loss'``."""
    return self._shared_step(batch, 'train_loss')

  def validation_step(self, batch, batch_idx):
    """Return the batch loss, logged as ``'valid_loss'``."""
    return self._shared_step(batch, 'valid_loss')

  def test_step(self, batch, batch_idx):
    """Return the batch loss, logged as ``'test_loss'``."""
    return self._shared_step(batch, 'test_loss')

  def configure_optimizers(self):
    """Optimize all parameters with AdamW at a fixed learning rate of 1e-4."""
    return AdamW(self.parameters(), lr=0.0001)

In [9]:
# init model (constructing it loads the pretrained MODEL_NAME weights)
# NOTE(review): `model` is not referenced again in the visible cells — the
# checkpointed weights are loaded separately into `trained_model`; confirm
# this instance is still needed.
model = NewsSummaryModel()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




In [10]:
# Keep only the single best checkpoint, ranked by the lowest logged 'valid_loss',
# and write it to CHECKPOINTS_PATH under the name MY_MODEL_NAME.
checkpoint_callback = ModelCheckpoint(
    dirpath=CHECKPOINTS_PATH,
    filename=MY_MODEL_NAME,
    save_top_k=1,
    verbose=True,
    monitor='valid_loss',
    mode='min'
)

# Register the ModelCheckpoint through `callbacks=`. Passing the instance via
# `checkpoint_callback=` is deprecated in Lightning 1.x (that argument is meant
# to be a bool), so the supported form is used here.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=1,
)

# To run on the Colab GPU, additionally pass `gpus=1`
# (and optionally `progress_bar_refresh_rate=30`) to pl.Trainer above.

GPU available: True, used: False
TPU available: False, using: 0 TPU cores


In [11]:
# Restore the fine-tuned T5 summarizer from the checkpoint file at
# PATH_TO_LAST_CHECKPOINT.
# NOTE(review): the comment history calls this the "best" checkpoint while the
# constant is named "LAST" — confirm which file is intended.
trained_model = NewsSummaryModel.load_from_checkpoint(PATH_TO_LAST_CHECKPOINT)
# NOTE(review): freeze() presumably disables gradients and switches to eval
# mode for inference-only use — confirm against the Lightning docs.
trained_model.freeze()

In [27]:
# # download the dataset as an iterator
# train_iter = IMDB(split='train')
# labels = []
# texts = []

# for i, (label, text) in enumerate(train_iter):
#   labels.append(label)
#   texts.append(text[:512])

# summary = ['empty'] * len(texts)

## create a Pandas DataFrame of data


In [28]:
# train_df = pd.DataFrame.from_dict({'label': labels, 'text': texts, 'summary': summary})
# train_df = train_df.dropna()
# train_df.head()

Unnamed: 0,label,text,summary
0,neg,I rented I AM CURIOUS-YELLOW from my video sto...,empty
1,neg,"""I Am Curious: Yellow"" is a risible and preten...",empty
2,neg,If only to avoid making this type of film in t...,empty
3,neg,This film was probably inspired by Godard's Ma...,empty
4,neg,"Oh, brother...after hearing about this ridicul...",empty


In [30]:
# # check a single example
# with torch.no_grad():
#   sample_row = train_df.iloc[0]
#   text = sample_row['text']
#   model_summary = summarizer(text)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [31]:
# text

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to maki'

In [32]:
# model_summary

"I AM CURIOUS-YELLOW is a film about a young Swedish drama student named Lena who wants to learn everything she can about life. The plot is centered around a maki girl named Maki who wants to focus her attentions on the maki boy's maki, which is a game of maki. The film was released in 1967 and has been rated 4/5 (Sweden)."

## Summarizer Pipeline - get summary and save as a pd.DataFrame

In [19]:
# Reload the (partially) summarized dataset from Drive and keep only the
# three columns used downstream, in a fixed order.
columns = ['label', 'text', 'summary']
train_df = pd.read_csv(SAVE_DATASET_PATH, encoding='utf-8')[columns]
train_df.head()

Unnamed: 0,label,text,summary
0,neg,I rented I AM CURIOUS-YELLOW from my video sto...,I AM CURIOUS-YELLOW is a film about a young Sw...
1,neg,"""I Am Curious: Yellow"" is a risible and preten...","""I Am Curious: Yellow"" is a risible and preten..."
2,neg,If only to avoid making this type of film in t...,The film is interesting as an experiment but t...
3,neg,This film was probably inspired by Godard's Ma...,Actress Lena Nyman has to be the most annoying...
4,neg,"Oh, brother...after hearing about this ridicul...",After hearing about this ridiculous film for u...


In [None]:
# Fill in every row whose summary is still the 'empty' placeholder, starting at
# START_ROW, and persist progress to SAVE_DATASET_PATH as we go.
#
# NOTE(review): `summarizer` is not defined in any visible cell — it must be
# defined before this cell is run.
# NOTE(review): rows below START_ROW that are still 'empty' are NOT processed
# by this loop; lower START_ROW to 0 to sweep the whole frame.
START_ROW = 12500  # first row handled by this run
SAVE_EVERY = 10    # write the CSV after this many newly generated summaries

# Look the column positions up by name instead of hard-coding an index.
text_col = train_df.columns.get_loc('text')
summary_col = train_df.columns.get_loc('summary')

unsaved = 0  # summaries generated since the last write to disk
try:
  with torch.no_grad():
    for i in tqdm(range(START_ROW, len(train_df))):
      if train_df.iloc[i, summary_col] != 'empty':
        continue  # already summarized (or not a placeholder): leave untouched
      train_df.iloc[i, summary_col] = summarizer(train_df.iloc[i, text_col])
      unsaved += 1
      if unsaved >= SAVE_EVERY:
        train_df.to_csv(path_or_buf=SAVE_DATASET_PATH, columns=columns)
        unsaved = 0
finally:
  # Flush whatever has not been written yet. Without this, the summaries
  # produced after the last periodic save are lost when the loop finishes
  # or the session is interrupted.
  if unsaved:
    train_df.to_csv(path_or_buf=SAVE_DATASET_PATH, columns=columns)

HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))

In [27]:
# Spot-check one row's summary (column 2 is 'summary'); 'empty' means the
# placeholder was never replaced for this row.
train_df.iloc[5961, 2]

'empty'