<a href="https://colab.research.google.com/github/RoyElkabetz/Text-Summarization-with-Deep-Learning/blob/main/notebooks/T5_Summarizer_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using the T5 pre-trained model as a summarization pipeline for the IMDB dataset

In this notebook:
- Download and save the IMDB train dataset as a CSV file.
- Download the IMDB test dataset and split it into a validation set and a smaller test set.
- Save the validation dataset as a CSV file.
- Summarize the reviews from the smaller test dataset with different maximum lengths using the T5 summarizer and save the result as a CSV file.

In [1]:
## Colab only: comment this cell out when not running from google.colab
# clone the git repository
!git clone https://github.com/RoyElkabetz/Text-Summarization-with-Deep-Learning
# add path to .py files for import
# (presumably this is where the `models` and `utils` modules imported later live)
import sys
sys.path.insert(1, "/content/Text-Summarization-with-Deep-Learning/src")

Cloning into 'Text-Summarization-with-Deep-Learning'...
remote: Enumerating objects: 349, done.[K
remote: Counting objects: 100% (349/349), done.[K
remote: Compressing objects: 100% (328/328), done.[K
remote: Total 349 (delta 186), reused 53 (delta 17), pack-reused 0[K
Receiving objects: 100% (349/349), 8.47 MiB | 5.58 MiB/s, done.
Resolving deltas: 100% (186/186), done.


In [2]:
## Colab only: mount google drive (comment this cell out when not running from google.colab)
# the dataset and checkpoint paths configured further down point under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# check GPU parameters (informational only; reports an error when no NVIDIA driver is reachable)
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [3]:
!pip install --quiet transformers==4.5.0
!pip install --quiet pytorch-lightning==1.2.7

[K     |████████████████████████████████| 2.2MB 31.8MB/s 
[K     |████████████████████████████████| 901kB 38.2MB/s 
[K     |████████████████████████████████| 3.3MB 35.9MB/s 
[K     |████████████████████████████████| 839kB 32.7MB/s 
[K     |████████████████████████████████| 122kB 45.9MB/s 
[K     |████████████████████████████████| 829kB 40.2MB/s 
[K     |████████████████████████████████| 235kB 35.7MB/s 
[K     |████████████████████████████████| 276kB 46.2MB/s 
[K     |████████████████████████████████| 1.3MB 42.4MB/s 
[K     |████████████████████████████████| 296kB 47.7MB/s 
[K     |████████████████████████████████| 143kB 41.8MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


In [4]:
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from torchtext.datasets import IMDB 
from tqdm.auto import tqdm


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)


# my packages
import models
import utils

# set seed (a fixed value so that repeated runs draw the same random numbers)
pl.seed_everything(216)

Global seed set to 216


216

## Path to model and dataset

In [5]:
# Google Drive locations (Drive is expected to be mounted at /content/gdrive).
# SAVE_DATASET_PATH keeps its trailing slash because file names are appended to it directly.
SAVE_DATASET_PATH = "/content/gdrive/MyDrive/Datasets/Text/IMDB3/"
CHECKPOINTS_PATH = "/content/gdrive/MyDrive/Checkpoints"

# file stem of the fine-tuned checkpoint, and the name used to load the tokenizer
MY_MODEL_NAME = "Text_Summarizer_T5-v1"
MODEL_NAME = "t5-base"

# full path of the checkpoint file: <CHECKPOINTS_PATH>/<MY_MODEL_NAME>.ckpt
PATH_TO_LAST_CHECKPOINT = f"{CHECKPOINTS_PATH}/{MY_MODEL_NAME}.ckpt"

## Load the T5 trained summarizer

In [6]:
# load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# load trained (on "news summary" dataset) summarizer.
# load_from_checkpoint is invoked on the class itself, so no throw-away model
# instance has to be constructed just to reach the method.
trained_model = models.NewsSummaryModel.load_from_checkpoint(PATH_TO_LAST_CHECKPOINT)

# the model is only used for inference below, so freeze its parameters
trained_model.freeze()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




## Load, process, split and resave the IMDB Train, Validation and Test datasets as CSV files

In [None]:
# download the Train dataset as an iterator and read it once into memory
data_iter = IMDB(split='train')
train_records = list(data_iter)

# separate the (label, text) pairs into two parallel lists
labels = [label for label, _ in train_records]
texts = [text for _, text in train_records]

# build the frame, drop rows with missing values and store it as train.csv
df = pd.DataFrame({'label': labels, 'text': texts}).dropna()
df.to_csv(path_or_buf=SAVE_DATASET_PATH + 'train.csv', columns=['label', 'text'])

In [None]:
# download the Test dataset as an iterator and read it once into memory
data_iter = IMDB(split='test')
test_records = list(data_iter)

# separate the (label, text) pairs into two parallel lists
labels = [label for label, _ in test_records]
texts = [text for _, text in test_records]

In [None]:
# assemble the collected labels/texts into a frame and discard rows with missing values
df = pd.DataFrame({'label': labels, 'text': texts}).dropna()
df.head()

Unnamed: 0,label,text
0,neg,I love sci-fi and am willing to put up with a ...
1,neg,"Worth the entertainment value of a rental, esp..."
2,neg,its a totally average film with a few semi-alr...
3,neg,STAR RATING: ***** Saturday Night **** Friday ...
4,neg,"First off let me say, If you haven't enjoyed a..."


## Split the IMDB Test dataset into Validation and Test

In [None]:
# number of reviews taken from each class for the reduced test set
i = 500
# one placeholder summary column is created for each of these lengths
summaries_lengths = [120, 110, 100, 90, 80, 70, 60, 50, 40, 30, 20]
negative_df = df[df['label']=='neg']
positive_df = df[df['label']=='pos']

# first i negative reviews followed by first i positive reviews.
# pd.concat is used because DataFrame.append was deprecated and later removed from pandas.
test_df = pd.concat([negative_df[:i], positive_df[:i]], ignore_index=True)

# the string 'empty' marks a summary that has not been generated yet
for length in summaries_lengths:
    test_df[f'summary-{length}'] = 'empty'
print(f'Size of dataframe is: {len(test_df)}')

# never overwrite an existing test.csv
if not Path(SAVE_DATASET_PATH + 'test.csv').is_file():
    test_df.to_csv(path_or_buf=SAVE_DATASET_PATH + 'test.csv')

Size of dataframe is: 1000


In [None]:
# every review after the first i of each class goes into the validation set
columns = ['label', 'text']
# pd.concat is used because DataFrame.append was deprecated and later removed from pandas
valid_df = pd.concat([negative_df[i:], positive_df[i:]], ignore_index=True)
valid_df.to_csv(path_or_buf=SAVE_DATASET_PATH + 'valid.csv', columns=columns)

## Summarizer Pipeline - get summaries and save as a pd.DataFrame

In [8]:
# columns to read back: the label, the review text and one summary column
# per maximum length (120, 110, ..., 20)
columns = ['label', 'text'] + [f'summary-{length}' for length in range(120, 10, -10)]

# load the test dataset
test_csv_file = SAVE_DATASET_PATH + 'test.csv'
test_df = pd.read_csv(test_csv_file, usecols=columns)

# shuffle negative and positive reviews
test_df = test_df.sample(frac=1)
test_df = test_df.reset_index(drop=True)
test_df.head()

Unnamed: 0,label,text,summary-120,summary-110,summary-100,summary-90,summary-80,summary-70,summary-60,summary-50,summary-40,summary-30,summary-20
0,neg,"If you are in search of a masochistic thrill, ...",A movie about a man's death that is funny as h...,"A movie about a plane lame, with brief tits an...","A movie that is just plane lame, but there's n...","Rent this movie, and show it to a group of you...","A movie that is rated 'Grim"" or ""Spookies"", ha...","A movie that is rated 'Grim"" or ""Spookies"", wh...",A film about a man's death that is funny as he...,"A movie that is rated 'Grim"" or ""Spookies"", wh...","A movie that is rated 'Grim"" or ""Spookies"", wh...","Rent this movie, and show it to a group of you...",A movie about a man's death that is funny as h...
1,neg,I can not believe I even wasted a NetFlix rent...,A very sad thing to think classics like The St...,"The Stepfather film, which was released on DVD...","The Stepfather film, which was released on DVD...",A very sad thing to think classics like The St...,A very sad thing to think classics like The St...,A very sad thing to think classics like The St...,"A video of the film 'The Stepfather', which wa...",A very sad thing to think classics like The St...,"The Stepfather film, which was released on DVD...","The Stepfather, which is released on DVD, has ...","The Stepfather film, which was released on DVD..."
2,pos,"As a big Dostoyevsky fan, I had always been di...","Actor Yul Brynner, who played a Russian Major ...","Actor Yul Brynner, who played a conflicted Rus...","Actor Yul Brynner, who played a Russian Major ...","Yul Brynner, who played a dwarf Russian office...","Yul Brynner, who played a conflicted Russian o...","Actor Yul Brynner, who played a Russian Major ...","Actor Yul Brynner, who played a Russian Major ...","Actor Yul Brynner, who played a conflicted Rus...","Yul Brynner, who played a conflicted Russian o...","Actor Yul Brynner, who played a Russian Major ...","Actor Yul Brynner, who played a conflicted Rus..."
3,pos,"One of the best,Lackawanna Blues<br /><br />Gr...",The Ms Merkerson cast is so good that when it ...,The Ms Merkerson cast is so good that when it ...,The Ms Merkerson cast is extraordinary without...,"The movie 'Ms. Merkerson, which is so good tha...",The Ms Merkerson cast is so good that when it ...,"The movie 'Ms. Merkerson, which is one of the ...",The Ms Merkerson film is one of the best movie...,The Ms Merkerson cast is so good that when it ...,The Ms Merkerson film is one of the best movie...,The Ms Merkerson cast is so good that when it ...,The Ms Merkerson cast is so good that when it ...
4,neg,The first 2/3 of this film wasn't that dissimi...,"The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which...","The first 2/3 of the film 'Aztec Mummy', which..."


In [None]:
def summarizer(text, summary_max_length=128):
  """Generate a summary of `text` with the loaded summarization model.

  Uses the notebook-level `tokenizer` and `trained_model` objects.

  Args:
    text: the text to summarize; it is truncated/padded to 512 tokens
      before being fed to the model.
    summary_max_length: forwarded as `max_length` to `generate`, i.e. the
      upper bound on the length of the generated sequence.

  Returns:
    The decoded summary as one string. If more than one sequence is
    generated, the decoded sequences are concatenated without a separator.
  """
  # inputs must live on the same device as the model weights; on a CPU-only
  # model the .to() calls below are no-ops
  device = next(trained_model.model.parameters()).device

  # encoding text
  text_encoding = tokenizer(
      text,
      max_length=512,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors='pt'
  )

  # get predictions as ids (sampled decoding, so the output depends on the RNG state)
  generated_ids = trained_model.model.generate(
      input_ids=text_encoding['input_ids'].to(device),
      attention_mask=text_encoding['attention_mask'].to(device),
      max_length=summary_max_length,
      repetition_penalty=2.5,
      length_penalty=0.5,
      do_sample=True,
      top_p=0.2,
      top_k=0
  )

  # decode and join prediction
  return ''.join(
      tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for gen_id in generated_ids
  )

## Summarize and save

In [9]:
summaries_lengths = [120, 110, 100, 90, 80, 70, 60, 50, 40, 30, 20]
columns = list(test_df.columns.values)
test_csv_path = SAVE_DATASET_PATH + 'test.csv'

# resolve column positions once, by name, so the loop does not depend on
# the summary columns sitting at fixed offsets in the frame
text_position = test_df.columns.get_loc('text')
summary_positions = {
    max_length: test_df.columns.get_loc(f'summary-{max_length}')
    for max_length in summaries_lengths
}

with torch.no_grad():
    for i in tqdm(range(len(test_df))):
        for max_length in summaries_lengths:
            position = summary_positions[max_length]
            # only cells still holding the 'empty' placeholder are generated,
            # which lets an interrupted run resume where it stopped
            if test_df.iat[i, position] == 'empty':
                text = test_df.iat[i, text_position]
                test_df.iat[i, position] = summarizer(text, summary_max_length=max_length)
        # periodic checkpoint of the progress made so far
        if i % 2 == 0:
            test_df.to_csv(path_or_buf=test_csv_path, columns=columns)

# final save: without it the summaries of the last row are lost whenever the
# last index is odd (e.g. 1000 rows -> i == 999 never hits the checkpoint above)
test_df.to_csv(path_or_buf=test_csv_path, columns=columns)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


