<a href="https://colab.research.google.com/github/RoyElkabetz/Text-Summarization-with-Deep-Learning/blob/main/notebooks/T5_Summarizer_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Google Colab setup: run this cell only on Colab (comment it out when running locally)
# clone the git repository into the current working directory
!git clone https://github.com/RoyElkabetz/Text-Summarization-with-Deep-Learning
# put the repository's src folder on sys.path so its .py modules can be imported
import sys
sys.path.insert(1, "/content/Text-Summarization-with-Deep-Learning/src")

Cloning into 'Text-Summarization-with-Deep-Learning'...
remote: Enumerating objects: 327, done.[K
remote: Counting objects: 100% (327/327), done.[K
remote: Compressing objects: 100% (309/309), done.[K
remote: Total 327 (delta 172), reused 47 (delta 14), pack-reused 0[K
Receiving objects: 100% (327/327), 8.08 MiB | 5.18 MiB/s, done.
Resolving deltas: 100% (172/172), done.


In [2]:
## Google Colab only: mount Google Drive at /content/gdrive (comment out when running locally)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# report the GPU attached to this runtime; the command fails when no NVIDIA driver is reachable
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [4]:
!pip install --quiet transformers==4.5.0
!pip install --quiet pytorch-lightning==1.2.7

[K     |████████████████████████████████| 2.2MB 13.5MB/s 
[K     |████████████████████████████████| 901kB 37.0MB/s 
[K     |████████████████████████████████| 3.3MB 53.4MB/s 
[K     |████████████████████████████████| 839kB 12.9MB/s 
[K     |████████████████████████████████| 235kB 31.2MB/s 
[K     |████████████████████████████████| 122kB 56.4MB/s 
[K     |████████████████████████████████| 829kB 41.6MB/s 
[K     |████████████████████████████████| 276kB 63.7MB/s 
[K     |████████████████████████████████| 1.3MB 64.0MB/s 
[K     |████████████████████████████████| 296kB 70.5MB/s 
[K     |████████████████████████████████| 143kB 66.0MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


In [5]:
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
from torchtext.datasets import AG_NEWS, IMDB 
from tqdm.auto import tqdm


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)


# my packages
import models
import utils

# plotting packages 
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# render matplotlib figures inline, using the high-resolution "retina" format
%matplotlib inline
%config InlineBackend.figure_format='retina'
# seaborn theme and the default figure size applied to every plot in the notebook
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10

# set the global random seed through PyTorch Lightning (the returned seed is displayed)
pl.seed_everything(216)

Global seed set to 216


216

In [10]:
# Google Drive locations: processed IMDB CSV files and model checkpoints
SAVE_DATASET_PATH = '/content/gdrive/MyDrive/Datasets/Text/IMDB2/'
CHECKPOINTS_PATH = '/content/gdrive/MyDrive/Checkpoints'

# checkpoint file name (without extension) and the pretrained model id used for the tokenizer
MY_MODEL_NAME = 'Text_Summarizer_T5-v1'
MODEL_NAME = 't5-base'

# full path of the summarizer checkpoint: <CHECKPOINTS_PATH>/<MY_MODEL_NAME>.ckpt
PATH_TO_LAST_CHECKPOINT = f'{CHECKPOINTS_PATH}/{MY_MODEL_NAME}.ckpt'

In [7]:
# load the pretrained tokenizer for MODEL_NAME (T5TokenizerFast, aliased as T5Tokenizer in the imports)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# restore the summarizer from its checkpoint (trained on the "news summary" dataset)
# NOTE(review): load_from_checkpoint is called on a freshly built instance; PyTorch Lightning
# documents it as a classmethod, so `base_model` may be an unnecessary extra model copy -
# confirm against models.NewsSummaryModel before changing
base_model = models.NewsSummaryModel()
trained_model = base_model.load_from_checkpoint(PATH_TO_LAST_CHECKPOINT)
# inference only: freeze the restored module's parameters
trained_model.freeze()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




In [62]:
# def summarizer(text, summary_max_length=150):
#   # encoding text
#   text_encoding = tokenizer(
#       text,
#       max_length=512,
#       padding='max_length',
#       truncation=True,
#       return_attention_mask=True,
#       add_special_tokens=True,
#       return_tensors='pt'
#   )

#   # get predictions as ids
#   generated_ids = trained_model.model.generate(
#       input_ids=text_encoding['input_ids'],
#       attention_mask=text_encoding['attention_mask'],
#       max_length=summary_max_length,
#       num_beams=10,
#       repetition_penalty=2.5,
#       length_penalty=10.0,
#       early_stopping=True
#   )

#   # decode and join prediction
#   preds = [
#    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
#    for gen_id in generated_ids
#   ]
  
#   return ''.join(preds)

def summarizer(text, summary_max_length=150):
  """Generate a summary of `text` with the model held by the global `trained_model`.

  The text is tokenized with the global `tokenizer` (truncated / padded to 512
  tokens), output ids are sampled with top-p sampling, and every generated
  sequence is decoded and concatenated into one string.

  Args:
    text: the text to summarize.
    summary_max_length: `max_length` passed to `generate` for the summary.

  Returns:
    The decoded summary as a single string.
  """
  # sampling configuration forwarded unchanged to `generate`
  generation_options = dict(
      max_length=summary_max_length,
      repetition_penalty=2.5,
      length_penalty=0.5,
      do_sample=True,
      top_p=0.2,
      top_k=0,
  )

  # tokenize the input into fixed-size PyTorch tensors
  encoded = tokenizer(
      text,
      max_length=512,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors='pt'
  )

  # sample the summary token ids
  output_ids = trained_model.model.generate(
      input_ids=encoded['input_ids'],
      attention_mask=encoded['attention_mask'],
      **generation_options
  )

  # turn each id sequence back into text and glue the pieces together
  pieces = []
  for ids in output_ids:
    pieces.append(
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    )
  return ''.join(pieces)

## Load, process, split and resave the IMDB Train, Validation and Test datasets as CSV files

In [54]:
# download the IMDB train split; the iterator yields (label, text) pairs
data_iter = IMDB(split='train')
labels = []
texts = []

# no enumerate(): its counter would leak into the notebook globals as `i`,
# the name later cells use for the per-class split size
for label, text in data_iter:
  labels.append(label)
  texts.append(text)

# tabulate the reviews and drop incomplete rows
df = pd.DataFrame.from_dict({'label': labels, 'text': texts})
df = df.dropna()

# make sure the target folder exists, otherwise to_csv raises on a fresh Drive
Path(SAVE_DATASET_PATH).mkdir(parents=True, exist_ok=True)
df.to_csv(path_or_buf=SAVE_DATASET_PATH + 'train.csv', columns=['label', 'text'])

In [55]:
# download the IMDB test split; the iterator yields (label, text) pairs
data_iter = IMDB(split='test')
labels = []
texts = []

# no enumerate(): its counter would leak into the notebook globals as `i`,
# the name later cells use for the per-class split size
for label, text in data_iter:
  labels.append(label)
  texts.append(text)

## Create a pandas DataFrame from the test data


In [56]:
# tabulate the collected labels and texts, dropping rows with missing values
reviews = {'label': labels, 'text': texts}
df = pd.DataFrame.from_dict(reviews).dropna()
df.head()

Unnamed: 0,label,text
0,neg,I love sci-fi and am willing to put up with a ...
1,neg,"Worth the entertainment value of a rental, esp..."
2,neg,its a totally average film with a few semi-alr...
3,neg,STAR RATING: ***** Saturday Night **** Friday ...
4,neg,"First off let me say, If you haven't enjoyed a..."


In [65]:
# number of reviews taken from EACH class for the test split
# (the validation cell reuses `i` as its split point)
i = 500
# one summary column is created per maximum summary length
summaries_lengths = [120, 110, 100, 90, 80, 70, 60, 50, 40, 30, 20]
negative_df = df[df['label']=='neg']
positive_df = df[df['label']=='pos']

# first `i` negative reviews followed by first `i` positive ones, re-indexed from 0
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
test_df = pd.concat([negative_df[:i], positive_df[:i]], ignore_index=True)

# 'empty' is the placeholder for a summary that has not been generated yet
for length in summaries_lengths:
    test_df['summary-' + str(length)] = 'empty'
print(f'Size of dataframe is: {len(test_df)}')

# never overwrite an existing test.csv: it may already hold generated summaries
if not Path(SAVE_DATASET_PATH + 'test.csv').is_file():
    test_df.to_csv(path_or_buf=SAVE_DATASET_PATH + 'test.csv')

Size of dataframe is: 1000


In [58]:
columns = ['label', 'text']
# everything after the first `i` reviews of each class forms the validation split
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
valid_df = pd.concat([negative_df[i:], positive_df[i:]], ignore_index=True)
valid_df.to_csv(path_or_buf=SAVE_DATASET_PATH + 'valid.csv', columns=columns)

## Summarizer Pipeline - get summary and save as a pd.DataFrame

In [68]:
# columns to read back: the two data columns followed by summary-120 ... summary-20
columns = ['label', 'text'] + [f'summary-{length}' for length in range(120, 10, -10)]

# usecols leaves out the unnamed index column that to_csv stored in the file
test_df = pd.read_csv(SAVE_DATASET_PATH + 'test.csv', usecols=columns)

# shuffle all rows, then renumber them from zero
test_df = test_df.sample(frac=1).reset_index(drop=True)
test_df.head()

Unnamed: 0,label,text,summary-120,summary-110,summary-100,summary-90,summary-80,summary-70,summary-60,summary-50,summary-40,summary-30,summary-20
0,neg,"If you are in search of a masochistic thrill, ...",empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty
1,neg,I can not believe I even wasted a NetFlix rent...,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty
2,pos,"As a big Dostoyevsky fan, I had always been di...",empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty
3,pos,"One of the best,Lackawanna Blues<br /><br />Gr...",empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty
4,neg,The first 2/3 of this film wasn't that dissimi...,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty


In [69]:
# Fill every summary cell that still holds the 'empty' placeholder, writing progress to
# test.csv along the way so an interrupted run can resume from the saved file.
columns = list(test_df.columns.values)
test_csv_path = SAVE_DATASET_PATH + 'test.csv'

try:
    with torch.no_grad():
        # `row_idx` instead of `i`, so the split size kept in `i` is not overwritten
        for row_idx in tqdm(range(len(test_df))):
            row = test_df.iloc[row_idx]
            for max_length in summaries_lengths:
                # look the column up by name rather than by a fixed offset
                column_name = 'summary-' + str(max_length)
                if row[column_name] == 'empty':
                    summary = summarizer(row['text'], summary_max_length=max_length)
                    test_df.iloc[row_idx, columns.index(column_name)] = summary
            # periodic checkpoint on every second row
            if row_idx % 2 == 0:
                test_df.to_csv(path_or_buf=test_csv_path, columns=columns)
finally:
    # Always persist the latest state: the periodic save skips odd rows (so the last
    # row of an even-sized frame was never written), and an interrupt would otherwise
    # drop the summaries produced since the previous checkpoint.
    test_df.to_csv(path_or_buf=test_csv_path, columns=columns)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

KeyboardInterrupt: ignored