In [1]:
# System Library
import os

# Data Wrangling Libraries
import pandas as pd
import numpy as np
import json
import gc
import textwrap
from termcolor import colored

# Machine Learning Libraries
import torch 
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer
from tqdm.auto import tqdm


# Graph Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize']=16,10


KeyboardInterrupt: 

In [None]:
class SummaryDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        title_max_token_len: int = 512,
        content_max_token_len: int = 128
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.title_max_token_len = title_max_token_len
        self.content_max_token_len = content_max_token_len

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        title = data_row['title']

        title_encoding = self.tokenizer(
            title,
            max_length = self.title_max_token_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            add_special_tokens = True,
            return_tensors = 'pt'
        )

        content = data_row['content']

        content_encoding = self.tokenizer(
            content,
            max_length = self.content_max_token_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            add_special_tokens = True,
            return_tensors = 'pt'
        )

        labels = content_encoding['input_ids']
        labels[labels == 0] = -100

        return dict(
            title = title,
            content = content,
            title_input_ids = title_encoding['input_ids'].flatten(),
            text_attention_mask = title_encoding['attention_mask'].flatten(),
            labels = labels.flatten(),
            labels_attention_mask = content_encoding['attention_mask'].flatten()
        )

class SummaryDataModule(pl.LightningDataModule):
    def __init__(
        self,
        #X_train: pd.DataFrame,
        #y_train: pd.DataFrame,
        #X_test: pd.DataFrame,
        #y_test: pd.DataFrame,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        title_max_token_len: int = 512,
        content_max_token_len: int = 128
    ):
        super().__init__()
        #self.X_train = X_train
        #self.y_train = y_train
        #self.X_test = X_test
        #self.y_test = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.title_max_token_len = title_max_token_len
        self.content_max_token_len = content_max_token_len
        #self.train_df = pd.DataFrame({self.X_train.name: self.X_train, self.y_train.name: self.y_train})
        #self.test_df = pd.DataFrame({self.X_test.name: self.X_test, self.y_test.name: self.y_test})
        self.train_df = train_df
        self.test_df = test_df

    def setup(self, stage=None):

        self.train_dataset = SummaryDataset(
            self.train_df,
            self.tokenizer,
            self.title_max_token_len,
            self.content_max_token_len
        )

        self.test_dataset = SummaryDataset(
            self.test_df,
            self.tokenizer,
            self.title_max_token_len,
            self.content_max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2
        )

    def validation_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2
        )

class SummaryModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_size):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']

        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )

        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_size):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']

        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )

        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_size):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']

        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )

        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.001)


In [35]:
%load_ext tensorboard
%tensorboard --logdir ./lighting_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 28427), started 13:41:15 ago. (Use '!kill 28427' to kill it.)

In [3]:
from summary_moduler import SummaryDataset, SummaryModel, SummaryDataModule
if __name__ == "__main__":
    with open('../Dataset/bbc-news-data.csv', 'r') as f:
        df_header = f.readline().split()
    df = pd.read_csv('../Dataset/bbc-news-data.csv', names=df_header, sep='\t', skiprows=1)
    train_df, test_df = train_test_split(df, test_size=0.2)
    MODEL_NAME = 't5-base'
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

    title_token_counts, content_token_counts = [], []

    for _, row in train_df.iterrows():
        title_token_count = len(tokenizer.encode(row['title']))
        title_token_counts.append(title_token_count)

        content_token_count = len(tokenizer.encode(row['content']))
        content_token_counts.append(content_token_count)
    
    
    EPOCHS = 3
    BATCH_SIZE = 8

    #data_module = SummaryDataModule(X_train=X_train, y_train=y_train, X_test=X_test, 
    #                        y_test=y_test, tokenizer=tokenizer)

    data_module = SummaryDataModule(train_df=train_df, test_df=test_df, tokenizer=tokenizer)
    model = SummaryModel()
    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='best-checkpoint',
        save_top_k=1,
        verbose=True,
        monitor='validation_loss',
        mode='min'
    )

    logger = TensorBoardLogger("lightning_logs", name='summary')

    trainer = pl.Trainer(
        logger=logger,
        callbacks=[checkpoint_callback, TQDMProgressBar(refresh_rate=10)],
        max_epochs=EPOCHS,
        gpus=0,
        enable_progress_bar=True,
    )

    trainer.fit(model=model, train_dataloaders=data_module)
    

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Token indices sequence length is longer than the specified maximum sequence length for this model (785 > 512). Running this sequence through the model will result in indexing errors
  rank_zero_de

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(
Missing logger folder: lightning_logs/summary

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-11-29 20:12:21.449309: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-11-29 20:12:36.699409: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Epoch 0, global step 223: 'validation_loss' was not in top 1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-11-29 22:30:08.444815: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-11-29 22:30:22.403213: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Epoch 1, global step 446: 'validation_loss' was not in top 1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-11-30 00:35:48.129763: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-11-30 00:36:04.545591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Epoch 2, global step 669: 'validation_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=3` reached.


In [9]:
trainer.save_checkpoint('./checkpoints/model.ckpt')


In [4]:
from summary_moduler import SummaryDataset, SummaryModel, SummaryDataModule
trained_model = SummaryModel.load_from_checkpoint('./checkpoints/model.ckpt')
trained_model.freeze()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
MODEL_NAME = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
def summarizeText(text):
    text_encoding = tokenizer(
        text,
        max_length = 512,
        padding = 'max_length',
        truncation = True,
        add_special_tokens = True,
        return_tensors = 'pt'
    )
    generated_ids = trained_model.model.generate(
        input_ids = text_encoding['input_ids'],
        attention_mask = text_encoding['attention_mask'],
        max_length = 150,
        num_beams = 2,
        repetition_penalty = 2.5,
        length_penalty = 2.0,
        early_stopping = True
        
    )

    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]
    return "".join(preds)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
if __name__ == "__main__":
    with open('../Dataset/bbc-news-data.csv', 'r') as f:
        df_header = f.readline().split()
    df = pd.read_csv('../Dataset/bbc-news-data.csv', names=df_header, sep='\t', skiprows=1)
    train_df, test_df = train_test_split(df, test_size=0.2)

In [12]:
sample_row = test_df.iloc[0]
text = sample_row['title']
model_summarization = summarizeText(text=text)
print(text)

Gardener battles to narrow win


In [14]:
test_df['title'].iloc[0]

'Gardener battles to narrow win'

In [15]:
sample_row['content']

' Jason Gardener fought all the way to the line to narrowly claim the men\'s 60m title at the Norwich Union Indoor trials and AAAs Championships.  The world 60m champion got off to a rolling start and had to dig deep to dip ahead of Mark Findlay and Darren Chin, who both set personal bests. "It was a close race," admitted Gardener. "I stumbled out the blocks but my experience told through. "I still feel there\'s more life in me and I believe I can go faster." Gardener\'s performance in Sheffield could have been affected by the news, which he heard before his semi-final, that his European record had been broken Frenchman Ronald Pognon, who will be a real threat at the European Championships, set a new time of 6.45, one-hundreth of a second faster than Gardener\'s previous mark. Favourite  delivered a powerful performance to take the women\'s 60m title in 7.27 seconds. "You\'ll see me in Madrid and I feel there is a lot more to come along," said the 22-year-old. Katherine Endacott battle

In [13]:
print(model_summarization)

Tony Gardener battled to a narrow victory in Sunday's Australian Open semi-final. The world number one was out of action for the second time in his career, but managed to pull off a third-set tie-break and close the match with Andy Roddick. And Gardener said: "It was a tough race but I had to work hard to get back to winning ways. "I didn't want to give up on my dream job." Gardener will now face fellow American Chris Martin in the final of the European Indoor Championships in Melbourne on Wednesday. He


In [42]:
print(len(model_summarization))

544


In [1]:
sample_row_1 = test_df.iloc[1]
text_1 = sample_row_1['title']
model_summarization_1 = summarizeText(text=text_1)
print(text_1)
print('\n')
print(sample_row_1['content'])
print('\n')
print(model_summarization_1)

NameError: name 'test_df' is not defined

In [57]:
from transformers import AutoTokenizer, AutoModelForCausalLM

gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = text_1
token_input_ids = gpt_tokenizer(prompt, return_tensors="pt").input_ids

# generate up to 30 tokens
outputs = gpt_model.generate(token_input_ids, do_sample=False, max_length=30)
tokened = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(tokened)
print(len(tokened[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['sermon old less, trueM Miller incepe și și working Miller an bei wiegestellt place drive E true place E anti each There Eler']
124


In [62]:
from transformers import AutoTokenizer, AutoModelForCausalLM

gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = text_1
token_input_ids_1 = gpt_tokenizer(prompt, return_tensors="pt").input_ids

# generate up to 30 tokens
outputs_1 = gpt_model.generate(token_input_ids_1, do_sample=False, max_length=30)
gpt_tokenizer.batch_decode(outputs_1, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Row over 'police' power for CSOs\n\nThe CSO has been accused of using the power of the state to intimidate and intimidate the public"]

In [63]:
text_1

"Row over 'police' power for CSOs"