<a href="https://colab.research.google.com/github/VishaLPatiL9029/100-days-of-machine-learning/blob/main/Text_Summarization_using_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'bbc-news-summary:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F24984%2F32267%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240203%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240203T095107Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D07fdeaf9e171a21f335f272843af8890908de55d94d9e3dcaa723b298ca4ed03fb43669a65136104fa5e19b6cef86215e2184d61d1fa815b93cbd59728bf595fd0fff2cafc387415d268b346db859b5f4553818ad241f97f98f3e79dcdd3de39b5459c99850ae0457a7725becbd26eb23fd156ad76a9ac95168effb48dd310a655958ead07830f9e305583a206d55db1faa4e2e7e153455ee07ad758c7c544aea17eb2a7bc86233b0d680abc698275717245fe72fe247c96fefd6c319ae9cbfc888d9b9ed9b29fc149eec08f706b326e6c611a02264850c6d7e638fc8ff922c971e6e6c2f033fc257328b98a29e7278a9e72ac845602689951adec0a8f119251'

# Root directory under which every downloaded data source is extracted.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory that is created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible code - confirm before relying on it.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then (re)create both Kaggle directories.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working so relative paths work;
# a link that already exists is left untouched.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry has the form "<directory>:<url-encoded download URL>".
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent; without it only the byte counter is
            # shown and the progress bar stays empty instead of crashing.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure buffered bytes are written and rewind before the
            # archive readers consume the temporary file.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle (not by name) and use a local name
                # that does not shadow the `tarfile` module.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs stop working after their expiry time.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import os
import time
import glob
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
os.listdir('/kaggle/input/bbc-news-summary/BBC News Summary/Summaries')

In [None]:
# Dataset folders, given as relative paths that go through the ../input link.
articles_path = '../input/bbc-news-summary/BBC News Summary/News Articles'
summaries_path = '../input/bbc-news-summary/BBC News Summary/Summaries'
# Names of the category sub-folders to load from both folders.
categories_list = ['politics', 'sport', 'tech', 'entertainment', 'business']

In [None]:
def read_files_from_folders(articles_path, summaries_path, categories_list=('tech', 'sport'), encoding="ISO-8859-1"):
    """Read paired article and summary texts from per-category folders.

    For every category, all ``*.txt`` files directly inside
    ``<articles_path>/<category>`` and ``<summaries_path>/<category>`` are
    read. Both listings are sorted so that the i-th article is paired with
    the i-th summary by file name instead of by directory-listing order.

    Args:
        articles_path: Folder containing one sub-folder of articles per category.
        summaries_path: Folder containing one sub-folder of summaries per category.
        categories_list: Iterable of category sub-folder names to read.
        encoding: Text encoding used to open every file.

    Returns:
        A tuple ``(articles, summaries, categories)`` of equally long lists,
        or ``None`` (after printing a message) when a category holds a
        different number of article and summary files.
    """
    articles = []
    summaries = []
    categories = []
    for category in categories_list:
        # glob gives no ordering guarantee, so sort both listings to keep
        # each article aligned with the summary of the same file name.
        article_paths = sorted(glob.glob(os.path.join(articles_path, category, '*.txt')))
        summary_paths = sorted(glob.glob(os.path.join(summaries_path, category, '*.txt')))

        if len(article_paths) != len(summary_paths):
            print('number of files is not equal')
            return None

        for article_file, summary_file in zip(article_paths, summary_paths):
            categories.append(category)
            with open(article_file, mode='r', encoding=encoding) as file:
                articles.append(file.read())

            with open(summary_file, mode='r', encoding=encoding) as file:
                summaries.append(file.read())
    return articles, summaries, categories

In [None]:
articles, summaries, categories = read_files_from_folders(articles_path, summaries_path, categories_list)

In [None]:
df = pd.DataFrame({'articles':articles, 'summaries':summaries, 'categories':categories})

In [None]:
df

In [None]:
# Count the rows of each category and show the counts as a bar chart.
category_sizes = df.groupby('categories').size()
sns.barplot(x=category_sizes.index, y=category_sizes)
plt.show()

In [None]:
# Word count (whitespace-separated tokens) of every article and summary,
# stored next to the text for the analysis below.
for text_column in ('articles', 'summaries'):
    df[f'{text_column}_length'] = df[text_column].apply(lambda text: len(text.split()))

In [None]:
df

In [None]:
# Mean article and summary word counts per category. The former positional
# `0` was the `axis` argument of groupby, which is deprecated; grouping rows
# is the default, so it is simply dropped.
category_length = df.groupby('categories').agg({'articles_length': 'mean', 'summaries_length': 'mean'})


In [None]:
category_length

In [None]:
# Reshape the per-category means to long form (one row per category and
# measure, category kept as the index) and plot them as grouped bars.
df_m = category_length.melt(ignore_index=False)
plt.figure(figsize=(8, 6))
sns.barplot(data=df_m, x=df_m.index, y="value", hue='variable')
plt.show()

In [None]:
df_m

In [None]:
category_length

In [None]:
pd.melt(category_length, ignore_index = False).groupby('variable').mean()

In [None]:
!nvidia-smi

In [None]:
# !pip install --upgrade pytorch_lightning
# !pip install --upgrade torchtext


In [None]:
# !pip install --quiet transformers
# !pip install --quiet pytorch-lightning
#!pip install --upgrade torchvision
#!pip install --upgrade torch
#!pip install --upgrade torchaudio



In [None]:
#!pip install torchaudio


In [None]:
import plotly.express as px

# Share of files per category as a pie chart; the counts are computed once
# and reused for both columns.
category_counts = df['categories'].value_counts()
tmp_df = pd.DataFrame({'category': category_counts.index, 'files_count': category_counts.values})
fig = px.pie(tmp_df, values='files_count', names='category', title="Categories Ratio")
fig.show()


In [None]:
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
from torchtext import data


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    AutoTokenizer,
    T5TokenizerFast as T5Tokenizer
)

from tqdm.auto import tqdm
from pylab import rcParams
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(style='whitegrid', palette='muted', font_scale = 1.2)
rcParams['figure.figsize']=16,10

In [None]:
df

In [None]:
# Keep only the text columns. Take an explicit copy so the column assignments
# that follow modify this frame itself and do not trigger
# SettingWithCopyWarning on a slice of the original frame.
df = df[['articles', 'summaries']].copy()



In [None]:
# Drop every non-ASCII character from both text columns by round-tripping
# through an ASCII encode that ignores unencodable characters.
for text_column in ('articles', 'summaries'):
    df[text_column] = df[text_column].str.encode('ascii', 'ignore').str.decode('ascii')

In [None]:
df = df.dropna()


In [None]:
 train_df, test_df = train_test_split(df, test_size=0.1)

In [None]:
test_df

In [None]:
train_df

In [None]:
class NewsSummaryDataset(Dataset):
    """Dataset that tokenizes one (article, summary) row per access.

    Args:
        data: Frame with an ``articles`` and a ``summaries`` text column.
        tokenizer: Callable tokenizer used for both texts.
        text_max_token_len: Length articles are padded/truncated to.
        summary_max_token_len: Length summaries are padded/truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize *text* to exactly *max_length* tokens as PyTorch tensors."""
        # Use the tokenizer handed to this dataset, not a notebook-level global.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        """Return the raw texts and flattened token tensors for row *index*."""
        data_row = self.data.iloc[index]
        text = data_row['articles']
        summary = data_row['summaries']

        text_encoding = self._encode(text, self.text_max_token_len)
        summary_encoding = self._encode(summary, self.summary_max_token_len)

        # Padding positions are replaced by -100 in the labels; take the pad id
        # from the tokenizer and fall back to 0 only when it does not define one.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 0
        labels = summary_encoding['input_ids']
        labels[labels == pad_token_id] = -100

        return dict(
            text=text,
            summary=summary,
            text_input_ids=text_encoding['input_ids'].flatten(),
            text_attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [None]:
class NewsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module that serves the train and held-out frames.

    Args:
        train_df: Frame used to build the training dataset.
        test_df: Frame used to build the held-out dataset, which backs both
            the validation and the test loader.
        tokenizer: Tokenizer forwarded to every ``NewsSummaryDataset``.
        batch_size: Batch size of every loader.
        text_max_token_len: Token length for articles.
        summary_max_token_len: Token length for summaries.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def setup(self, stage=None):
        """Create the training and held-out datasets from the stored frames."""
        self.train_dataset = NewsSummaryDataset(
            self.train_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )
        self.test_dataset = NewsSummaryDataset(
            self.test_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )

    def _make_loader(self, dataset, shuffle):
        """Build a loader over *dataset* with this module's batch size."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=4
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader over the held-out dataset."""
        # Evaluation data is iterated in a fixed order; shuffling it only adds
        # noise between epochs.
        return self._make_loader(self.test_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the test loader over the held-out dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [None]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Number of tokens tokenizer.encode produces for every training article and
# every training summary, in row order.
text_token_counts = [len(tokenizer.encode(article)) for article in train_df['articles']]
summary_token_counts = [len(tokenizer.encode(summary)) for summary in train_df['summaries']]

In [None]:
# Side-by-side histograms of the token counts; both panels are titled so they
# can be told apart.
fig, (ax1, ax2) = plt.subplots(1, 2)
sns.histplot(text_token_counts, ax=ax1)
ax1.set_title('full text token counts')
sns.histplot(summary_token_counts, ax=ax2)
ax2.set_title('summary token counts')

In [None]:
# Training hyper-parameters.
N_EPOCHS = 3
BATCH_SIZE = 8

# Pass BATCH_SIZE explicitly so that changing the constant really changes the
# batch size of the loaders instead of silently using the class default.
data_module = NewsSummaryDataModule(train_df, test_df, tokenizer, batch_size=BATCH_SIZE)

In [None]:
class NewsSummaryModel(pl.LightningModule):
    """Lightning module that fine-tunes the pretrained ``MODEL_NAME`` T5 model."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return its ``(loss, logits)``."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for *batch*, log it as *log_name* and return it."""
        loss, _ = self(
            input_ids=batch['text_input_ids'],
            attention_mask=batch['text_attention_mask'],
            decoder_attention_mask=batch['labels_attention_mask'],
            labels=batch['labels']
        )

        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    # Lightning passes the batch and its index to the step hooks; the second
    # argument is therefore named batch_idx (it is not a batch size).
    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as ``train_loss``."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as ``val_loss``."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as ``test_loss``."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters."""
        # transformers.AdamW is deprecated in favour of torch.optim.AdamW;
        # eps and weight_decay are pinned to the values the deprecated class
        # used by default so the optimisation setup stays the same.
        return torch.optim.AdamW(self.parameters(), lr=0.0001, eps=1e-6, weight_decay=0.0)

In [None]:
model = NewsSummaryModel()



In [None]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# Keep only the single checkpoint with the lowest logged validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath='checkpoints',
    filename='best-checkpoint',
    verbose=True,
)

# TensorBoard logs are written below lightning_logs/news-summary.
logger = TensorBoardLogger("lightning_logs", name='news-summary')

trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    accelerator="auto",
    logger=logger,
    callbacks=[checkpoint_callback],
)

# Release cached GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
trainer.fit(model, data_module)

In [None]:
# Reload the checkpoint the trainer's checkpoint callback recorded as best
# (it monitors val_loss in 'min' mode).
trained_model = NewsSummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

# Freeze the reloaded model; the cells below only use it to generate summaries.
trained_model.freeze()


In [None]:
import pickle

# Serialize the whole trained Lightning module to trained_model.pickle.
with open('trained_model.pickle', 'wb') as model_file:
    pickle.dump(trained_model, model_file)
# Load the trained model using pickle
# NOTE(review): unpickling executes code embedded in the file, so only load
# pickle files produced by this notebook. `loaded_model` is not used by any of
# the visible cells below - confirm whether this round-trip is still needed.
with open('trained_model.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)




In [None]:
def summarizeText(text):
    """Summarize *text* with the notebook-level ``trained_model`` and ``tokenizer``.

    NOTE: a later cell defines ``summarizeText`` again with explicit
    ``trained_model``/``tokenizer`` parameters; once that cell has run, the
    name refers to that definition instead of this one.

    Args:
        text: Article text; it is padded/truncated to 512 tokens.

    Returns:
        The decoded generated sequences joined into a single string.
    """
    text_encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,  # was misspelled, so the option never reached the tokenizer
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Generate on the device the model actually lives on.
    generator = trained_model.model
    model_device = generator.device
    generated_ids = generator.generate(
        input_ids=text_encoding['input_ids'].to(model_device),
        attention_mask=text_encoding['attention_mask'].to(model_device),
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]

    return "".join(preds)

In [None]:
def summarizeText(text, trained_model, tokenizer):
    """Generate a summary of *text* with an explicit model and tokenizer.

    Args:
        text: Article text; it is padded/truncated to 512 tokens.
        trained_model: Object whose ``model`` attribute provides ``generate``,
            ``config.pad_token_id`` and ``device``.
        tokenizer: Tokenizer used to encode the text and decode the result.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    generator = trained_model.model
    text_encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Move the encoded input to the model's device so generation also works
    # when the model is not on the CPU.
    model_device = generator.device
    generated_ids = generator.generate(
        input_ids=text_encoding['input_ids'].to(model_device),
        attention_mask=text_encoding['attention_mask'].to(model_device),
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        decoder_start_token_id=generator.config.pad_token_id
    )

    # Decode the generated summary
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Assuming you have trained_model and tokenizer initialized
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to summarize text
def summarize_text(text, model, tokenizer):
    """Summarize *text* with a generation model and its tokenizer.

    Args:
        text: Article text; it is truncated to 512 tokens.
        model: Model providing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer used to encode the text and decode the result.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    # Follow the model's own device instead of assuming it matches the
    # notebook-level `device`; a mismatch makes generation fail.
    model_device = model.device
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(model_device)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the first held-out article and print it next to its summary.
sample_row = test_df.iloc[0]
article_text = sample_row['articles']
generated_summary = summarize_text(article_text, trained_model.model, tokenizer)

print("Original Text:", article_text, sep="\n")
print("\nGenerated Summary:", generated_summary, sep="\n")
