In [None]:
# Show the NVIDIA GPU attached to this runtime; the command is missing on CPU-only runtimes.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


# **Install Required Libraries**

In [None]:
!pip install --quiet pytorch_lightning
!pip install --quiet  transformers
!pip install --quiet  seaborn
!pip install --quiet  wget

# **Import Required Libraries**

In [None]:
import pytorch_lightning as pl
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer
from tqdm.auto import tqdm

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

sns.set(style='whitegrid',palette='muted',font_scale=1.2)
rcParams['figure.figsize'] = 16, 6

In [None]:
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Seed set to 42


42

In [None]:
!pip install datasets
from datasets import load_dataset
import pandas as pd

# Download (or reuse the cached copy of) the CNN/DailyMail summarization corpus, config 3.0.0.
dataset = load_dataset("cnn_dailymail",'3.0.0')

# Convert the training split with the library's own Arrow -> pandas conversion instead of
# iterating the split row by row through the DataFrame constructor.
df = dataset['train'].to_pandas()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
 df.columns

Index(['article', 'highlights', 'id'], dtype='object')

In [None]:
# Keep only the article body and its reference highlights; the "id" column is dropped.
df = df.loc[:, ["article", "highlights"]]
df.head()

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [None]:
df.columns

Index(['article', 'highlights'], dtype='object')

# **Overview of the Dataset**

In [None]:
# Relabel the two columns (positionally) to the names the dataset class reads,
# then discard rows that have a missing value in either column.
df = df.set_axis(["text", "summary"], axis=1).dropna()
df.head()

Unnamed: 0,text,summary
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [None]:
df.shape

(287113, 2)

In [None]:
# Display the first row. `iloc` is an indexer and must be subscripted; calling it
# (`df.iloc(0)`) only returns the indexer object instead of the row.
df.iloc[0]

<pandas.core.indexing._iLocIndexer at 0x7b2ad8b93d30>

# **Train Test Split of the Data**

In [None]:
# Hold out 10% of the rows. The split seed is pinned explicitly so that re-running this
# cell yields the same partition no matter how much global RNG state was consumed before.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df.shape, test_df.shape

((258401, 2), (28712, 2))

# **Tokenize and Process the Data for Model**

In [None]:
class NewsSummaryDataset(Dataset):
    """Map-style dataset that tokenizes (text, summary) rows for seq2seq training.

    Args:
        data: Frame with a "text" column (model input) and a "summary" column (target).
        tokennizer: Tokenizer used for both columns. It is called with ``max_length``,
            ``padding``, ``truncation``, ``return_attention_mask`` and ``return_tensors``
            and must expose ``pad_token_id``.
        text_max_token_len: Length every encoded text is padded/truncated to.
        summary_max_token_len: Length every encoded summary is padded/truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokennizer: "T5Tokenizer",
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128,
    ):
        self.tokennizer = tokennizer
        # Store the frame itself. A trailing comma here would wrap it in a 1-tuple and
        # make len(dataset) == 1, i.e. a single sample per epoch.
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize the row at positional ``index``.

        Returns:
            dict with the raw ``text`` and ``summary`` strings plus 1-D tensors
            ``text_input_ids``, ``text_attention_mask``, ``labels`` and
            ``labels_attention_mask``.
        """
        data_row = self.data.iloc[index]
        text = data_row["text"]

        text_encoding = self.tokennizer(
            text,
            max_length=self.text_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        summary_encoding = self.tokennizer(
            data_row["summary"],
            max_length=self.summary_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        labels = summary_encoding["input_ids"]
        # Replace padding positions with -100, the index the cross-entropy loss ignores,
        # so padded target tokens do not contribute to the loss.
        labels[labels == self.tokennizer.pad_token_id] = -100

        return dict(
            text=text,
            summary=data_row["summary"],
            # return_tensors="pt" adds a leading batch axis of size 1; flatten removes it.
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten(),
        )


# **Dataset Preparation Module Using DataLoader**

In [None]:
class NewsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module serving the train/test frames as NewsSummaryDataset loaders.

    The ``test_df`` split backs both the validation loader and the test loader.
    """

    def __init__(
        self,
        train_df : pd.DataFrame,
        test_df : pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size : int  = 8,
        text_max_token_len : int = 512,
        summary_max_token_len :int = 128
    ):
        """Store the data frames, tokenizer and batching/tokenization limits."""
        super().__init__()

        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.train_df, self.test_df = train_df, test_df
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def setup(self, stage=None):
        """Create the train and test datasets.

        LightningDataModule.setup(stage=None) is called at the beginning of
        fit (train + validate), validate, test, or predict.
        """
        token_limits = (self.text_max_token_len, self.summary_max_token_len)
        self.train_dataset = NewsSummaryDataset(self.train_df, self.tokenizer, *token_limits)
        self.test_dataset = NewsSummaryDataset(self.test_df, self.tokenizer, *token_limits)

    def _make_loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader using the configured batch size and 2 workers."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=2,
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, True)

    def val_dataloader(self):
        """Unshuffled loader over the test dataset, used for validation."""
        return self._make_loader(self.test_dataset, False)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return self._make_loader(self.test_dataset, False)

# **Importing the Tokenizer corresponding to T5-Base**

In [None]:
# Pretrained checkpoint name; used for the tokenizer here and for the model weights
# loaded inside NewsSummaryModel.
MODEL_NAME = "t5-base"

# T5Tokenizer is the import alias for T5TokenizerFast (see the imports cell).
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# **Distribution of Sentence Length: News vs Summary**

# Encoding

# **Data Module using DataLoader and Preprocess for training**

In [None]:
# Training hyperparameters: N_EPOCHS is passed to the Trainer as max_epochs,
# BATCH_SIZE replaces the data module's default batch size of 8.
N_EPOCHS = 5
BATCH_SIZE = 16

# Token length limits are left at the data module defaults (512 for text, 128 for summary).
data_module = NewsSummaryDataModule(train_df,test_df,tokenizer,batch_size=BATCH_SIZE)

# **Model Architecture**

In [None]:
class NewsSummaryModel(pl.LightningModule):
    """LightningModule that fine-tunes the pretrained ``MODEL_NAME`` T5 checkpoint.

    Training, validation and test steps all run the same forward pass on a batch
    produced by NewsSummaryDataset and log the loss under "train_loss",
    "val_loss" and "test_loss" respectively.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped T5 model.

        Args:
            input_ids: Encoded source text.
            attention_mask: Attention mask for ``input_ids``.
            decoder_attention_mask: Attention mask for the target sequence.
            labels: Target token ids; optional.

        Returns:
            Tuple ``(loss, logits)`` taken from the model output. The loss is
            presumably ``None`` when ``labels`` is omitted (TODO confirm).
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask,
        )

        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for ``batch`` and log it under ``metric_name``."""
        loss, _ = self(
            input_ids=batch["text_input_ids"],
            attention_mask=batch["text_attention_mask"],
            decoder_attention_mask=batch["labels_attention_mask"],
            labels=batch["labels"],
        )

        self.log(metric_name, loss, prog_bar=True, logger=True)

        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as "train_loss")."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as "val_loss")."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss for one batch (logged as "test_loss")."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters with lr = 1e-4.

        Uses ``torch.optim.AdamW`` because the ``transformers.AdamW`` class is deprecated
        in favour of it. ``eps`` and ``weight_decay`` are set explicitly to what are
        believed to be the defaults of the deprecated class, so the optimization
        settings stay as before (NOTE(review): confirm against the installed
        transformers version).
        """
        return torch.optim.AdamW(self.parameters(), lr=0.0001, eps=1e-6, weight_decay=0.0)

In [None]:
model = NewsSummaryModel()

In [None]:
##%load_ext tensorboard
##%tensorboard --logdir ./lightning_logs

In [None]:
import torch
torch.cuda.is_available()

False

In [None]:
# Keep only the single best checkpoint (lowest "val_loss", the metric logged in
# validation_step) in the "checkpoints" directory.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

# TensorBoard logs go under "lightning_loss" with experiment name "news-summary".
# NOTE(review): the commented-out %tensorboard cell points at ./lightning_logs instead —
# confirm which directory is intended.
logger = TensorBoardLogger("lightning_loss",name="news-summary")



# accelerator="auto" leaves the hardware choice to Lightning.
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    accelerator="auto",
    enable_progress_bar=True
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


# **Training**

In [None]:
trainer.fit(model,datamodule = data_module)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 1: 'val_loss' reached 0.66952 (best 0.66952), saving model to '/content/checkpoints/best-checkpoint-v1.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 2: 'val_loss' reached 0.66239 (best 0.66239), saving model to '/content/checkpoints/best-checkpoint-v1.ckpt' as top 1


# **Using the Model for Inference**

In [None]:
# Restore the model from the best checkpoint path recorded by the trainer's checkpoint callback.
trained_model = NewsSummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

# Freeze the restored model: it is used for inference only from here on.
trained_model.freeze()

# **Function to Predict Summary**

In [None]:
def summarize_text(text):
    """Generate a summary of ``text`` with the fine-tuned model.

    The text is tokenized to 512 tokens (padded/truncated), moved to the device the
    module-level ``trained_model`` lives on, and decoded with 2-beam search.

    Args:
        text: The article text to summarize.

    Returns:
        The decoded generated sequence(s), joined with single spaces.
    """
    model_device = trained_model.device

    encoded = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # Inputs must sit on the same device as the model weights.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model_device)

    output_ids = trained_model.model.generate(
        input_ids=on_device["input_ids"],
        attention_mask=on_device["attention_mask"],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    # Bring the generated ids back to host memory when they were produced on a GPU.
    if model_device.type == 'cuda':
        output_ids = output_ids.cpu()

    decoded = []
    for sequence in output_ids:
        decoded.append(
            tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        )

    return " ".join(decoded)



# **Prediction on Test Data**

In [None]:
sample_row = test_df.iloc[0]
text = sample_row["text"]
model_summary = summarize_text(text)

In [None]:
text

In [None]:
model_summary

In [None]:
sample_row["summary"]

# **Prediction On Random News**

# *Vedanta's chairman Anil Agarwal earlier this week announced the biggest investment of ₹1.54 lakh crore for setting up the country's first-ever semiconductor chip plant in Gujarat. This led to strong buying on stock exchanges that drove Vedanta to rise nearly 18% this week. However, on the last trading day of the current week, Vedanta shares pulled back and slipped by at least nearly 9% on BSE after the company said the semiconductor plant project is not under their ambit but will be undertaken by Volcan Investments. On Thursday, in its regulatory filing, Vedanta said, "we reiterate that the proposed business of manufacturing semiconductors is not under Vedanta and we understand that it will be undertaken by the ultimate holding company of Vedanta, Volcan Investments." This week, in a statement, Vedanta said the company signed two Memoranda of Understanding (MoUs) with the Gujarat government to set up a semiconductor fab unit, a display fab unit, and a semiconductor assembling and testing unit in the Ahmedabad district of the state.*

In [None]:
# Sample passage used to try the summarizer on text from outside the dataset.
# Built from adjacent string literals inside parentheses: a plain double-quoted literal
# cannot span physical lines. The content is unchanged and contains no newline.
text = (
    "Artificial Intelligence (AI) is a multidisciplinary field of computer science that focuses on creating systems capable of performing tasks that typically require human intelligence. "
    "These tasks include learning, reasoning, problem-solving, perception, speech recognition, and language understanding. "
    "The ultimate goal of AI is to develop machines that can exhibit cognitive abilities similar to those of humans."
    "AI encompasses two main types: Narrow or Weak AI, and General or Strong AI. "
    "Narrow AI is designed to perform a specific task, such as image recognition or language translation, while General AI aims to possess the ability to understand, learn, and apply knowledge across diverse domains, essentially mimicking human intelligence. "
    "One of the key components driving advancements in AI is machine learning. "
    "Machine learning is a subset of AI that focuses on developing algorithms and models that enable computers to learn from data and make decisions without explicit programming. "
    "Supervised learning, unsupervised learning, and reinforcement learning are common paradigms within machine learning."
    "Deep learning, a subset of machine learning, has gained prominence for its ability to automatically learn hierarchical representations of data through artificial neural networks. "
    "Inspired by the structure and function of the human brain, deep learning has achieved remarkable success in image and speech recognition, natural language processing, and other complex tasks."
    "Natural Language Processing (NLP) is a branch of AI that facilitates interactions between computers and human languages. "
    "It enables machines to understand, interpret, and generate human language, paving the way for applications like chatbots, sentiment analysis, and language translation. "
    "AI applications are pervasive in various industries, including healthcare, finance, education, and entertainment. "
    "In healthcare, AI is employed for medical image analysis, drug discovery, and personalized medicine. "
    "Financial institutions use AI for fraud detection, risk assessment, and algorithmic trading. "
    "Educational platforms leverage AI for personalized learning experiences, while the entertainment industry utilizes AI for content recommendation and virtual assistants. "
    "However, the rapid advancement of AI also raises ethical concerns and considerations. "
    "Questions about privacy, bias in algorithms, accountability, and the impact of AI on employment are actively debated. "
    "As AI technologies continue to evolve, it is crucial to strike a balance between innovation and addressing ethical implications to ensure responsible and beneficial AI applications."
)

In [None]:
summarize_text(text)

## Evaluating the Model's Performance

In [None]:
!pip install nltk

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu

# corpus_bleu expects, for every hypothesis, a list of reference token lists, and every
# hypothesis as a token list. Passing raw strings would score character n-grams instead.
reference_summaries = [[summary.split()] for summary in test_df["summary"]]  # Ground truth summaries

# NOTE: this generates a summary for every row of test_df and can take a long time.
generated_summaries = [summarize_text(text).split() for text in test_df["text"]]

# Compute the corpus-level BLEU score (this metric is BLEU, not ROUGE).
bleu_score = corpus_bleu(reference_summaries, generated_summaries)
rouge_scores = bleu_score  # legacy name, kept for any later cell that still reads it
print("BLEU Score:", bleu_score)
