# Baseline For Testing

In [None]:
# TO RUN THIS NOTEBOOK, YOU NEED TO HAVE THE ipykernel PACKAGE INSTALLED
# YOU CAN INSTALL IT BY RUNNING `pip install ipykernel`
# OR BY UNCOMMENTING THE LINE BELOW AND RUNNING THE CELL
# You also need to have the requirements.txt installed
# !pip install ipykernel
# !pip install -r requirements.txt

In [None]:
from operator import itemgetter
import pandas as pd
from rouge_score import rouge_scorer

from typing import List, Tuple, Dict

import torch
from tqdm import tqdm

from datasets.dataset_dict import DatasetDict
from datasets import Dataset


from transformers import (
    AutoTokenizer,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)


# import torch.nn as nn

# import numpy as np

In [None]:
# Relative locations of the three CSV splits used throughout the notebook.
TRAIN_PATH, VALIDATION_PATH, TEST_PATH = (
    "../data/train.csv",
    "../data/validation.csv",
    "../data/test_text.csv",
)

In [None]:
# Load the three CSV splits into DataFrames, in train / validation / test order.
# The test split is the Kaggle submission file, for which no labels are available.
train_df, validation_df, test_df = map(
    pd.read_csv, (TRAIN_PATH, VALIDATION_PATH, TEST_PATH)
)

In [None]:
# Show the dimensions and leading rows of both labelled splits.
# `end="\n\n\n"` leaves two blank lines between the two sections.
print("Train data shape: ", train_df.shape)
print(train_df.head(), end="\n\n\n")
print("Validation data shape: ", validation_df.shape)
print(validation_df.head())

## Baseline Functions

Functions added later for other approaches should keep the same input/output format, so that they can be used interchangeably with these baselines.

In [None]:
# Baseline From the dataset


# Function that generates summaries using LEAD-N
def lead_summary(text: pd.Series, n: int = 1) -> List[Tuple[int, str]]:
    """Generate summaries using the LEAD-N method

    The summary of a text is its first `n` sentences, where sentences are
    obtained by splitting the text on "." characters.

    ## Input :
    - text : pd.Series : The text data for which the summaries are to be generated (the text column from the dataset)
    - n : int : The number of leading sentences to keep (defaults to 1, i.e. the first sentence only)

    ## Output :
    - summaries : List[Tuple[int, str]] : A list of Tuples containing the index of the text and the summary generated using the LEAD-N method

    ## Raises :
    - ValueError : If `n` is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    summaries = []
    for idx, row in text.items():
        # Splitting at most `n` times is enough to isolate the first `n` sentences.
        lead = ".".join(row.split(".", n)[:n])
        # Close the summary with a period unless the kept part already ends
        # with one (which happens when the text has fewer than `n` sentences).
        if not lead.endswith("."):
            lead += "."
        summaries.append((idx, lead))
    return summaries


# Function that generates summaries using EXT-ORACLE
def ext_oracle_summary(
    text: pd.Series,
    titles: pd.Series,
    scorer: rouge_scorer.RougeScorer,
) -> List[Tuple[int, str]]:
    """Generate summaries using the EXT-ORACLE method

    For every text, the sentence (obtained by splitting on ".") with the highest
    ROUGE-L F-measure against the reference title is selected as the summary.

    ## Input :
    - text : pd.Series : The text data for which the summaries are to be generated (the text column from the dataset)
    - titles : pd.Series : The titles of the text data (the titles column from the dataset), sharing the index of `text`
    - scorer : rouge_scorer.RougeScorer : The Rouge Scorer object, configured to compute "rougeL"

    ## Output :
    - summaries : List[Tuple[int, str]] : A list of Tuples containing the index of the text and the summary generated using the EXT-ORACLE method
    """
    summaries = []
    for idx, row in text.items():
        # `idx` is an index label coming from `text.items()`, so the reference
        # is looked up by label rather than by position.
        reference = titles.loc[idx]
        sentences = row.split(".")
        # The scorer takes (target, prediction); max() keeps the first sentence
        # among equally scored ones.
        best_sentence = max(
            sentences,
            key=lambda sentence: scorer.score(reference, sentence)["rougeL"].fmeasure,
        )
        summaries.append((idx, best_sentence))
    return summaries

In [None]:
# Test the functions on the validation data:
# build the LEAD baseline summaries for every text of the validation split.

lead_summary_validation = lead_summary(validation_df["text"])

In [None]:
# Compare the first five LEAD summaries with their reference titles.
# `end="\n\n\n"` leaves two blank lines after each pair.
for idx, summary in lead_summary_validation[:5]:
    print("Lead Summary: ", summary)
    print("Reference Summary: ", validation_df["titles"].iloc[idx], end="\n\n\n")  # type: ignore

In [None]:
# Build the EXT-ORACLE summaries for the validation split, scoring each
# sentence against the reference title with a stemmed ROUGE-L scorer.
ext_oracle_summary_validation = ext_oracle_summary(
    validation_df["text"],
    validation_df["titles"],
    rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True),
)

In [None]:
# Compare the first five EXT-ORACLE summaries with their reference titles.
# `end="\n\n\n"` leaves two blank lines after each pair.
for idx, summary in ext_oracle_summary_validation[:5]:
    print("EXT-ORACLE Summary: ", summary)
    print("Reference Summary: ", validation_df["titles"].iloc[idx], end="\n\n\n")  # type: ignore

In [None]:
def average_rouge_score(
    summaries: List[Tuple[int, str]],
    titles: pd.Series,
    scorer: rouge_scorer.RougeScorer,
) -> float:
    """Calculate the average rouge score for the summaries generated

    ## Input :
    - summaries : [(int, str)...] : A list of Tuples containing the index of the text and the summary generated
    - titles : pd.Series : The reference titles (the titles column from the dataset), indexed by the same labels as the summaries
    - scorer : rouge_scorer.RougeScorer : The Rouge Scorer object, configured to compute "rougeL"

    ## Output :
    - average_rouge : float : The average ROUGE-L F-measure of the summaries, or 0.0 when `summaries` is empty
    """
    # Nothing to average: avoid dividing by zero.
    if not summaries:
        return 0.0

    # The indices stored in `summaries` are index labels, so the reference is
    # looked up by label; the scorer takes (target, prediction).
    total = sum(
        scorer.score(titles.loc[idx], summary)["rougeL"].fmeasure
        for idx, summary in summaries
    )
    return total / len(summaries)

## New Approach

### T5

In [None]:
# Names of the T5 checkpoints that can be selected for fine-tuning.
model_checkpoint_list = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
# The first entry is the checkpoint used in the rest of the notebook.
model_checkpoint = model_checkpoint_list[0]

In [None]:
# This should load the T5TokenizerFast from the transformers library
t5_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# t5_tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)

# Otherwise we can use this one and compare the results
# t5_tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

In [None]:
# Show the tokenizer and its concrete class, one per line.
print(t5_tokenizer, type(t5_tokenizer), sep="\n")

In [None]:
# This should load the T5ForConditionalGeneration model from the transformers library,
# using the weights of the checkpoint selected in `model_checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Show the model and its concrete class, one per line.
print(model, type(model), sep="\n")

In [None]:
# Training configuration for the fine-tuning run.
# The same batch size is used per device for training and for evaluation.
batch_size = 8
# Keep only the last path component, in case the checkpoint is given as "org/name".
model_name = model_checkpoint.split("/")[-1]
training_args = Seq2SeqTrainingArguments(
    # Checkpoints are written below ../outputs, in a directory named after the model.
    output_dir=f"../outputs//{model_name}-finetuned",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # presumably makes evaluation call generate(), so the metric function
    # receives generated token ids — confirm against the Trainer docs
    predict_with_generate=True,
    evaluation_strategy="epoch",
    num_train_epochs=1,
    # NOTE(review): eval_steps is presumably ignored while evaluation_strategy
    # is "epoch" — confirm
    eval_steps=2,
    # NOTE(review): no save strategy is set here; if the library default is
    # step-based, a checkpoint is written every 2 steps — confirm this is intended
    save_steps=2,
    warmup_steps=1,
    # overwrite_output_dir=True,
    # At most two checkpoints are kept on disk.
    save_total_limit=2,
)

In [None]:
def preprocess_text(
    text: str,
    title: str,
    t5_tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
    prefix: str = "summarize: ",
    max_input_length: int = 1024,
    max_target_length: int = 64,
) -> Dict[str, List[int]]:
    """Preprocess one (text, title) pair into model inputs

    The prefixed text and the title are tokenized separately; both are
    truncated and padded to their respective maximum lengths.

    ## Input :
    - text : str : The text data to be preprocessed
    - title : str : The title of the text data, The Target
    - t5_tokenizer : PreTrainedTokenizer | PreTrainedTokenizerFast : The tokenizer object
    - prefix : str : The prefix to be added to the text data as T5 model can be used for translation as well
    - max_input_length : int : The maximum length of the input text
    - max_target_length : int : The maximum length of the target text

    ## Output :
    - model_inputs : Dict[str, List[int]] : The "input_ids" and "attention_mask" of the prefixed text,
      and the token ids of the title under "labels" (no `return_tensors` is requested, so these are
      not tensors)
    """
    # Both calls pad to the full maximum length and truncate anything longer.
    shared_options = {"padding": "max_length", "truncation": True}

    # The default prefix already ends with a space and a second one is inserted
    # here; the string is kept as is so that the inputs do not change.
    inputs = t5_tokenizer(
        f"{prefix} {text}",
        max_length=max_input_length,
        **shared_options,
    )
    targets = t5_tokenizer(
        title,
        max_length=max_target_length,
        **shared_options,
    )

    # NOTE(review): the labels keep the tokenizer's padding ids; presumably the
    # padded positions therefore count towards the training loss — confirm.
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids,
    }

In [None]:
def preprocess_from_df(df: pd.DataFrame):
    """Preprocess a whole DataFrame into a `Dataset`

    ## Input :
    - df : pd.DataFrame : The data to preprocess, with a "text" and a "titles" column

    ## Output :
    - dataset : Dataset : One record per row, each being the output of `preprocess_text`
      called with the notebook-level `t5_tokenizer`
    """
    # Walk both columns in lockstep instead of indexing each row by position.
    records = [
        preprocess_text(text, title, t5_tokenizer)
        for text, title in zip(df["text"], df["titles"])
    ]
    return Dataset.from_list(records)

In [None]:
# Construct metric
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)


def compute_metrics(eval_pred):
    """Compute the average ROUGE-L F-measure of the generated titles

    ## Input :
    - eval_pred : The (predictions, labels) pair handed over by the trainer, both holding token ids

    ## Output :
    - metrics : Dict[str, float] : The mean ROUGE-L F-measure under the "rougeL_fmeasure" key
      (0.0 when there is nothing to score)
    """
    # Local import: numpy is not imported at the top of the notebook.
    import numpy as np

    predictions, labels = eval_pred
    # Some models hand back extra outputs next to the generated ids; keep the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used to pad labels/predictions; it is not a real
    # token id, so it is swapped for the padding id before decoding.
    pad_id = t5_tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = t5_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The scorer takes (target, prediction).
    rouge_scores = [
        rouge.score(label, pred)["rougeL"].fmeasure
        for pred, label in zip(decoded_preds, decoded_labels)
    ]

    # Guard against an empty evaluation batch.
    if not rouge_scores:
        return {"rougeL_fmeasure": 0.0}
    return {"rougeL_fmeasure": sum(rouge_scores) / len(rouge_scores)}

### Running

In [None]:
# Construct the data collator that batches the tokenized examples for the model
data_collator = DataCollatorForSeq2Seq(t5_tokenizer, model=model)

In [None]:
# Tokenize the train and validation splits, in that order.
train_dataset, validation_dataset = map(preprocess_from_df, (train_df, validation_df))

In [None]:
# Group both tokenized splits under the "train" and "validation" keys.
total_dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset})
# Bare expression: displays the DatasetDict as the cell output.
total_dataset

In [None]:
# Construct the Trainer: model and training configuration first, then the
# data (splits and collator), then the evaluation metric.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=total_dataset["train"],
    eval_dataset=total_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning with the configuration given to the trainer.
trainer.train()

### Evaluation

In [None]:
def t5_summary(
    text: pd.Series,
    t5_tokenizer: PreTrainedTokenizer,
    model: AutoModelForSeq2SeqLM,
    prefix: str = "summarize: ",
    max_input_length: int = 1024,
    max_target_length: int = 64,
) -> List[Tuple[int, str]]:
    """Generate summaries using the T5 model

    Using the standard representation defined in baseline part of notebook.
    The input is built the same way as in `preprocess_text` (same prefix, same
    truncation length), so the model sees at inference what it saw in training.

    ## Input :
    - text : pd.Series : The text data for which the summaries are to be generated (the text column from the dataset)
    - t5_tokenizer : PreTrainedTokenizer : The tokenizer object
    - model : AutoModelForSeq2SeqLM : The model object
    - prefix : str : The task prefix prepended to every text
    - max_input_length : int : The maximum number of input tokens; longer texts are truncated
    - max_target_length : int : The maximum length of the generated summary

    ## Output :
    - summaries : List[Tuple[int, str]] : A list of Tuples containing the index of the text and the summary generated using the T5 model
    """
    summaries = []
    # `total` lets tqdm show a real progress bar, as items() has no length.
    for idx, row in tqdm(text.items(), total=len(text)):
        # Same "{prefix} {text}" layout as the one used to build the training inputs.
        encoded = t5_tokenizer(
            f"{prefix} {row}",
            max_length=max_input_length,
            truncation=True,
            return_tensors="pt",
        )
        # Generation runs on whichever device the model lives on.
        output = model.generate(
            encoded.input_ids.to(model.device),
            max_length=max_target_length,
            early_stopping=True,
            num_return_sequences=1,
        )
        summaries.append(
            (idx, t5_tokenizer.decode(output[0], skip_special_tokens=True))
        )
    return summaries

In [None]:
# Load the fine-tuned weights back from a checkpoint directory on disk,
# replacing the `model` object used so far.
# NOTE(review): the checkpoint step (2676) is hard-coded — confirm that this
# directory exists for the run being evaluated.

# model_name = model_checkpoint.split("/")[-1]
model = AutoModelForSeq2SeqLM.from_pretrained(
    f"../outputs/{model_name}-finetuned/checkpoint-2676"
)

In [None]:
# Run title generation for submission: one generated title per row of the
# unlabelled test split, as (index, title) tuples.
t5_summary_kaggle = t5_summary(test_df["text"], t5_tokenizer, model)

In [None]:
# Turn the (index, title) tuples into a two-column table and write it out as
# the submission CSV, without the DataFrame's own index.
# NOTE(review): presumably ../outputs/submissions must already exist — confirm.
t5_summary_kaggle_df = pd.DataFrame(t5_summary_kaggle, columns=["ID", "titles"])
t5_summary_kaggle_df.to_csv("../outputs/submissions//t5_summary_kaggle.csv", index=False)

### Testing

In [None]:
# Smoke test: show the model inputs produced for a tiny text/title pair.
print(preprocess_text("This is a test", "This is a test", t5_tokenizer))

In [None]:
# Check Encoding/Decoding size embeddings:
# round-trip the first training example through the tokenizer.
test_text, test_title = train_df["text"].iloc[0], train_df["titles"].iloc[0]
print(test_text, test_title, sep="\n")

preprocess_test = preprocess_text(test_text, test_title, t5_tokenizer)
print(preprocess_test.keys())

# Decode the input ids first, then the label ids.
decoded_text, decoded_title = (
    t5_tokenizer.decode(preprocess_test[key]) for key in ("input_ids", "labels")
)
print(decoded_text, decoded_title, sep="\n")

In [None]:
# Manual check: generate a title for the second training example and compare
# it with the reference title.
test_text = train_df["text"].iloc[1]
test_title = train_df["titles"].iloc[1]

# The raw text is tokenized here as is: no "summarize: " prefix and no truncation.
input_text = t5_tokenizer(test_text, return_tensors="pt").input_ids.to(model.device)

output = model.generate(
    input_text,
    max_length=64,
    early_stopping=True,
    num_return_sequences=1,
)
# Raw generated token ids, then the decoded title, then the reference title.
print(output)
print(t5_tokenizer.decode(output[0], skip_special_tokens=True))
print(test_title)

# Source text, for reference.
print(test_text)