In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import numpy as np
import nltk
import evaluate
import torch, os


# Constants
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and
# the model below.
MODEL_NAME = "google-t5/t5-small"
# Token limit for article text: used as `max_length` whenever the input text is
# tokenized in this notebook (truncation is enabled at those call sites).
MAX_INPUT_LENGTH = 512
# Token limit for titles: used as `max_length` when tokenizing target titles
# and as the `max_length` cap passed to `model.generate`.
MAX_TARGET_LENGTH = 8

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Load CSVs

def _load_split(csv_path, required_columns):
    """Read one CSV split and drop rows missing a value in any required column."""
    return pd.read_csv(csv_path).dropna(subset=required_columns)


# Train/validation rows need both the article and its title; test rows only
# need the article text.
train_df = _load_split('train.csv', ["text", "title"])
val_df = _load_split('validation.csv', ["text", "title"])
test_df = _load_split('test.csv', ["text"])

# Huggingface Datasets
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)


def preprocess_function(examples):
    """Tokenize a batch of articles and, when present, their titles.

    Args:
        examples: A batch mapping column names to lists of values, as passed
            by ``Dataset.map(..., batched=True)``. Must contain ``"text"``;
            ``"title"`` is optional.

    Returns:
        The tokenizer output for ``examples["text"]``, padded/truncated to
        ``MAX_INPUT_LENGTH``. When ``"title"`` is present, a ``"labels"`` entry
        is added containing the title token ids padded/truncated to
        ``MAX_TARGET_LENGTH``, with every padding position set to ``-100``.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
        truncation=True,
    )

    # Tokenize targets (only for train/val with titles)
    if "title" in examples:
        labels = tokenizer(
            examples["title"],
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True,
        )
        # Padding positions must not contribute to the training loss. The
        # labels are already padded to a fixed length here, so the collator
        # has nothing left to pad and would keep the raw pad ids. Replace them
        # with -100 (the ignore index of the cross-entropy loss), using the
        # attention mask to tell real tokens from padding.
        model_inputs["labels"] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
        ]

    return model_inputs



# Tokenize every split with the same batched preprocessing, in
# train -> validation -> test order.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Hyper-parameters and bookkeeping options, gathered in one place so they are
# easy to scan and tweak.
_training_kwargs = {
    # Where checkpoints and logs go.
    "output_dir": "./results",
    "logging_dir": './logs',
    "logging_steps": 100,
    "save_total_limit": 3,
    # Optimisation schedule.
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # Evaluation behaviour.
    "evaluation_strategy": "epoch",
    "predict_with_generate": True,
}
training_args = Seq2SeqTrainingArguments(**_training_kwargs)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)

# Fine-tune the model on the training split.
trainer.train()



In [None]:
def generate_titles(dataset, beam=False, batch_size=32):
    """Generate one title per row of ``dataset["text"]`` with the current model.

    Args:
        dataset: Table-like object whose ``dataset["text"]`` yields the article
            strings (for example a pandas DataFrame).
        beam: If True, decode with beam search (5 beams); otherwise decode
            greedily.
        batch_size: Number of texts tokenized and passed to ``model.generate``
            at a time. Processing in chunks keeps memory bounded instead of
            pushing the whole dataset through the model in a single batch.

    Returns:
        A list of decoded title strings (special tokens removed), one per
        input text, in input order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(dataset["text"])
    device = next(model.parameters()).device

    generation_kwargs = {
        "max_length": MAX_TARGET_LENGTH,
        "num_beams": 5 if beam else 1,
    }
    if beam:
        # early_stopping only has meaning for beam search; passing it with
        # greedy decoding just produces a warning.
        generation_kwargs["early_stopping"] = True

    # Make sure dropout is disabled while generating, then restore whatever
    # mode the model was in so calling this function has no lasting effect.
    was_training = model.training
    model.eval()
    titles = []
    try:
        for start in range(0, len(texts), batch_size):
            batch = tokenizer(
                texts[start:start + batch_size],
                return_tensors="pt",
                truncation=True,
                padding=True,  # pad to the longest text in this chunk only
                max_length=MAX_INPUT_LENGTH,
            ).to(device)
            outputs = model.generate(
                input_ids=batch["input_ids"],
                # Pass the mask explicitly so padded positions are ignored.
                attention_mask=batch["attention_mask"],
                **generation_kwargs,
            )
            titles.extend(
                tokenizer.batch_decode(outputs, skip_special_tokens=True)
            )
    finally:
        model.train(was_training)

    return titles

# Add predictions: first greedy decoding, then beam search, each stored as a
# new column on the test frame.
greedy_titles = generate_titles(test_df, beam=False)
test_df["greedy_preds"] = greedy_titles

print("Beam search started..")
beam_titles = generate_titles(test_df, beam=True)
test_df["beam_preds"] = beam_titles


In [None]:
# ROUGE metric object, loaded once and reused by compute_rouge.
rouge = evaluate.load("rouge")


def compute_rouge(preds, refs):
    """Score predicted titles against reference titles with ROUGE.

    Args:
        preds: Predicted titles, forwarded as ``predictions``.
        refs: Reference titles, forwarded as ``references``.

    Returns:
        Whatever ``rouge.compute`` returns for the given pairs.
    """
    scores = rouge.compute(predictions=preds, references=refs)
    return scores

# Example: compare both decoding strategies when the test split ships with
# reference titles.
if "title" in test_df.columns:
    # The test frame is only filtered for missing "text", so some reference
    # titles may be NaN. ROUGE needs a string reference for every prediction,
    # so score only the rows that actually have a title.
    scored_df = test_df.dropna(subset=["title"])

    if scored_df.empty:
        print("No reference titles available; skipping ROUGE.")
    else:
        references = scored_df["title"].astype(str).tolist()
        greedy_scores = compute_rouge(scored_df["greedy_preds"].tolist(), references)
        beam_scores = compute_rouge(scored_df["beam_preds"].tolist(), references)

        print("Greedy Decoding ROUGE:", greedy_scores)
        print("Beam Search ROUGE:", beam_scores)
