In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install --upgrade torch==2.5.0 torchvision torchaudio \
    pytorch-lightning transformers==4.41.0 scikit-learn pandas==2.2.2 tqdm --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.5/906.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m118.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import pandas as pd
print(f"PyTorch version: {torch.__version__}")
print(f"Pandas version: {pd.__version__}")


PyTorch version: 2.5.0+cu124
Pandas version: 2.2.2


In [4]:
import torch
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os
import pytorch_lightning as pl
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn import metrics


In [5]:
import pandas as pd

# Set random seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# # Placeholder function for dataset loading (to be completed later)
# def get_dataset(tokenizer, type_path, hparams):
#     """Load the dataset from the correct CSV file."""
#     if type_path == "train":
#         return SarcasmDataset(tokenizer, "Train_Dataset.csv", max_len=hparams.max_seq_length)
#     elif type_path in ["test", "val"]:
#         return SarcasmDataset(tokenizer, "Test_Dataset.csv", max_len=hparams.max_seq_length)

def get_dataset(tokenizer, type_path, hparams):
    file_path = f"/content/{type_path}_Dataset.csv"
    print(f"Loading dataset from: {file_path}")

    df = pd.read_csv(file_path)[["tweet", "sarcastic"]]
    print(df.head())

    dataset = SarcasmDataset(tokenizer, df, hparams.max_len)
    return dataset

In [6]:
set_seed(42)
print("Random seed set to 42")


Random seed set to 42


In [7]:
from torch.utils.data import Dataset

# Define SarcasmDataset class for T5 training
class SarcasmDataset(Dataset):
    def __init__(self, tokenizer, df, max_len=512):
        self.max_len = max_len
        self.df = df
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []


        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()
        target_mask = self.targets[index]["attention_mask"].squeeze()

        return {
            "source_ids": source_ids,
            "source_mask": src_mask,
            "target_ids": target_ids,
            "target_mask": target_mask
        }

    def _build(self):
        for index, row in self.df.iterrows():
            line = row['tweet'].strip()
            target = "positive </s>" if row['sarcastic'] == 1 else "negative </s>"

            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [line], max_length=self.max_len, padding="max_length", return_tensors="pt"
            )

            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=2, padding="max_length", return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)





In [8]:
# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# train_dataset = get_dataset(tokenizer, "train", hparams)
# test_dataset = get_dataset(tokenizer, "test", hparams)

# print(f"Train dataset size: {len(train_dataset)}")
# print(f"Test dataset size: {len(test_dataset)}")
# print(train_dataset[0]) 


In [9]:
# # Load tokenizer
# tokenizer = T5Tokenizer.from_pretrained("t5-base")

# # Create dummy data for testing
# import pandas as pd
# data = {"tweet": ["This is amazing!", "Oh great, another Monday..."], "sarcastic": [0, 1]}
# df = pd.DataFrame(data)

# # Initialize dataset
# dataset = SarcasmDataset(tokenizer, df, max_len=512)

# # Check dataset output
# print(dataset[0])  # Print first data sample


In [10]:
import torch
import pytorch_lightning as pl
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score

# Define T5FineTuner class
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()

        # Fix: Convert Namespace to dictionary to avoid KeyError
        self.save_hyperparameters(vars(hparams))

        # Load pre-trained T5 model
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,  # Ensure compatibility with latest transformers
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100  # Ignore padding token

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs.loss  # Extract loss
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": self.hparams["weight_decay"]},
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams["learning_rate"], eps=self.hparams["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams["warmup_steps"], num_training_steps=self.hparams["num_train_epochs"]
        )
        return [optimizer], [scheduler]


In [20]:
from argparse import Namespace

# Define hyperparameters for training
hparams = Namespace(
    model_name_or_path="t5-base",
    tokenizer_name_or_path="t5-base",
    max_len=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=64,
    eval_batch_size=128,
    num_train_epochs=2,
    gradient_accumulation_steps=4,
    n_gpu=1,
)

# Initialize model
model = T5FineTuner(hparams)
print("T5FineTuner initialized successfully!")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5FineTuner initialized successfully!


In [21]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

train_dataset = get_dataset(tokenizer, "Train", hparams)
test_dataset = get_dataset(tokenizer, "Test", hparams)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(train_dataset[0])  


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading dataset from: /content/Train_Dataset.csv
                                               tweet  sarcastic
0  The only thing I got from college is a caffein...          1
1  I love it when professors draw a big question ...          1
2  Remember the hundred emails from companies whe...          1
3  Today my pop-pop told me I was not “forced” to...          1
4  @VolphanCarol @littlewhitty @mysticalmanatee I...          1
Loading dataset from: /content/Test_Dataset.csv
                                               tweet  sarcastic
0  Size on the the Toulouse team, That pack is mo...          0
1                                           Pinball!          0
2  So the Scottish Government want people to get ...          1
3  villainous pro tip : change the device name on...          0
4                    I would date any of these men 🥺          0




Train dataset size: 6934
Test dataset size: 1400
{'source_ids': tensor([   37,   163,   589,    27,   530,    45,  1900,    19,     3,     9,
        24355,  9189,     1,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,


In [22]:
from torch.utils.data import DataLoader

# # Load tokenizer
# tokenizer = T5Tokenizer.from_pretrained("t5-base")

# # Load dataset (replace with your real dataset later)
# import pandas as pd
# data = {"tweet": ["I love Mondays!", "Oh great, another meeting..."], "sarcastic": [0, 1]}
# df = pd.DataFrame(data)

# # Create dataset
# train_dataset = SarcasmDataset(tokenizer, df, max_len=512)
# val_dataset = SarcasmDataset(tokenizer, df, max_len=512)  # For testing, use the same dataset

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=hparams.train_batch_size, shuffle=True)
val_dataloader = DataLoader(test_dataset, batch_size=hparams.eval_batch_size)


In [23]:
import time
import torch
from torchmetrics import F1Score

def compute_f1_score(model, dataloader, tokenizer):
    start_time = time.time()  

    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            batch_start_time = time.time()  

            input_ids = batch["source_ids"].to(model.device)
            attention_mask = batch["source_mask"].to(model.device)

            with torch.autocast("cuda"):  
                outputs = model.model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2, use_cache=True)

            preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
            targets = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["target_ids"]]

            predictions.extend(preds)
            references.extend(targets)

            batch_end_time = time.time()  
            print(f"Processed {i+1}/{len(dataloader)} batches in {batch_end_time - batch_start_time:.2f} seconds")

    f1 = f1_score(references, predictions, average="weighted") 
    end_time = time.time()  

    print(f"Total F1-score computation time: {end_time - start_time:.2f} seconds")
    return f1


In [25]:
from pytorch_lightning.callbacks import ModelCheckpoint

# Set model checkpoint to save best models
checkpoint_callback = ModelCheckpoint(
    dirpath="./t5_isarcasm",
    filename="checkpoint-{epoch:02d}-{val_loss:.2f}",
    monitor="val_loss",
    mode="min",
    save_top_k=3,  # Keep top 3 models
)

# Define Trainer
trainer = pl.Trainer(
    max_epochs=hparams.num_train_epochs,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1 if torch.cuda.is_available() else "auto",
    precision=16 if torch.cuda.is_available() else 32,
    gradient_clip_val=hparams.gradient_accumulation_steps,
    callbacks=[checkpoint_callback],
)


# Start training
trainer.fit(model, train_dataloader, val_dataloader)


/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /content/t5_isarcasm exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M  | eval
---

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [26]:
f1 = compute_f1_score(model, val_dataloader, tokenizer)
print(f"Final F1 Score on Test Set: {f1:.4f}")


Processed 1/11 batches in 46.41 seconds
Processed 2/11 batches in 47.09 seconds
Processed 3/11 batches in 46.19 seconds
Processed 4/11 batches in 46.11 seconds
Processed 5/11 batches in 46.52 seconds
Processed 6/11 batches in 46.66 seconds
Processed 7/11 batches in 46.51 seconds
Processed 8/11 batches in 46.42 seconds
Processed 9/11 batches in 46.64 seconds
Processed 10/11 batches in 46.58 seconds
Processed 11/11 batches in 43.31 seconds
Total F1-score computation time: 508.48 seconds
Final F1 Score on Test Set: 0.8044
