# 1. Data Preparation

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import CosineAnnealingLR
import numpy as np
combined_df = pd.read_csv('dataset/Combined-Preprocessed-Dataset.csv')
combined_df = combined_df[['text', 'label']]

combined_df = combined_df.groupby('label').apply(lambda x: x.sample(n=100, random_state=1)).reset_index(drop=True)

# 2. Tokenization

In [2]:
device = torch.device("cuda")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
input_ids = tokenizer.batch_encode_plus(combined_df['text'].tolist(), padding=True, truncation=True, return_tensors="pt").input_ids
labels = torch.tensor(combined_df['label'].tolist())

In [3]:
# 3. Dataset and DataLoader
dataset = TensorDataset(input_ids, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

batch_size = 16
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [4]:
# 4. Model Configuration
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
epoch = 5
learning_rate = 5e-4
weight_decay = 1e-3
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = CosineAnnealingLR(optimizer, T_max=epoch)




In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

train_loss = []
val_loss = []
train_accuracy = []
val_accuracy = []
train_precision = []
val_precision = []
train_recall = []
val_recall = []
train_f1 = []
val_f1 = []

for e in range(epoch):
    model.train()
    train_loss_epoch = []
    train_preds = []
    train_true = []
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss
        train_loss_epoch.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        # Calculate train metrics
        pred = torch.argmax(outputs.logits, dim=2)
        train_preds.extend(pred.flatten().tolist())
        train_true.extend(labels.flatten().tolist())
        
    model.eval()
    val_loss_epoch = []
    val_preds = []
    val_true = []
    for batch in val_dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss
        val_loss_epoch.append(loss.item())
        
        # Calculate validation metrics
        pred = torch.argmax(outputs.logits, dim=2)
        val_preds.extend(pred.flatten().tolist())
        val_true.extend(labels.flatten().tolist())
    
    # Calculate and store metrics
    train_loss.append(np.mean(train_loss_epoch))
    val_loss.append(np.mean(val_loss_epoch))
    train_accuracy.append(accuracy_score(train_true, train_preds))
    val_accuracy.append(accuracy_score(val_true, val_preds))
    train_precision.append(precision_score(train_true, train_preds, average='weighted'))
    val_precision.append(precision_score(val_true, val_preds, average='weighted'))
    train_recall.append(recall_score(train_true, train_preds, average='weighted'))
    val_recall.append(recall_score(val_true, val_preds, average='weighted'))
    train_f1.append(f1_score(train_true, train_preds, average='weighted'))
    val_f1.append(f1_score(val_true, val_preds, average='weighted'))


ValueError: Found input variables with inconsistent numbers of samples: [160, 81920]

In [None]:
# 9. Plotting metrics
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(train_loss, label='Train Loss')
plt.plot(val_loss, label='Validation Loss')
plt.title('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_accuracy, label='Train Accuracy')
plt.plot(val_accuracy, label='Validation Accuracy')
plt.title('Accuracy')
plt.legend()

plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
from scipy.optimize import brentq
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

# 10. Evaluation

# Final Metrics on Validation Set
final_val_preds = []  # Store your final validation predictions here
final_val_labels = []  # Store the actual labels here

# Calculate basic metrics
final_val_accuracy = accuracy_score(final_val_labels, final_val_preds)
final_val_precision = precision_score(final_val_labels, final_val_preds, average='weighted')
final_val_recall = recall_score(final_val_labels, final_val_preds, average='weighted')
final_val_f1 = f1_score(final_val_labels, final_val_preds, average='weighted')

print(f"Final Validation Accuracy: {final_val_accuracy}")
print(f"Final Validation Precision: {final_val_precision}")
print(f"Final Validation Recall: {final_val_recall}")
print(f"Final Validation F1 Score: {final_val_f1}")

# ROC and AUC
fpr, tpr, _ = roc_curve(final_val_labels, final_val_preds)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# DET Curve
fnr = 1 - tpr
eer = brentq(lambda x : 1. - x - interp1d(fpr, fnr)(x), 0., 1.)
thresh = interp1d(fpr, thresholds)(eer)

plt.figure()
plt.plot(fpr, fnr, label='EER = {:.3f}'.format(eer))
plt.scatter(eer, 1-eer, c='red')
plt.title('DET Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('False Negative Rate')
plt.legend(loc='upper right')
plt.grid(True)
plt.show()
