# **T5-Sentinel Model**

# 1. Train, Validation, and Test Split

### 1.1 The preprocessed dataset is read from the csv file. The extracted features are converted from string representation of lists into actual lists of numbers. The lists of numbers are then converted into numpy arrays. The labels are also converted into numpy arrays.

In [1]:
import pandas as pd
import numpy as np
# Load the data
combined_df = pd.read_csv('dataset/Combined-Preprocessed-Dataset.csv')
# Randomly pick 100 data points from each class
combined_df = combined_df.groupby('label').apply(lambda x: x.sample(n=100, random_state=1)).reset_index(drop=True)

### 1.2 The data is split into train, validation, and test sets. The train set is 67.5% of the data, the validation set is 12.5% of the data, and the test set is 20% of the data. Validation set is used to tune the hyperparameters of the model. The test set is used to evaluate the model.

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split

# Load and preprocess data

# Modify labels for T5 classification
combined_df['t5_label'] = combined_df['label'].apply(lambda x: "positive </s>" if x == 1 else "negative </s>")

# Initialize T5 tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Encode texts and labels
texts = combined_df['text'].tolist()
labels = combined_df['t5_label'].tolist()

# Tokenize
input_encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=512, return_tensors="pt").to(device)
input_ids = input_encodings['input_ids']
attention_masks = input_encodings['attention_mask']

# Data Split
train_inputs, test_inputs, train_masks, test_masks = train_test_split(input_ids, attention_masks, test_size=0.2)
train_labels, test_labels = train_test_split(combined_df['t5_label'].values, test_size=0.2)
train_inputs, val_inputs, train_masks, val_masks = train_test_split(train_inputs, train_masks, test_size=0.125)
train_labels, val_labels = train_test_split(train_labels, test_size=0.125)



In [3]:
combined_df['t5_label'].describe()

count               200
unique                2
top       negative </s>
freq                100
Name: t5_label, dtype: object

# 2. Model Training

In [4]:
# Hyperparameters
epochs = 5
batch_size = 1 
learning_rate = 5e-4
weight_decay = 1e-3

optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_inputs) * epochs)
criterion = torch.nn.CrossEntropyLoss()

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_cosine_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt


# Lists to store metrics
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
train_precisions = []
val_precisions = []
train_f1s = []
val_f1s = []

# Fine-tune T5 model
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    train_preds, train_true = [], []

    for i in range(0, len(train_inputs), batch_size):
        input_batch = train_inputs[i:i+batch_size].to(device)
        mask_batch = train_masks[i:i+batch_size].to(device)
        label_batch = torch.tensor(train_labels[i:i+batch_size]).to(device)


        optimizer.zero_grad()
        outputs = model(input_ids=input_batch, attention_mask=mask_batch, labels=label_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()

        logits = outputs.logits.argmax(dim=-1)[:, 0].detach().cpu().numpy()  # Take the first token's predicted ID
        labels = label_batch[:, 0].cpu().numpy()  # Take the first token's true ID
        train_preds.extend(logits)
        train_true.extend(labels)

    # Calculate metrics for training
    train_accuracy = accuracy_score(train_true, train_preds)
    train_precision = precision_score(train_true, train_preds, average='weighted')
    train_f1 = f1_score(train_true, train_preds, average='weighted')

    # Validation
    model.eval()
    val_loss = 0.0
    val_preds, val_true = [], []
    with torch.no_grad():
        for i in range(0, len(val_inputs), batch_size):
            input_batch = val_inputs[i:i+batch_size].to(device)
            label_batch = val_labels[i:i+batch_size].to(device)
            outputs = model(input_ids=input_batch, labels=label_batch)
            val_loss += outputs.loss.item()

            # For accuracy, precision, f1
            logits = outputs.logits.argmax(dim=-1)[:, 0].detach().cpu().numpy()  # Take the first token's predicted ID
            labels = label_batch[:, 0].cpu().numpy()  # Take the first token's true ID
            val_preds.extend(logits)
            val_true.extend(labels)

    # Calculate metrics for validation
    val_accuracy = accuracy_score(val_true, val_preds)
    val_precision = precision_score(val_true, val_preds, average='weighted')
    val_f1 = f1_score(val_true, val_preds, average='weighted')

    # Store metrics
    train_losses.append(running_loss/len(train_inputs))
    val_losses.append(val_loss/len(val_inputs))
    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)
    train_precisions.append(train_precision)
    val_precisions.append(val_precision)
    train_f1s.append(train_f1)
    val_f1s.append(val_f1)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}")



TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
# Plotting metrics

plt.figure(figsize=(15, 5))

# Plotting Loss
plt.subplot(1, 4, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Val Loss')
plt.title('Loss')
plt.legend()

# Plotting Accuracy
plt.subplot(1, 4, 2)
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(val_accuracies, label='Val Accuracy')
plt.title('Accuracy')
plt.legend()

# Plotting Precision
plt.subplot(1, 4, 3)
plt.plot(train_precisions, label='Train Precision')
plt.plot(val_precisions, label='Val Precision')
plt.title('Precision')
plt.legend()

# Plotting F1 Score
plt.subplot(1, 4, 4)
plt.plot(train_f1s, label='Train F1')
plt.plot(val_f1s, label='Val F1')
plt.title('F1 Score')
plt.legend()

plt.tight_layout()
plt.show()

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import seaborn as sns
# Evaluate the model on the test set
test_preds, test_true = [], []

with torch.no_grad():
    for i in range(0, len(test_inputs), batch_size):
        input_batch = test_inputs[i:i+batch_size].to(device)
        label_batch = test_labels[i:i+batch_size].to(device)

        outputs = model(input_ids=input_batch, labels=label_batch)
        # For accuracy, precision, f1
        logits = outputs.logits.argmax(dim=-1)[:, 0].detach().cpu().numpy()  # Take the first token's predicted ID
        labels = label_batch[:, 0].cpu().numpy()  # Take the first token's true ID
        test_preds.extend(logits)
        test_true.extend(labels)

# Calculate metrics for test set
test_accuracy = accuracy_score(test_true, test_preds)
test_precision = precision_score(test_true, test_preds, average='weighted')
test_recall = recall_score(test_true, test_preds, average='weighted')
test_f1 = f1_score(test_true, test_preds, average='weighted')

# Handle both binary and multi-class cases for FPR and FNR
confusion = confusion_matrix(test_true, test_preds)
if confusion.shape == (2, 2):  # Binary classification
    tn, fp, fn, tp = confusion.ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
else:  # Multi-class classification
    fp = confusion.sum(axis=0) - np.diag(confusion)
    fn = confusion.sum(axis=1) - np.diag(confusion)
    tp = np.diag(confusion)
    tn = confusion.sum() - (fp + fn + tp)
    fpr = fp.sum() / (fp.sum() + tn.sum())
    fnr = fn.sum() / (fn.sum() + tp.sum())

# Display metrics
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# ROC Curve & AUC
from sklearn.preprocessing import label_binarize
test_true_bin = label_binarize(test_true, classes=[0, 1])
test_preds_bin = label_binarize(test_preds, classes=[0, 1])
fpr_roc, tpr_roc, _ = roc_curve(test_true_bin, test_preds_bin)
roc_auc = auc(fpr_roc, tpr_roc)

plt.figure()
plt.plot(fpr_roc, tpr_roc, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# DET Curve
plt.figure()
plt.plot(fpr, fnr, color='darkorange', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('False Negative Rate')
plt.title('Detection Error Tradeoff (DET) Curve')
plt.show()

In [None]:
# Save the model
torch.save(model.state_dict(), 't5_sentinel_model.pth')