In [None]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW, get_linear_schedule_with_warmup, set_seed


In [None]:
# Hyperparameters shared by the cells below; tune them here rather than inline.

seed = 69  # passed to set_seed() and used as random_state for the train/validation split
batch_size = 32  # batch size of both the training and the validation DataLoader
epochs = 4  # number of training epochs; also fixes the scheduler's total step count
n_classes = 2  # num_labels handed to GPT2ForSequenceClassification
max_length = 128  # default max_length of TweetDataset (texts are padded/truncated to it)
learning_rate = 2e-5  # learning rate the optimizer is created with

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
path = 'data/train.csv'

# Fix the random seed (transformers.set_seed) before any data splitting or model creation.
set_seed(seed)

# Load the dataset; later cells read its 'text' and 'target' columns.
df = pd.read_csv(path)

def clean_text(text):
    """Normalise a raw tweet: lower-case it and strip @mentions and http(s) links.

    Args:
        text: Raw tweet text. Missing values (``None`` or a float NaN, as pandas
            produces for empty CSV cells) are treated as an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: The cleaned text. Whitespace left behind by removed mentions/links is
        not collapsed.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # Drop @mentions (letters, digits and underscores after the '@').
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # Drop links up to the next whitespace. The previous character class stopped at
    # '-', '_', '?', '=', '%', ... and left the tail of such URLs in the text.
    text = re.sub(r'https?://\S+', '', text)
    return text

# Normalise every tweet before it reaches the tokenizer.
df['text'] = df['text'].apply(clean_text)

# Pull the inputs and their labels out of the frame as plain Python lists.
texts, targets = df['text'].tolist(), df['target'].tolist()

# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
(train_texts, val_texts,
 train_targets, val_targets) = train_test_split(texts, targets, test_size=0.1, random_state=seed)


In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token for padding (presumably because the pretrained GPT-2
# tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): padding is applied on the left. The Hugging Face GPT-2 docs generally advise
# right padding for this absolute-position model, and which token the classification head
# pools can depend on the installed transformers version — confirm left padding is intended.
tokenizer.padding_side = 'left'


# Pretrained GPT-2 with a sequence-classification head producing `n_classes` labels.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=n_classes)
# Tell the model which token id is padding, mirroring the tokenizer's pad == EOS choice.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

print(f'Model loaded to {device}')

In [None]:

# Create custom dataset
class TweetDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Items are tokenized lazily on every access (nothing is cached) and are padded or
    truncated to ``max_length`` tokens.
    """

    def __init__(self, texts, targets, tokenizer, max_length=max_length):
        """Store the raw examples and the tokenizer used to encode them.

        Args:
            texts: Indexable collection of input strings.
            targets: Indexable collection of integer class labels, aligned with ``texts``.
            tokenizer: Callable Hugging Face tokenizer used to encode each text.
            max_length: Token length every example is padded/truncated to. Defaults to
                the notebook-level ``max_length`` at the time the class is defined.
        """
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` as flattened 1-D tensors, and
            ``'labels'`` as a scalar ``torch.long`` tensor.
        """
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`; for a single text the result is the same.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields batched tensors; flatten them to 1-D per example.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.targets[idx], dtype=torch.long),
        }

# Wrap each split in a TweetDataset, then batch it. Only the training batches are shuffled;
# validation keeps its original order.
train_dataset = TweetDataset(texts=train_texts, targets=train_targets, tokenizer=tokenizer)
val_dataset = TweetDataset(texts=val_texts, targets=val_targets, tokenizer=tokenizer)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


In [None]:

def train(model, train_loader, optimizer, scheduler, device):
    """Run one training epoch and report its accuracy and mean batch loss.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``; its
            output must expose ``.loss`` and ``.logits``.
        train_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(accuracy, avg_loss)`` where ``accuracy`` is the fraction
        of correctly predicted examples and ``avg_loss`` is the mean of the per-batch
        losses.

    Raises:
        ValueError: If ``train_loader`` yields no examples.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    # Counted while iterating so the loader does not have to implement __len__.
    n_batches = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        logits = outputs.logits
        loss = outputs.loss

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        # Accumulate plain Python ints so the counters have one type from the start.
        correct_predictions += (preds == labels).sum().item()
        total_predictions += len(labels)
        n_batches += 1

    if n_batches == 0 or total_predictions == 0:
        # Previously surfaced as an opaque ZeroDivisionError.
        raise ValueError('train_loader yielded no examples; cannot compute training metrics')

    avg_loss = total_loss / n_batches
    accuracy = correct_predictions / total_predictions
    return accuracy, avg_loss

def evaluate(model, val_loader, device):
    """Evaluate the model on a validation loader without updating its weights.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``; its
            output must expose ``.loss`` and ``.logits``.
        val_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple[float, float, str]: ``(accuracy, avg_loss, report)`` where ``accuracy`` is
        the fraction of correctly predicted examples, ``avg_loss`` is the mean of the
        per-batch losses and ``report`` is the value returned by
        ``classification_report(all_labels, all_preds)``.

    Raises:
        ValueError: If ``val_loader`` yields no examples.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    # Counted while iterating so the loader does not have to implement __len__.
    n_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            logits = outputs.logits
            total_loss += outputs.loss.item()

            preds = torch.argmax(logits, dim=1)
            # Accumulate plain Python ints so the counters have one type from the start.
            correct_predictions += (preds == labels).sum().item()
            total_predictions += len(labels)
            n_batches += 1

            # Keep every prediction/label (as Python ints) for the per-class report.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if n_batches == 0 or total_predictions == 0:
        # Previously surfaced as an opaque ZeroDivisionError.
        raise ValueError('val_loader yielded no examples; cannot compute validation metrics')

    avg_loss = total_loss / n_batches
    accuracy = correct_predictions / total_predictions
    report = classification_report(all_labels, all_preds)
    return accuracy, avg_loss, report



In [None]:
# Set up optimizer and scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0 is passed
# explicitly because PyTorch defaults to 0.01 whereas the transformers implementation
# defaulted to 0.0, so training stays free of weight decay as before.
# NOTE(review): `AdamW` is still imported from transformers at the top of the notebook; that
# import has to go when moving to a transformers release that no longer ships it.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.0)

# train() steps the scheduler once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_loader) * epochs

# Linear learning-rate schedule over `total_steps` with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [None]:
# Fit the model for `epochs` passes over the training data, validating after each pass and
# recording the per-epoch metrics so they can be plotted afterwards.
all_loss = dict(train_loss=[], val_loss=[])
all_acc = dict(train_acc=[], val_acc=[])
print('Training')

for epoch in tqdm(range(epochs)):
    # Epoch banner: blank line, "Epoch i/N", dashed rule.
    print('', f'Epoch {epoch + 1}/{epochs}', '-' * 10, sep='\n')

    # Optimisation pass over the training batches.
    print('Training on batches')
    train_acc, train_loss = train(model, train_loader, optimizer, scheduler, device)
    all_loss['train_loss'].append(train_loss)
    all_acc['train_acc'].append(train_acc)
    print(f'Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}')

    # Validation pass, followed by the per-class report for this epoch.
    print('Evaluating on validation set')
    val_acc, val_loss, report = evaluate(model, val_loader, device)
    all_loss['val_loss'].append(val_loss)
    all_acc['val_acc'].append(val_acc)
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')
    print(report)




In [None]:
# Draw one figure per metric, each comparing the training curve with the validation curve
# across epochs.
curve_specs = (
    ("Training and Validation Loss", "Loss", all_loss["train_loss"], all_loss["val_loss"]),
    ("Training and Validation Accuracy", "Accuracy", all_acc["train_acc"], all_acc["val_acc"]),
)

for fig_title, y_label, train_curve, val_curve in curve_specs:
    plt.figure(figsize=(10, 5))
    plt.title(fig_title)
    plt.plot(train_curve, label="train")
    plt.plot(val_curve, label="validation")
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Persist the fine-tuned weights (the state_dict only, not the whole module object).
# NOTE(review): the leading '/' makes this an absolute path at the filesystem root, unlike the
# relative 'data/train.csv' used for loading — confirm this is the intended location.
model_path = Path('/Trained models/gpt.pt')
# torch.save does not create missing directories; make sure the target folder exists so a
# finished training run is not lost to a missing-directory error.
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)