# DistilBERT

### Imports and Initial Setup

In [None]:
import torch
import json
import pandas as pd

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW

### Load and Tokenize Data

In [None]:
# Read the train/test splits. The 'review' column supplies the input text and
# the 'sentiment' column supplies the labels.
train_df = pd.read_csv('../data/train_data.csv')
test_df = pd.read_csv('../data/test_data.csv')

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits go through the tokenizer with the same settings, so they are
# declared once and shared.
tokenizer_kwargs = {
    'truncation': True,
    'padding': True,
    'max_length': 512,
    'return_tensors': 'pt',
}
train_encodings = tokenizer(train_df['review'].tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(test_df['review'].tolist(), **tokenizer_kwargs)

# Labels are converted straight from the DataFrame column to tensors.
train_labels = torch.tensor(train_df['sentiment'].values)
test_labels = torch.tensor(test_df['sentiment'].values)

### Create DataLoaders and Initialize Model, Optimizer, Scheduler

In [None]:
# Create DataLoaders
# Each dataset item is an (input_ids, attention_mask, label) triple, in that
# order; the training loop unpacks batches in the same order.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
# Only the training data is shuffled; the test data keeps its original order.
train_dataloader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = 32)

# Device setup and model initialization
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2).to(device)

# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated.
# eps and weight_decay are passed explicitly with the values the transformers
# class used by default, so the optimisation settings stay the same
# (torch's own defaults are eps = 1e-8 and weight_decay = 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-6, weight_decay = 0.0)

# Initialize scheduler
# With step_size = 1 the learning rate is multiplied by gamma on every
# scheduler.step() call.
scheduler = StepLR(optimizer, step_size = 1, gamma = 0.9)

### Training Loop with Evaluation and Metrics Collection

In [None]:
# One dict of evaluation metrics is appended here per epoch.
metrics_per_epoch = []

# Run five epochs; each one is a full training pass followed by an evaluation
# pass over the test DataLoader.
for epoch in range(5):
    # --- Training pass ---
    model.train()
    for train_batch in train_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in train_batch)
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        train_out = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        train_out.loss.backward()
        optimizer.step()

    # --- Evaluation pass (no gradient tracking) ---
    model.eval()
    total_loss = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for eval_batch in test_dataloader:
            input_ids, attention_mask, labels = (t.to(device) for t in eval_batch)
            eval_out = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
            total_loss += eval_out.loss.item()
            # Predicted class is the index of the largest logit in each row.
            predicted = eval_out.logits.argmax(dim = 1)
            preds.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # --- Metrics ---
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average = 'binary')
    acc = accuracy_score(true_labels, preds)
    # The reported loss is the mean of the per-batch losses on the test set.
    avg_loss = total_loss / len(test_dataloader)
    print(f"Epoch {epoch + 1} - Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

    epoch_record = {
        'epoch': epoch + 1,
        'loss': avg_loss,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
    metrics_per_epoch.append(epoch_record)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

### Saving Metrics and Model

In [None]:
# Write the per-epoch evaluation metrics as JSON in the working directory.
# The encoding is given explicitly so the file does not depend on the
# platform's default locale.
with open('metrics_per_epoch.json', 'w', encoding = 'utf-8') as f:
    json.dump(metrics_per_epoch, f, indent = 4)

# Save the tokenizer alongside the fine-tuned weights so the output directory
# is self-contained: both model and tokenizer can then be reloaded from it
# with from_pretrained().
model_dir = '../models/distilbert_model'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)