## Baseline
This notebook implements a baseline model, which shows you how to handle the data and to provide a first very simple solution to the problem. You may re-use and modify any part of this notebook.

In [1]:
import os
import csv
import torch
import pickle
import re
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from torch.amp import GradScaler, autocast
from torch.nn.utils import clip_grad_norm_
from ipywidgets import FloatProgress
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchmetrics import AUROC, F1Score
from transformers import BertTokenizer, BertModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

In [2]:
data_dir = os.path.join(os.getcwd(), 'kaggle_data')

In [3]:
torch.manual_seed(0)

<torch._C.Generator at 0x1d1d86430b0>

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device used : {device}')

Device used : cuda


We first start by defining the dataset class which takes as input the path to the data and the mode (`train`, `val`, or `train`). This fits a count vectorizer using the training set, and uses it on the validation and test sets.

In [5]:
def clean(comment):
    comment = comment.lower()
    comment = re.sub("\\n", " ", comment)
    comment = re.sub("\\r", " ", comment)
    return comment

In [6]:
class BaselineDataset(Dataset):
    def __init__(self, data_dir, mode, vectorizer=None):
        super(BaselineDataset, self).__init__()
        assert mode in ['train', 'val', 'test']
        self.mode = mode

        # Load data
        self.data = pd.read_csv(os.path.join(data_dir, f'{mode}_x.csv'), index_col=0)
        self.data.fillna("", inplace=True)
        self.data["string"] = self.data["string"].apply(lambda x: clean(x))

        # Load labels
        if self.mode != 'test':
            self.label = pd.read_csv(os.path.join(data_dir, f'{mode}_y.csv'))

        # Vectorizer
        if self.mode == 'train':
            self.vectorizer = vectorizer
            self.data_transformed = self.vectorizer.fit_transform(self.data["string"])
        else:
            self.vectorizer = vectorizer
            self.data_transformed = self.vectorizer.transform(self.data["string"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = torch.tensor(self.data_transformed[idx].toarray()).float()
        if self.mode == 'test':
            return x, idx
        else:
            y = torch.tensor([self.label.loc[idx, 'y']])
            return x, y, idx

In [7]:
vectorizer = TfidfVectorizer(stop_words='english')
train_dataset = BaselineDataset(data_dir, 'train', vectorizer)
val_dataset = BaselineDataset(data_dir, 'val', train_dataset.vectorizer)

In [8]:
print(f"Number of comments: {len(train_dataset)}\nNumber of de words in the vocabulary (number of features) : {len(train_dataset[0][0][0])}")

Number of comments: 269038
Number of de words in the vocabulary (number of features) : 126050


The CountVectorizer generates a 269038 x 126364 matrix. \
Each row represents a tweet and each column corresponds to a word in the vocabulary. \
The value at position (i,j) indicates how many times word j appears in tweet i

In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

We will define two models, one which will be a simple MLP, and another one which will generate random predictions to use as comparison.

In [10]:
class BaselineClassifier(nn.Module):
    
    def __init__(self, input_size, hidden_dim):
        super(BaselineClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        return x

In [11]:
class ImprovedClassifier(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(ImprovedClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.ln2 = nn.LayerNorm(hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        self.dropout = nn.Dropout(0.5)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        # Xavier initialization for linear layers
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.ln1(F.relu(self.fc1(x)))
        x = self.dropout(x)
        x = self.ln2(F.relu(self.fc2(x)))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc3(x))  # Raw logits for BCEWithLogitsLoss
        return x

In [12]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_dim, num_layers, dropout_prob=0.3):
        super(RNNClassifier, self).__init__()
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_prob if num_layers > 1 else 0.0,
            bidirectional=True
        )
        
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)  # Normalize LSTM output
        
        # Fully connected layers
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout_prob)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x, lengths=None):
        if lengths is not None:
            x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        
        lstm_out, _ = self.lstm(x)
        
        if lengths is not None:
            lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        
        lstm_out = self.layer_norm(lstm_out[:, -1, :])  # Last hidden state
        
        # Fully connected layers
        x = F.relu(self.fc1(lstm_out))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))  # Binary classification output
        
        return x

Let's check the performance of the random classifier on the validation set.

In [13]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.
        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
            y [pandas.DataFrame]: dataframe containing the metadata
        returns:
            wga [float]: worst group accuracy
    """
    y.loc[prediction.index, 'pred'] = prediction.pred

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            group = y.loc[y[category] == label]
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            accuracies.append(group_accuracy)
    wga = np.min(accuracies)
    return wga

Quelle est l'accuracy selon le groupe en question (male, female, chistian, etc...) \
On veut minimiser la loss et maximiser la wga

Now let's train the MLP baseline classifier.

In [19]:
input_size = len(train_dataset.vectorizer.get_feature_names_out())
LR = 1e-5
epochs = 3
val_epoch=1
# model = ImprovedClassifier(input_size=768, hidden_dim=128).to(device)
model = RNNClassifier(input_size=input_size, hidden_dim=128, num_layers=2).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss()

In [20]:
scaler = GradScaler('cuda')

# Initialize early stopping parameters
patience = 3
best_val_loss = float('inf')
early_stop_counter = 0

# Training Loop
for epoch in range(epochs):
    model.train()
    train_losses, train_predictions, train_indices = [], [], []
    
    for x, y, idx in tqdm(train_dataloader, leave=False):
        optimizer.zero_grad()
        pred = model(x.to(device))
        loss = criterion(pred.squeeze(), y.to(device).squeeze().float())
        loss.backward()
        optimizer.step()

        train_losses.extend([loss.item()] * len(y))
        train_predictions.extend(pred.detach().squeeze().tolist())
        train_indices.extend(idx.tolist())

    pred_df = pd.DataFrame({'index': train_indices, 'pred': train_predictions})
    train_loss = np.mean(train_losses)
    train_wga = worst_group_accuracy(pred_df, y=train_dataloader.dataset.label)
    print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, WGA: {train_wga:.4f}')

    # Validation Phase
    if epoch % val_epoch == 0:
        model.eval()
        val_losses, val_predictions, val_indices = [], [], []
        for x, y, idx in tqdm(val_dataloader, leave=False):
            with torch.no_grad():
                pred = model(x.to(device))
                loss = criterion(pred.squeeze(), y.to(device).squeeze().float())
                val_losses.extend([loss.item()] * len(y))
                val_predictions.extend(pred.detach().squeeze().tolist())
                val_indices.extend(idx.tolist())

        pred_df = pd.DataFrame({'index': val_indices, 'pred': val_predictions})
        val_loss = np.mean(val_losses)
        val_wga = worst_group_accuracy(pred_df, val_dataloader.dataset.label)
        print(f'Validation Loss: {val_loss:.4f}, WGA: {val_wga:.4f}')

        # Early Stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            early_stop_counter += 1

        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

                                                   

Epoch 1/3 - Train Loss: 0.6970, WGA: 0.6888


                                                 

Validation Loss: 0.6931, WGA: 0.6774


                                                   

Epoch 2/3 - Train Loss: 0.6931, WGA: 0.6856


                                                 

Validation Loss: 0.6931, WGA: 0.6774


                                                   

KeyboardInterrupt: 

Once we are happy with our results, we want to make a prediction on the test set. Your submission `.csv` file should contain 2 columns:
- ID: with the id of each prediction (do not shuffle to not mix things up)
- pred: the prediction of the model (thresholded or not)

In [49]:
#model = RandomClassifier()
test_dataset = BaselineDataset(data_dir, 'test', train_dataset.vectorizer)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
model.eval()
test_predictions, indices = [], []

for x, idx in tqdm(test_dataloader, leave=False):
    with torch.no_grad():
        pred = (model(x.to(device)).cpu().squeeze() > 0.5).int()
    test_predictions.extend(pred.tolist())
    indices.extend(idx.tolist())

                                                   

In [50]:
pred_df = pd.DataFrame({'ID': indices, 'pred': test_predictions})
pred_df.to_csv('prediction.csv', index=False)