## Baseline
This notebook implements a baseline model, which shows you how to handle the data and to provide a first very simple solution to the problem. You may re-use and modify any part of this notebook.

In [1]:
import os
import csv
import torch
import pickle
import re
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from ipywidgets import FloatProgress
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchmetrics import AUROC, F1Score
from transformers import BertTokenizer, BertModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

In [2]:
# Directory holding the competition CSV files, resolved relative to the current working directory.
data_dir = os.path.join(os.getcwd(), 'kaggle_data')

In [3]:
# Seed PyTorch's global random number generator so weight initialisation and shuffling are repeatable.
torch.manual_seed(0)

<torch._C.Generator at 0x1c8db8230f0>

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device used : {device}')

Device used : cuda


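We first define the dataset class, which takes as input the path to the data and the mode (`train`, `val`, or `test`). It loads and cleans the comments and applies the supplied vectorizer to each one. The vectorizer used here is a pretrained BERT encoder, so nothing is fitted on the training set; the same vectorizer is reused for the validation and test sets.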

In [5]:
def clean(comment):
    """Normalise a raw comment.

    The text is lower-cased, then every newline and carriage-return
    character is replaced by a single space.

    arguments:
        comment [str]: raw comment text
    returns:
        cleaned [str]: normalised comment text
    """
    cleaned = comment.lower()
    # Each pattern is a regex escape matching one line-break character.
    for pattern in ("\\n", "\\r"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned

In [6]:
class BertVectorizer:
    """Turn raw texts into fixed-size sentence embeddings with a frozen BERT encoder.

    arguments:
        tokenizer: callable tokenizer returning a mapping of tensors
            (called with `padding`, `truncation`, `return_tensors`, `max_length`)
        bert_model [torch.nn.Module]: encoder whose output exposes `last_hidden_state`
    """

    def __init__(self, tokenizer, bert_model):
        self.tokenizer = tokenizer
        self.bert_model = bert_model

    def transform(self, texts):
        """Embed a list of texts.

        arguments:
            texts [list[str]]: texts to embed
        returns:
            embeddings [torch.Tensor]: CPU tensor of shape (len(texts), hidden_size)
                holding the output at the first ([CLS]) position of each text
        """
        # Tokenize and pad the text
        tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)

        # Run the inputs on whatever device the encoder lives on, so the
        # vectorizer keeps working if the encoder is moved to a GPU.
        first_param = next(self.bert_model.parameters(), None)
        if first_param is not None:
            tokens = {key: value.to(first_param.device) for key, value in tokens.items()}

        with torch.no_grad():
            outputs = self.bert_model(**tokens)

        # Use the [CLS] token's output as the sentence embedding; hand it back
        # on the CPU so DataLoader collation behaves the same on every device.
        embeddings = outputs.last_hidden_state[:, 0, :].cpu()
        return embeddings

In [10]:
class BaselineDataset(Dataset):
    """Dataset of comments read from `{mode}_x.csv` with labels from `{mode}_y.csv`.

    arguments:
        data_dir [str]: directory containing the CSV files
        mode [str]: one of 'train', 'val' or 'test' (no labels are loaded for 'test')
        vectorizer: object exposing `transform(list[str])`, used to turn each
            comment into a feature tensor
    """

    def __init__(self, data_dir, mode, vectorizer=None):
        super(BaselineDataset, self).__init__()
        assert mode in ['train', 'val', 'test']
        self.mode = mode

        # load the data
        self.data = pd.read_csv(os.path.join(data_dir, f'{mode}_x.csv'), index_col=0)

        # Handle NaN values
        self.data.fillna("", inplace=True)

        # Clean data
        self.data["string"] = self.data["string"].apply(clean)

        # load the labels if not the test set
        if self.mode != 'test':
            self.label = pd.read_csv(os.path.join(data_dir, f'{mode}_y.csv'))
            # Remember the target column by NAME at load time. The metric code
            # may append columns (e.g. 'pred') to this frame later, which would
            # silently shift a positional lookup such as iloc[idx, -2].
            self.label_col = self.label.columns[-2]

        # vectorizer applied lazily to each comment in __getitem__
        self.vectorizer = vectorizer

    def __len__(self):
        """Return the number of comments."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(x, idx)` for the test set, `(x, y, idx)` otherwise.

        `x` is the vectorizer output for the single comment at position `idx`
        and `y` is a one-element tensor holding its label.
        """
        x = self.data.iloc[idx, 0]
        x = self.vectorizer.transform([x])
        if self.mode == 'test':
            return x, idx
        y = torch.tensor([self.label[self.label_col].iloc[idx]])
        return x, y, idx

In [11]:
# Pretrained checkpoint used for both the tokenizer and the encoder.
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
bert_model = BertModel.from_pretrained(BERT_MODEL)
# Wrap tokenizer + encoder so the datasets can turn raw text into [CLS] embeddings.
vectorizer = BertVectorizer(tokenizer, bert_model)
train_dataset = BaselineDataset(data_dir, 'train', vectorizer)
# The validation set reuses the vectorizer held by the training dataset.
val_dataset = BaselineDataset(data_dir, 'val', train_dataset.vectorizer)

In [12]:
# train_dataset[0][0] is the vectorizer output for the first comment (one row per text),
# so the length of its first row is the feature dimension. With BertVectorizer this is the
# embedding size rather than a vocabulary size, despite the wording of the message.
print(f"Number of comments: {len(train_dataset)}\nNumber of de words in the vocabulary (number of features) : {len(train_dataset[0][0][0])}")

Number of comments: 269038
Number of de words in the vocabulary (number of features) : 768


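With the original CountVectorizer baseline, the training set became a 269038 x 126364 matrix: \
each row represents a tweet and each column corresponds to a word in the vocabulary, \
and the value at position (i,j) indicates how many times word j appears in tweet i. \
With the BertVectorizer used here, each comment is instead represented by a 768-dimensional [CLS] embedding.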

In [26]:
# Shuffle only the training data; validation order is kept fixed.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False)

We will define several models: a simple MLP baseline, an improved MLP with dropout, a recurrent (bidirectional LSTM) classifier, a BERT-based classifier, and a model that generates random predictions to use as a comparison.

In [14]:
class BaselineClassifier(nn.Module):
    """Single-hidden-layer MLP producing one probability per input row.

    arguments:
        input_size [int]: number of input features
        hidden_dim [int]: width of the hidden layer
    """

    def __init__(self, input_size, hidden_dim):
        super(BaselineClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return sigmoid probabilities with a trailing dimension of size 1."""
        x = F.relu(self.fc1(x))
        # torch.sigmoid replaces the deprecated F.sigmoid and matches the other classifiers.
        x = torch.sigmoid(self.fc2(x))
        return x

In [15]:
class ImprovedClassifier(nn.Module):
    """Two-hidden-layer MLP with dropout, producing one probability per input row.

    arguments:
        input_size [int]: number of input features
        hidden_dim [int]: width of the first hidden layer (the second is half as wide)
    """

    def __init__(self, input_size, hidden_dim):
        super().__init__()
        half_dim = hidden_dim // 2
        self.fc1 = nn.Linear(input_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, half_dim)
        self.fc3 = nn.Linear(half_dim, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Apply (linear -> ReLU -> dropout) twice, then a sigmoid output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(F.relu(layer(hidden)))
        return torch.sigmoid(self.fc3(hidden))

In [16]:
class RNNClassifier(nn.Module):
    """Bidirectional LSTM followed by a small MLP head, producing one probability per sequence.

    arguments:
        input_size [int]: number of features per time step
        hidden_dim [int]: LSTM hidden size per direction
        num_layers [int]: number of stacked LSTM layers
        dropout_prob [float]: dropout between LSTM layers (when num_layers > 1) and in the head
    """

    def __init__(self, input_size, hidden_dim, num_layers, dropout_prob=0.3):
        super(RNNClassifier, self).__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_prob if num_layers > 1 else 0.0,
            bidirectional=True
        )

        self.layer_norm = nn.LayerNorm(hidden_dim * 2)  # Normalize LSTM output

        # Fully connected layers
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

        self.dropout = nn.Dropout(dropout_prob)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise the LSTM weight matrices and both linear layers."""
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x, lengths=None):
        """Classify a batch of sequences.

        arguments:
            x [torch.Tensor]: batch of shape (batch, seq_len, input_size)
            lengths [list[int] | torch.Tensor | None]: true length of each padded
                sequence; when given, padding is ignored
        returns:
            probabilities [torch.Tensor]: tensor of shape (batch, 1)
        """
        if lengths is not None:
            # pack_padded_sequence requires lengths as a CPU int64 tensor (or a list).
            lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        _, (h_n, _) = self.lstm(x)

        # Final hidden states of the last layer: h_n[-2] is the forward direction,
        # h_n[-1] the backward one. Unlike indexing the padded output at t = -1,
        # this ignores padding and gives the backward state after the whole sequence.
        final_state = torch.cat((h_n[-2], h_n[-1]), dim=1)
        final_state = self.layer_norm(final_state)

        # Fully connected layers
        x = F.relu(self.fc1(final_state))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))  # Binary classification output

        return x

In [17]:
class BERTClassifier(nn.Module):
    """BERT encoder with a dropout + linear classification head.

    arguments:
        bert_model_name [str]: name or path passed to `BertModel.from_pretrained`
        num_classes [int]: number of output logits
    """

    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    # NOTE: `forward` must be indented inside the class. At module level the
    # class has no forward method and calling the model raises NotImplementedError.
    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape (batch, num_classes); no sigmoid/softmax is applied."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [18]:
class RandomClassifier(nn.Module):
    """Parameter-free comparison model that ignores its input and outputs random scores."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return one uniform random value in [0, 1) per element of the batch `x`."""
        batch_size = len(x)
        return torch.rand(batch_size)

Let's check the performance of the random classifier on the validation set.

In [19]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy. For each column in ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] two groups are formed:
        the rows where the column equals 0 and the rows where it equals 1. A row is counted
        as correct when its 'y' value equals (pred > 0.5); rows without a prediction count
        as predicted negative. Empty groups are skipped.
        arguments:
            prediction [pandas.DataFrame]: dataframe with a 'pred' column and, optionally,
                an 'index' column giving the positional row of `y` each prediction belongs
                to. Without an 'index' column the dataframe's own index is used as row labels.
            y [pandas.DataFrame]: dataframe containing the metadata (left unmodified)
        returns:
            wga [float]: worst group accuracy (NaN when every group is empty)
    """
    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Match each prediction to its metadata row. The 'index' column carries the dataset
    # position of every sample, which is what makes shuffled loaders line up correctly;
    # the dataframe's own index is just the order in which batches were collected.
    if 'index' in prediction.columns:
        row_labels = y.index[prediction['index'].to_numpy(dtype=np.int64)]
    else:
        row_labels = prediction.index

    # Keep predictions in a separate Series so the caller's dataframe is not mutated
    # (adding a column to it would shift any positional column lookups done elsewhere).
    pred = pd.Series(np.nan, index=y.index, dtype=float)
    pred.loc[row_labels] = prediction['pred'].to_numpy()

    correct = y['y'] == (pred > 0.5)

    accuracies = []
    for category in categories:
        for label in [0, 1]:
            in_group = y[category] == label
            # An empty group has no accuracy; skipping it avoids a NaN minimum.
            if in_group.any():
                accuracies.append(correct[in_group].mean())

    if not accuracies:
        return np.nan
    wga = np.min(accuracies)
    return wga

This metric looks at the accuracy within each group (male, female, christian, etc.) and keeps the lowest one. \
We want to minimize the loss and maximize the WGA.

In [20]:
@torch.no_grad()
def evaluate_model(model, dataloader, criterion):
    """
        Evaluate the model on a given dataloader. Batches are moved to the
        module-level `device` before the forward pass.
        argument:
            model [torch.nn.Module]: model to evaluate
            dataloader [torch.utils.data.DataLoader]: dataloader on which to evaluate;
                must yield (x, y, idx) and expose `dataset.label`
            criterion [torch.nn.modules.loss]: desired loss to compute
        returns:
            dataset_loss [float]: computed loss on the dataset
            dataset_metric [float]: computed metric on the dataset
    """
    model.eval()
    losses, predictions, indices = [], [], []
    for x, y, idx in tqdm(dataloader, leave=False):
        # reshape(-1) always yields a 1-D tensor. squeeze() would collapse a final
        # batch of size 1 to a scalar, whose .tolist() is a float and breaks extend().
        pred = model(x.to(device)).to(device).reshape(-1)
        target = y.to(device).reshape(-1).float()
        loss = criterion(pred, target)
        # Weight each batch loss by its size so the mean is per sample.
        losses.extend([loss.item()] * len(y))
        predictions.extend(pred.tolist())
        indices.extend(idx.tolist())

    # Key the predictions by sample index so they are matched to the right label rows.
    pred_df = pd.DataFrame({'index': indices, 'pred': predictions}, index=indices)
    dataset_loss = np.mean(losses)
    dataset_metric = worst_group_accuracy(pred_df, dataloader.dataset.label)
    return dataset_loss, dataset_metric

Now let's define the training loop and train a classifier (the RNN classifier in the cells below).

In [21]:
def train_model(model, optimizer, criterion, dataloader):
    """
        Train a model for one epoch. Batches are moved to the module-level
        `device` before the forward pass.
        arguments:
            model [torch.nn.Module]: model to train
            optimizer [torch.optim]: optimizer used for training
            criterion [torch.nn.modules.loss]: desired loss to compute
            dataloader [torch.utils.data.DataLoader]: dataloader used for training;
                must yield (x, y, idx) and expose `dataset.label`
        returns:
            dataset_loss [float]: computed loss on the dataset
            dataset_metric [float]: computed metric on the dataset
    """
    model.train()
    losses, predictions, indices = [], [], []
    for x, y, idx in tqdm(dataloader, leave=False):
        optimizer.zero_grad()
        # reshape(-1) always yields a 1-D tensor. squeeze() would collapse a final
        # batch of size 1 to a scalar, whose .tolist() is a float and breaks extend().
        pred = model(x.to(device)).to(device).reshape(-1)
        target = y.to(device).reshape(-1).float()
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the mean is per sample.
        losses.extend([loss.item()] * len(y))
        predictions.extend(pred.detach().tolist())
        indices.extend(idx.tolist())

    # The loader is shuffled, so the position of a prediction in the list is not
    # its sample index. Key the frame by sample index so the metric matches each
    # prediction to the right label row.
    pred_df = pd.DataFrame({'index': indices, 'pred': predictions}, index=indices)
    dataset_loss = np.mean(losses)
    dataset_metric = worst_group_accuracy(pred_df, y=dataloader.dataset.label)
    return dataset_loss, dataset_metric

In [22]:
# Feature dimension produced by the vectorizer. BertVectorizer has no
# get_feature_names_out(), so read the hidden size of the wrapped encoder instead.
train_dataset.vectorizer.bert_model.config.hidden_size

In [27]:
# Hyperparameters for the training run.
LR = 1e-3
epochs = 1
# Take the feature size from the BERT encoder instead of hard-coding it
# (768 for bert-base-uncased).
input_size = bert_model.config.hidden_size
# Alternative MLP model:
# model = ImprovedClassifier(input_size=input_size, hidden_dim=128).to(device)
model = RNNClassifier(input_size=input_size, hidden_dim=128, num_layers=2).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LR)
# BCELoss expects probabilities, which matches the sigmoid output of the classifier.
criterion = nn.BCELoss()

In [28]:
# One pass over the training data per epoch, followed by an evaluation on the validation set.
for epoch in range(epochs):
    train_losses, train_metrics = train_model(model, optimizer, criterion, train_dataloader)
    mlp_val_loss, mlp_val_metric = evaluate_model(model, val_dataloader, criterion)
    # Only the validation loss and worst-group accuracy are reported.
    print(f'Classifier validation loss {mlp_val_loss:.4f} WGA {mlp_val_metric:.4f}')

                                                  

KeyboardInterrupt: 

Once we are happy with our results, we want to make a prediction on the test set. Your submission `.csv` file should contain 2 columns:
- ID: with the id of each prediction (do not shuffle to not mix things up)
- pred: the prediction of the model (thresholded or not)

In [49]:
# model = RandomClassifier()  # uncomment to produce a random-baseline submission
test_dataset = BaselineDataset(data_dir, 'test', train_dataset.vectorizer)
# No shuffling: predictions must stay in the original order of the test set.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
model.eval()
test_predictions, indices = [], []

with torch.no_grad():
    for x, idx in tqdm(test_dataloader, leave=False):
        # reshape(-1) keeps a 1-D tensor even when the final batch holds a single
        # sample; squeeze() would give a scalar whose .tolist() cannot be extended.
        pred = (model(x.to(device)).cpu().reshape(-1) > 0.5).int()
        test_predictions.extend(pred.tolist())
        indices.extend(idx.tolist())

                                                   

In [50]:
# Build the submission table (one row per test sample) and write it without the pandas index.
pred_df = pd.DataFrame({'ID': indices, 'pred': test_predictions})
pred_df.to_csv('prediction.csv', index=False)