In [1]:
import os
import torch

import pandas as pd
import numpy as np

import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW

In [2]:
# Folder holding the competition CSV files; resolved against the current working
# directory, so the notebook must be launched from the folder that contains `kaggle_data`.
data_dir = os.path.join(os.getcwd(), 'kaggle_data')

In [3]:
# Seed PyTorch's global RNG so that runs are repeatable.
torch.manual_seed(0)
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device used : {device}')

Device used : cpu


## **Dataset**

In [4]:
class TransformerDataset(Dataset):
    """
        Text dataset that tokenizes one row at a time for a transformer model.

        Texts are read from `{mode}_x.csv` (first CSV column used as the index, text taken
        from the first remaining column, missing values replaced by ""). For the 'train'
        and 'val' modes the metadata frame `{mode}_y.csv` is kept in `self.label` and the
        target is its second-to-last column.

        arguments:
            data_dir [str]: folder containing the CSV files
            mode [str]: one of 'train', 'val', 'test'
            tokenizer_name [str]: name passed to `AutoTokenizer.from_pretrained`
            max_length [int]: every text is truncated / padded to this many tokens
    """

    def __init__(self, data_dir, mode, tokenizer_name='bert-base-uncased', max_length=128):
        super(TransformerDataset, self).__init__()
        assert mode in ['train', 'val', 'test']
        self.mode = mode

        self.data = pd.read_csv(os.path.join(data_dir, f'{mode}_x.csv'), index_col=0)
        self.data.fillna("", inplace=True)

        if self.mode != 'test':
            self.label = pd.read_csv(os.path.join(data_dir, f'{mode}_y.csv'))
            # Snapshot the target column once. `self.label` is handed to the metric code,
            # which may append columns to it; looking the target up by position (-2) on
            # every access would then silently start returning a different column.
            self.targets = self.label.iloc[:, -2].copy()

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """
            Tokenize the text at position `idx`.

            returns:
                test mode:  (input_ids, attention_mask, idx)
                otherwise:  (input_ids, attention_mask, y, idx), y being a float tensor of shape (1,)
        """
        text = self.data.iloc[idx, 0]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'. A bare squeeze()
        # would also remove the sequence axis when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        if self.mode == 'test':
            return input_ids, attention_mask, idx

        y = torch.tensor([self.targets.iloc[idx]]).float()
        return input_ids, attention_mask, y, idx

## **Model**

In [5]:
class TransformerClassifier(nn.Module):
    """
        Pretrained transformer encoder followed by dropout and a one-unit linear head.

        arguments:
            model_name [str]: name passed to `AutoModel.from_pretrained`
            hidden_dim [int]: input width of the linear head; must match the encoder's
                hidden size (768 for the default model name)
    """

    def __init__(self, model_name='bert-base-uncased', hidden_dim=768):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, input_ids, attention_mask):
        """
            Return one value in (0, 1) per sample, shape (batch, 1).

            Only the hidden state at sequence position 0 is fed to the head
            (presumably the [CLS] token for BERT-style tokenizers).
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        score = self.fc(self.dropout(first_token_state))
        return torch.sigmoid(score)

## **Utils**

In [6]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.
        A prediction counts as positive when it is strictly greater than 0.5.

        arguments:
            prediction [pandas.DataFrame]: predictions in a `pred` column. The row labels of `y`
                they belong to are taken from an `index` column when present, otherwise from
                the dataframe index.
            y [pandas.DataFrame]: dataframe containing the metadata (group columns and `y`);
                it is left unmodified.
        returns:
            wga [float]: worst group accuracy over the non-empty groups (NaN if every group is empty)
    """
    # Align on the sample ids, not on the arrival order of the predictions: with a shuffled
    # loader the default RangeIndex of `prediction` has nothing to do with the rows of `y`.
    if 'index' in prediction.columns:
        prediction = prediction.set_index('index')

    # Work on a copy so that the caller's frame does not silently gain a `pred` column.
    scored = y.copy()
    scored.loc[prediction.index, 'pred'] = prediction['pred']

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            group = scored.loc[scored[category] == label]
            if group.empty:
                # The mean over an empty group is NaN and would turn the whole metric into NaN.
                continue
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            accuracies.append(group_accuracy)

    if not accuracies:
        return np.nan
    wga = np.min(accuracies)
    return wga

## **Training**

In [7]:
def train_model(model, optimizer, criterion, dataloader):
    """
        Run one training epoch and report the loss and worst group accuracy over that epoch.

        Relies on the module-level `device`, `tqdm` and `worst_group_accuracy`.

        arguments:
            model [torch.nn.Module]: called as model(input_ids, attention_mask), one output per sample
            optimizer [torch.optim.Optimizer]: optimizer over the model parameters
            criterion [callable]: loss taking (prediction, target), both flattened to shape (batch,)
            dataloader [DataLoader]: yields (input_ids, attention_mask, y, idx) batches; its
                `dataset.label` dataframe provides the metadata for the metric
        returns:
            dataset_loss [float]: mean of the batch losses, weighted by batch size
            dataset_metric [float]: worst group accuracy of the predictions made during the epoch
    """
    model.train()
    losses, predictions, indices = [], [], []
    for input_ids, attention_mask, y, idx in tqdm(dataloader, leave=False):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        # Flatten to (batch,) explicitly. squeeze() would produce a 0-dim tensor for a
        # batch of one sample, whose tolist() is a bare float that extend() cannot consume.
        pred = model(input_ids, attention_mask).reshape(-1)
        loss = criterion(pred, y.reshape(-1))
        loss.backward()
        optimizer.step()

        losses.extend([loss.item()] * len(y))
        predictions.extend(pred.detach().cpu().tolist())
        indices.extend(idx.tolist())

    # Index the predictions by sample id so the metric aligns them with the right label
    # rows even though a shuffled loader delivers the samples in arbitrary order.
    pred_df = pd.DataFrame({'pred': predictions}, index=pd.Index(indices, name='index'))
    dataset_loss = np.mean(losses)
    # Hand over a copy: the dataset reads its targets from this frame, so it must not be
    # altered by the metric computation.
    dataset_metric = worst_group_accuracy(pred_df, y=dataloader.dataset.label.copy())
    return dataset_loss, dataset_metric

In [None]:
# Hyper-parameters of the fine-tuning run.
batch_size = 32
# Every weight of the pretrained encoder is being updated, so the step size has to be
# small: the usual range for fine-tuning BERT is 2e-5 to 5e-5. A value such as 1e-3
# typically destroys the pretrained weights and collapses the output to a constant.
learning_rate = 2e-5
epochs = 2
# Single source of truth so that tokenizer and encoder always come from the same checkpoint.
model_name = 'bert-base-uncased'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = TransformerDataset(data_dir, 'train', tokenizer_name=model_name)
val_dataset = TransformerDataset(data_dir, 'val', tokenizer_name=model_name)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model = TransformerClassifier(model_name=model_name).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# The model already applies a sigmoid, hence BCELoss on probabilities rather than a logit-based loss.
criterion = nn.BCELoss()

for epoch in range(epochs):
    train_loss, train_metric = train_model(model, optimizer, criterion, train_dataloader)
    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Metric: {train_metric:.4f}")