In [1]:
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from transformers import logging

# Only show error-level messages from the transformers logger.
logging.set_verbosity_error()
# The tokenizers library reads this setting from the process environment; a
# plain Python variable named TOKENIZERS_PARALLELISM is never seen by it, which
# is why the "Explicitly set the environment variable" warning still appeared.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Read the first 10,000 rows of the CSV and load the pretrained RoBERTa
# tokenizer and sequence-classification model.
dataset = pd.read_csv('./data/Edos.csv', nrows=10000)
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
# num_labels=1 yields a single logit per example; the training cell feeds it to
# BCEWithLogitsLoss, so the targets below must be float 0/1 values.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# Series.map returns NaN for any value missing from the mapping. Fail loudly
# here instead of letting NaN targets flow into the loss.
label_map = {'sexist': 1, 'not sexist': 0}
label_ids = dataset['label_sexist'].map(label_map)
if label_ids.isna().any():
    unexpected = sorted(dataset.loc[label_ids.isna(), 'label_sexist'].astype(str).unique())
    raise ValueError(f"Unmapped values in 'label_sexist': {unexpected}")

text = list(dataset['text'])
labels = torch.tensor(label_ids.values, dtype=torch.float32)

In [3]:
# 50/50 train/test split with a fixed seed so the partition is repeatable.
split = train_test_split(text, labels, train_size=0.5, random_state=42)
train_texts, test_texts, train_labels, test_labels = split

# Both partitions are tokenised with the same options: truncation enabled and
# padding enabled (each call pads within its own set of texts).
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to a
            sequence indexable by example position.
        labels: Sized, indexable container whose elements support
            ``.clone().detach()`` (e.g. a 1-D tensor).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: one entry per encoding field plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Hand out a detached copy so callers never alias the stored labels.
        sample['labels'] = self.labels[idx].clone().detach()
        return sample


# Wrap each partition's encodings and labels so a DataLoader can batch them.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

In [5]:
# Fine-tuning setup: Adam over all model parameters, binary cross-entropy
# computed directly on the single output logit.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

epochs = 5

train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)

# Send every batch to whichever device the model currently lives on, so the
# loop works unchanged whether or not the model has been moved off the CPU.
device = next(model.parameters()).device

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    true_labels_list = []
    predicted_labels_list = []

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        # Batch-local names on purpose: rebinding the notebook-level `labels`
        # here would break a later re-run of the train/test split cell.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # Drop the trailing size-1 dimension so logits line up with the labels.
        logits = outputs.logits.squeeze(-1)

        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Weight the mean batch loss by batch size so the epoch average is
        # a per-example mean even if the last batch is smaller.
        epoch_loss += loss.item() * batch_input_ids.size(0)

        # Hard 0/1 predictions from the pre-update logits, for running accuracy.
        predicted = torch.round(torch.sigmoid(logits.detach()))
        predicted_labels_list.extend(predicted.cpu().numpy())
        true_labels_list.extend(batch_labels.detach().cpu().numpy())

    epoch_loss /= len(train_loader.dataset)
    epoch_accuracy = accuracy_score(true_labels_list, predicted_labels_list)

    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/5: 100%|██████████| 50/50 [09:19<00:00, 11.19s/it]


Epoch [1/5], Loss: 0.5624, Accuracy: 0.7518


Epoch 2/5: 100%|██████████| 50/50 [08:55<00:00, 10.72s/it]


Epoch [2/5], Loss: 0.4297, Accuracy: 0.8062


Epoch 3/5: 100%|██████████| 50/50 [08:43<00:00, 10.47s/it]


Epoch [3/5], Loss: 0.2972, Accuracy: 0.8758


Epoch 4/5: 100%|██████████| 50/50 [08:52<00:00, 10.65s/it]


Epoch [4/5], Loss: 0.2362, Accuracy: 0.9068


Epoch 5/5: 100%|██████████| 50/50 [09:17<00:00, 11.16s/it]

Epoch [5/5], Loss: 0.1702, Accuracy: 0.9374





In [6]:
from sklearn.metrics import f1_score, confusion_matrix

# Evaluate the fine-tuned model on the held-out split and report accuracy,
# F1 and the confusion matrix.
predicted_labels = []
true_labels = []

test_loader = DataLoader(test_dataset, batch_size=50, shuffle=False)

model.eval()
# Feed batches to whichever device the model currently lives on.
device = next(model.parameters()).device

with torch.no_grad():
    for batch in test_loader:
        # Batch-local names: keep the notebook-level `labels` tensor intact so
        # the train/test split cell can still be re-run afterwards.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels']

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # Sigmoid + round turns the single logit into a hard 0/1 prediction.
        predicted = torch.round(torch.sigmoid(outputs.logits)).squeeze(-1).cpu().numpy()
        predicted_labels.extend(predicted)
        true_labels.extend(batch_labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predicted_labels)
# Stored under its own name: assigning the result to `f1_score` would replace
# the imported sklearn function with a float for the rest of the session.
test_f1 = f1_score(true_labels, predicted_labels)

print(f'Accuracy on test set: {accuracy * 100:.2f}%')
print(f'F1 score on test set: {test_f1}')

conf_matrix = confusion_matrix(true_labels, predicted_labels)

print(f'Confusion matrix:\n{conf_matrix}')


Accuracy on test set: 85.84%
F1 score on test set: 0.7177033492822966
Confusion matrix:
[[3392  279]
 [ 429  900]]
