Welcome to this tutorial! Imagine that you are solving the problem of binary text classification on non-aggregated data. How to train a model on such data so as not to lose accuracy? In this notebook, we will solve this problem using the LSTM model and integrate the CoNAL and crowdlayer layers

First of all, let's install and import necessary libraries

In [1]:
%%capture
import pandas as pd
import numpy as np
%pip install crowd-kit
%pip install sentence_transformers
from crowdkit.learning import CoNAL
from crowdkit.learning import CrowdLayer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

It's important to boost our calculous speed so use the GPU

In [2]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


We are going to use non-aggregated data. So, read the train, val and test data from csv-files

In [3]:
train_data = pd.read_csv("fixed_train_crowd_alpha047 (1).csv")
val_data = pd.read_csv("fixed_val_clean_alpha047.csv")
test_data = pd.read_csv("fixed_test.csv")
pd.options.mode.chained_assignment = None

We need to prepare our data: choose the necessary columns and change labels from 'pos'-'neg' to '1'-'0'

In [4]:
train_data = train_data[['INPUT:text','OUTPUT:result','ASSIGNMENT:worker_id']]
train_data['OUTPUT:result'] = train_data['OUTPUT:result'].replace('pos',int(1))
train_data['OUTPUT:result'] = train_data['OUTPUT:result'].replace('neg',int(0))

val_data = val_data[["text", "label"]]
val_data['label'] = val_data['label'].replace('pos',int(1))
val_data['label'] = val_data['label'].replace('neg',int(0))

test_data = test_data[['text', 'label']]
test_data['label'] = test_data['label'].replace('pos',int(1))
test_data['label'] = test_data['label'].replace('neg',int(0))

Since the id_worker field has the string type, we need to convert it to the int type. To do this, we will assign a unique values to all unique numbers in the field

In [5]:
train_id_workers = train_data['ASSIGNMENT:worker_id'].tolist()
unique_id_workers = set(train_data['ASSIGNMENT:worker_id'])
id_workers_mapping = {id_worker: i for i, id_worker in enumerate(unique_id_workers)}

train_id_workers_indices = [id_workers_mapping[id_worker] for id_worker in train_id_workers]

Define a class for the train, val and test dataset

In [6]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, id_workers=None, load_id_workers=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.load_id_workers = load_id_workers
        if load_id_workers:
            self.id_workers = id_workers

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True)
        item = {
            'input_ids': torch.tensor(tokens['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(tokens['attention_mask'], dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }
        if self.load_id_workers:
            item['id_worker'] = torch.tensor(self.id_workers[idx], dtype=torch.long)
        
        return item


Define a tokenizer for the text - we take a ready-made one from Huggin Face and prepare dataloader

In [7]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

#Choose the batch_size
batch_size = 128
max_length = 256

train_dataset = TextDataset(train_data['INPUT:text'].tolist(), train_data['OUTPUT:result'].tolist(), tokenizer, max_length, train_id_workers_indices, load_id_workers=True)
val_dataset = TextDataset(val_data['text'].tolist(), val_data['label'].tolist(), tokenizer, max_length)
test_dataset = TextDataset(test_data['text'].tolist(), test_data['label'].tolist(), tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The most interesting thing! We define our model and integrate our CoNAL layer into it to learn the model with non-aggregated data

In [8]:
# LSTMClassifier
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.conal = CoNAL(2, 1107) # 2 - the number of classes, 1107 - the number of unique workers

    def forward(self, x, attention_mask=None, id_workers=None):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        out = self.fc(output[:, -1, :])
        if id_workers is not None:
            out = self.conal(output[:, -1, :], out, id_workers)
        return out

The standart train pipeline

In [9]:
def train_model(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct_predictions = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        id_workers = batch['id_worker'].to(device)

        logits = model(input_ids, attention_mask, id_workers)
        logits = model(input_ids, attention_mask)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(logits, 1)
        correct_predictions += torch.sum(preds == labels)

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct_predictions.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc

We will also check out error on the validation dataset

In [10]:
def eval_model(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)

            loss = criterion(logits, labels)

            running_loss += loss.item()
            _, preds = torch.max(logits, 1)
            correct_predictions += torch.sum(preds == labels)

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct_predictions.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc


Create an instance of the model, optimizer, and loss function

In [11]:
vocab_size = len(tokenizer.vocab)
embed_dim = 100
hidden_dim = 128
num_layers = 2
num_classes = 2

model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
torch.set_warn_always(False)



Choose the number of the epoches and run the learning process!

In [12]:
num_epochs = 5

best_val_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = eval_model(model, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pt')
        print("Model saved!")

Epoch 1/5
Train Loss: 0.6680 | Train Acc: 0.5984
Val Loss: 0.6984 | Val Acc: 0.5376
Model saved!
Epoch 2/5
Train Loss: 0.6178 | Train Acc: 0.6561
Val Loss: 0.6074 | Val Acc: 0.6932
Model saved!
Epoch 3/5
Train Loss: 0.5215 | Train Acc: 0.7523
Val Loss: 0.5538 | Val Acc: 0.7342
Model saved!
Epoch 4/5
Train Loss: 0.4611 | Train Acc: 0.7947
Val Loss: 0.5203 | Val Acc: 0.7501
Model saved!
Epoch 5/5
Train Loss: 0.4314 | Train Acc: 0.8120
Val Loss: 0.5372 | Val Acc: 0.7405


Let's calculate the final error on the test dataset

In [13]:
best_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
best_model.load_state_dict(torch.load('best_model.pt'))
best_model = best_model.to(device)

test_loss, test_acc = eval_model(best_model, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

Test Loss: 0.5281 | Test Acc: 0.7441


So, we trained our model on non-aggregated data using the CoNAL layer and got a good result.

Now, let's see how the model will deal with crowdlayer. 

In [None]:
# LSTMClassifier
class LSTMClassifier_crowdlayer(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.crowdlaeyr = CrowdLayer(2, 1107, conn_type="mw") #See al optiones here https://toloka.ai/docs/crowd-kit/reference/crowdkit.learning.crowd_layer.CrowdLayer/

    def forward(self, x, attention_mask=None, id_workers=None):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        out = self.fc(output[:, -1, :])
        if id_workers is not None:
            out = self.crowdlaeyr(out, id_workers)
        return out

Create a new model with the same parameters

In [15]:
model_with_crowdlayer = LSTMClassifier_crowdlayer(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
model_with_crowdlayer = model_with_crowdlayer.to(device)
optimizer = optim.Adam(model_with_crowdlayer.parameters(), lr=0.001)

In [16]:
num_epochs = 5

best_val_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model_with_crowdlayer, train_loader, criterion, optimizer, device)
    val_loss, val_acc = eval_model(model_with_crowdlayer, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_with_crowdlayer.state_dict(), 'best_model_with_crowdlayer.pt')
        print("Model saved!")

Epoch 1/5
Train Loss: 0.6686 | Train Acc: 0.5965
Val Loss: 0.6506 | Val Acc: 0.6371
Model saved!
Epoch 2/5
Train Loss: 0.6002 | Train Acc: 0.6808
Val Loss: 0.6259 | Val Acc: 0.6586
Model saved!
Epoch 3/5
Train Loss: 0.5213 | Train Acc: 0.7532
Val Loss: 0.4736 | Val Acc: 0.7839
Model saved!
Epoch 4/5
Train Loss: 0.4532 | Train Acc: 0.8009
Val Loss: 0.4751 | Val Acc: 0.7851
Model saved!
Epoch 5/5
Train Loss: 0.4230 | Train Acc: 0.8163
Val Loss: 0.4560 | Val Acc: 0.7990
Model saved!


Let's check the final error on the test dataset

In [17]:
best_model_with_crowdlayer = LSTMClassifier_crowdlayer(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
best_model_with_crowdlayer.load_state_dict(torch.load('best_model_with_crowdlayer.pt'))
best_model_with_crowdlayer = best_model_with_crowdlayer.to(device)

# Подсчет ошибки и точности на тестовом наборе данных
test_loss, test_acc = eval_model(best_model_with_crowdlayer, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

Test Loss: 0.4524 | Test Acc: 0.8019


In this tutorial, we looked at how to train a model on non-aggregated data without losing accuracy on a test dataset using layers from the Crowd-Kit library