# Learning ML model on non-aggregated data

Welcome to this tutorial!

We all know that data is required to train the model, and the quality of this data greatly affects the final result. Training a model on non-aggregated data usually leads to a loss in model quality, so the Toloka team has implemented special layers for sublimating such data. In this notebook, we will solve this problem using the LSTM model and integrate the CoNAL and CrowdLayer layers from Crowd-Kit library!

## Libraries importing

First of all, let's install and import necessary libraries

In [3]:
%%capture
%pip install crowd-kit
%pip install sentence_transformers
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from crowdkit.learning import CoNAL
from crowdkit.learning import CrowdLayer
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

## Boosting with cuda

It's important to boost our calculation speed so use the GPU

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Data preparation

We are going to use non-aggregated data. So, read the *train*, *val* and *test* data from csv-files

In [5]:
train_data = pd.read_csv("fixed_train_crowd_alpha047 (1).csv")
val_data = pd.read_csv("fixed_val_clean_alpha047.csv")
test_data = pd.read_csv("fixed_test.csv")
pd.options.mode.chained_assignment = None

Preparing our data: choose the necessary columns and change labels from '*pos*'-'*neg*' to '*1*'-'0'

In [6]:
train_data = train_data[['INPUT:text', 'OUTPUT:result', 'ASSIGNMENT:worker_id']]
train_data['OUTPUT:result'] = train_data['OUTPUT:result'].replace('pos', int(1))
train_data['OUTPUT:result'] = train_data['OUTPUT:result'].replace('neg', int(0))

val_data = val_data[["text", "label"]]
val_data['label'] = val_data['label'].replace('pos', int(1))
val_data['label'] = val_data['label'].replace('neg', int(0))

test_data = test_data[['text', 'label']]
test_data['label'] = test_data['label'].replace('pos', int(1))
test_data['label'] = test_data['label'].replace('neg', int(0))

Since the **id_worker** field has the string type, we need to convert it to the int type. To do this, we will assign a unique string values to all unique int values in the field

In [35]:
train_id_workers = train_data['ASSIGNMENT:worker_id']
train_id_workers_indices, unique_ids = pd.factorize(train_id_workers)

Define a **class** for the *train*, *val* and *test* dataset

In [21]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, id_workers=None, load_id_workers=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.load_id_workers = load_id_workers
        if load_id_workers:
            self.id_workers = id_workers

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True)
        item = {
            'input_ids': torch.tensor(tokens['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(tokens['attention_mask'], dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }
        if self.load_id_workers:
            item['id_worker'] = torch.tensor(self.id_workers[idx], dtype=torch.long)
        
        return item

Define a **tokenizer** for the text - we take a ready-made one from Hugging Face and prepare dataloader

In [36]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Choose the batch_size
batch_size = 128
max_length = 256

train_dataset = TextDataset(train_data['INPUT:text'], train_data['OUTPUT:result'], tokenizer, max_length, train_id_workers_indices, load_id_workers=True)
val_dataset = TextDataset(val_data['text'], val_data['label'], tokenizer, max_length)
test_dataset = TextDataset(test_data['text'], test_data['label'], tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

## The model with CoNAL layer

The most interesting thing! We define our *model* and integrate our **CoNAL** layer into it to learn the model with non-aggregated data. 

*Common Noise Adaptation Layers* (CoNAL) introduces two types of confusions: worker-specific and global. Each is parameterized by a confusion matrix. The ratio of the two confusions is determined by the *common noise adaptation layer*. The *common noise adaptation layer* is a trainable function that takes the instance embedding and the worker ID as input and outputs a scalar value between 0 and 1.

In [23]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.conal = CoNAL(2, 1107)  # 2 - the number of classes, 1107 - the number of unique workers

    def forward(self, x, attention_mask=None, id_workers=None):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        out = self.fc(output[:, -1, :])
        if id_workers is not None:
            out = self.conal(output[:, -1, :], out, id_workers)
        return out

The standart train pipeline

In [24]:
def train_model(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct_predictions = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        id_workers = batch['id_worker'].to(device)

        logits = model(input_ids, attention_mask, id_workers)
        logits = model(input_ids, attention_mask)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(logits, 1)
        correct_predictions += torch.sum(preds == labels)

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct_predictions.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc

We will also check out *loss* on the validation dataset

In [25]:
def eval_model(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)

            loss = criterion(logits, labels)

            running_loss += loss.item()
            _, preds = torch.max(logits, 1)
            correct_predictions += torch.sum(preds == labels)

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct_predictions.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc

Create an instance of the model, optimizer, and loss-function

In [None]:
vocab_size = len(tokenizer.vocab)
embed_dim = 100
hidden_dim = 128
num_layers = 2
num_classes = 2

model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
torch.set_warn_always(False)

Choose the *number of the epoches* and run the learning process!

In [None]:
num_epochs = 5

best_val_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = eval_model(model, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pt')
        print("Model saved!")

Let's calculate the **final loss** on the test dataset

In [28]:
best_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
best_model.load_state_dict(torch.load('best_model.pt'))
best_model = best_model.to(device)

test_loss, test_acc = eval_model(best_model, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

Test Loss: 0.5155 | Test Acc: 0.7681


So, we trained our model on non-aggregated data using the **CoNAL** layer and got a good result.

## The model with crowdlayer

Now, let's see how the model will deal with **crowdlayer**. 

It applies a worker-specific transformation of the logits. There are four types of transformations:

- **MW**: Multiplication on the worker's confusion matrix.
- **VW**: Element-wise multiplication with the worker's weight vector.
- **VB**: Element-wise addition with the worker's bias vector.
- **VW** + b: Combination of VW and VB: VW * logits + b.

In [29]:
class LSTMClassifier_crowdlayer(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.crowdlaeyr = CrowdLayer(2, 1107, conn_type="mw")

    def forward(self, x, attention_mask=None, id_workers=None):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        out = self.fc(output[:, -1, :])
        if id_workers is not None:
            out = self.crowdlaeyr(out, id_workers)
        return out

Create a *new* model with the same parameters

In [30]:
model_with_crowdlayer = LSTMClassifier_crowdlayer(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
model_with_crowdlayer = model_with_crowdlayer.to(device)
optimizer = optim.Adam(model_with_crowdlayer.parameters(), lr=0.001)

Use the same *number of epochs* so that both models are in equal conditions

In [None]:
num_epochs = 5

best_val_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model_with_crowdlayer, train_loader, criterion, optimizer, device)
    val_loss, val_acc = eval_model(model_with_crowdlayer, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_with_crowdlayer.state_dict(), 'best_model_with_crowdlayer.pt')
        print("Model saved!")

Let's check the **final loss** on the test dataset

In [32]:
best_model_with_crowdlayer = LSTMClassifier_crowdlayer(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)
best_model_with_crowdlayer.load_state_dict(torch.load('best_model_with_crowdlayer.pt'))
best_model_with_crowdlayer = best_model_with_crowdlayer.to(device)

test_loss, test_acc = eval_model(best_model_with_crowdlayer, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

Test Loss: 0.4677 | Test Acc: 0.7817


## Summary

In this tutorial, we looked at how to train a model on non-aggregated data without losing accuracy on a test dataset using layers from the Crowd-Kit library