# MULTILABEL CLASSIFICATION

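A multilabel setting is one in which each sample can belong to more than one class at the same time. For example, a single article (described in this dataset by its title and abstract) can be tagged with several topics at once, so the model has to make an independent yes/no decision for every label instead of picking exactly one class.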

In [None]:
# List datasets through the Kaggle command-line client. This command only
# prints a listing; the actual download is done by the
# `kaggle datasets download` command in the following cell.
# NOTE(review): the Kaggle CLI presumably needs API credentials to be
# configured on this machine — confirm before running.
!kaggle datasets list

In [None]:
# Download the `shivanandmn/multilabel-classification-dataset` dataset from
# Kaggle. The following cell expects the result to be the archive
# `multilabel-classification-dataset.zip` in the current working directory.
!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

In [None]:
# Unpack the downloaded archive into the current working directory.
# The standard-library `zipfile` module is used instead of shelling out to
# `tar`: only some `tar` builds (bsdtar) can read .zip archives, GNU tar
# cannot, so the shell command was not portable across machines.
import zipfile

with zipfile.ZipFile('multilabel-classification-dataset.zip') as archive:
    archive.extractall()

In [None]:
# Take a first look at the raw training data: load the CSV into a DataFrame
# and display its first five rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

In [None]:
# Build the full dataset once, then carve a validation set out of it.
import torch
from torch.utils.data import random_split

from dataset import MultiLabelDataset

# Share of the samples used for training; the remainder becomes validation.
TRAIN_FRACTION = 0.9
# Fixed seed so that re-running this cell reproduces the same split. With an
# unseeded split, every re-run reshuffles the samples, and a model kept in
# memory may then be validated on samples it was trained on.
SPLIT_SEED = 42

dataset = MultiLabelDataset(data_path="train.csv", split='train')

# Compute the sizes once; the validation size is the remainder, so the two
# always add up to len(dataset) regardless of rounding.
train_size = int(len(dataset) * TRAIN_FRACTION)
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Create the network. We will start with a simple RNN based model.
# This cell only loads the vocabulary and fixes the model hyper-parameters;
# the model itself is instantiated in the training cell.
import json

from model import SimpleRNNModel

# Word -> index vocabulary. The file is opened in a `with` block so the handle
# is closed deterministically, and decoded explicitly as UTF-8 (the encoding
# JSON is normally stored in) rather than the platform default encoding.
with open('w2i.json', encoding='utf-8') as vocab_file:
    word_2_idx = json.load(vocab_file)

# HYPER PARAMETERS
embedding_dim = 100
hidden_dim = 256
# Number of labels as reported by the dataset; presumably the model emits one
# logit per label — confirm against SimpleRNNModel.
output_dim = dataset.__getnlabels__()
# Index used for padding; it is passed both to the model and to the
# text-to-indices helper in the training cell.
pad_idx = 0
vocab_size = len(word_2_idx)

In [None]:
# TRAIN LOOP (RNN model)
# Trains SimpleRNNModel on the titles of the training split and prints the
# mean per-batch loss after every epoch.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from utils import convert_texts_to_indices

# HYPERPARAMETERS
EPOCHS = 10
BATCH_SIZE = 32
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

# Model
model = SimpleRNNModel(embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim, vocab_size=vocab_size, pad_idx=pad_idx)
# Move the model to the selected device (the GPU when one is available).
model.to(DEVICE)

# Create the dataloaders
# Pinned host memory only speeds up host -> GPU copies; requesting it on a
# CPU-only machine brings no benefit and makes the DataLoader emit a warning,
# so it is enabled only when training on CUDA.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=(DEVICE == 'cuda'))

# Create the optimizer
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Create the loss function
# BCEWithLogitsLoss applies the sigmoid itself and treats every label as an
# independent binary decision, which is what a multilabel problem needs; the
# model output is therefore used as raw logits.
criterion = nn.BCEWithLogitsLoss()

# Put the model in training mode (this does not move it between devices).
model.train()

for epoch in range(EPOCHS):
    train_loss = 0
    # Label each progress bar with its epoch so stacked bars can be told apart.
    for batch in tqdm(train_loader, desc="Epoch {}/{}".format(epoch + 1, EPOCHS)):
        # Every batch unpacks into titles, abstracts and labels; only the
        # titles are fed to the model in this loop.
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles: convert the raw texts to padded index tensors.
        titles_batch = convert_texts_to_indices(texts=titles, word2idx=word_2_idx, pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        optimizer.zero_grad()
        out = model(titles_batch)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses of this epoch.
    print("Training loss: {}".format(round(train_loss/len(train_loader),4)))
    

In [None]:
# Calculate the validation loss and accuracy
# "Accuracy" here is exact-match accuracy: a sample counts as correct only
# when every one of its predicted labels equals the target.

# NOTE: this rebinds the notebook-wide BATCH_SIZE that the training cell set;
# re-run the training cell's hyper-parameter section before training again.
BATCH_SIZE = 2
# Pinned memory is only useful for host -> GPU copies, so enable it on CUDA only.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=(DEVICE == 'cuda'))

# Evaluation mode, and no gradient tracking while scoring.
model.eval()
val_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(val_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles, word2idx=word_2_idx, pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        out = model(titles_batch)
        loss = criterion(out, labels)
        val_loss += loss.item()
        # Turn logits into hard 0/1 predictions: sigmoid, then round.
        preds = torch.round(torch.sigmoid(out))
        # Count exact matches for the whole batch at once instead of looping
        # over samples in Python. The loss above already requires `out` and
        # `labels` to have the same shape, so the element-wise comparison is
        # well defined; flattening everything after the batch dimension keeps
        # the "all labels equal" test valid whatever the per-sample shape is.
        n_samples = labels.size(0)
        exact_match = (preds == labels).reshape(n_samples, -1).all(dim=1)
        correct += int(exact_match.sum().item())
        total += n_samples

# The loss is the mean of the per-batch losses; accuracy is per sample.
print("Validation loss: {}".format(round(val_loss/len(val_loader),4)))
print("Validation accuracy: {}".format(round(correct/total,4)))

In [None]:
# Second model: a pretrained DistilBERT with a sequence-classification head
# configured for multilabel classification.
from transformers import AutoModelForSequenceClassification

target_labels = dataset.labels
# Two-way mapping between label positions and label names.
id2label = dict(enumerate(target_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(target_labels),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# TRAIN LOOP (DistilBERT model)
# Fine-tunes only the classification head of the pretrained model on the
# titles of the training split and prints the mean per-batch loss per epoch.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from utils import convert_texts_to_indices_bert

# HYPERPARAMETERS
EPOCHS = 10
BATCH_SIZE = 32
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

## FREEZE THE WHOLE NETWORK ASIDE THE CLASSIFICATION LAYER
# Every parameter whose name does not contain 'classifier' is frozen. Because
# this is a substring test, names such as `pre_classifier.*` (if the
# architecture has them) stay trainable as well.
for name, param in model.named_parameters():
    print(name)
    if 'classifier' not in name:
        param.requires_grad = False

# Collect the trainable parameters once: they are counted for the report
# below and are the only ones handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
print("Model parameters: {}".format(sum(p.numel() for p in trainable_params)))

# Move the model to the selected device (the GPU when one is available).
model.to(DEVICE)

# Create the dataloaders
# Pinned host memory only speeds up host -> GPU copies; requesting it on a
# CPU-only machine brings no benefit and makes the DataLoader emit a warning,
# so it is enabled only when training on CUDA.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=(DEVICE == 'cuda'))

# Create the optimizer
# Only the trainable parameters are registered; the frozen ones never receive
# gradients, so passing them to the optimizer would serve no purpose.
optimizer = optim.Adam(trainable_params, lr=LEARNING_RATE)

# Create the loss function
# NOTE: `criterion` is not used by the loop below — the loss is taken from the
# model output (`out.loss`), which the model computes from `labels`. The name
# is kept defined in case another cell relies on it.
criterion = nn.BCEWithLogitsLoss()

# Put the model in training mode (this does not move it between devices).
model.train()

for epoch in range(EPOCHS):
    train_loss = 0
    # Label each progress bar with its epoch so stacked bars can be told apart.
    for batch in tqdm(train_loader, desc="Epoch {}/{}".format(epoch + 1, EPOCHS)):
        # Every batch unpacks into titles, abstracts and labels; only the
        # titles are fed to the model in this loop.
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles: the helper returns a mapping that provides
        # 'input_ids' and 'attention_mask' tensors.
        titles_batch = convert_texts_to_indices_bert(texts=titles, max_len=512)
        input_ids = titles_batch['input_ids'].to(DEVICE)
        attention_mask = titles_batch['attention_mask'].to(DEVICE)
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss with its output.
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses of this epoch.
    print("Training loss: {}".format(round(train_loss/len(train_loader),4)))