# MULTILABEL CLASSIFICATION

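A multilabel setting is one in which each sample can belong to more than one class at the same time. For example, in the dataset used below a research paper can be tagged as both *Computer Science* and *Statistics*.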

In [1]:
# List the datasets available on Kaggle (this only prints a listing; nothing is downloaded here)
!kaggle datasets list

ref                                                             title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
rahulvyasm/netflix-movies-and-tv-shows                          Netflix Movies and TV Shows                          1MB  2024-04-10 09:48:38          14187        300  1.0              
kapturovalexander/time-series-for-online-store                  🏪🏬🪫 Electronic store sales data                      9MB  2024-04-30 09:33:41            805         26  1.0              
sahirmaharajj/school-student-daily-attendance                   School Student Daily Attendance                      2MB  2024-04-29 19:29:56           2262         47  1.0              
jaidalmotra/pokemon-dataset                                     P

In [2]:
# Download the multilabel classification dataset as a .zip archive into the current working directory
!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

Dataset URL: https://www.kaggle.com/datasets/shivanandmn/multilabel-classification-dataset
License(s): other
Downloading multilabel-classification-dataset.zip to c:\Users\Riccardo\Desktop\Dottorato\Ricerca\Terzo anno\Recognition systems - tutorato\Laboratori\Recognition-systems-labs\Lab_3




  0%|          | 0.00/11.4M [00:00<?, ?B/s]
  9%|▊         | 1.00M/11.4M [00:00<00:04, 2.43MB/s]
 17%|█▋        | 2.00M/11.4M [00:00<00:02, 3.70MB/s]
 26%|██▌       | 3.00M/11.4M [00:00<00:01, 4.59MB/s]
 35%|███▍      | 4.00M/11.4M [00:00<00:01, 5.48MB/s]
 44%|████▎     | 5.00M/11.4M [00:01<00:01, 6.22MB/s]
 52%|█████▏    | 6.00M/11.4M [00:01<00:00, 6.56MB/s]
 61%|██████    | 7.00M/11.4M [00:01<00:00, 7.07MB/s]
 70%|██████▉   | 8.00M/11.4M [00:01<00:00, 7.15MB/s]
 79%|███████▊  | 9.00M/11.4M [00:01<00:00, 7.61MB/s]
 87%|████████▋ | 10.0M/11.4M [00:01<00:00, 7.69MB/s]
 96%|█████████▌| 11.0M/11.4M [00:01<00:00, 7.89MB/s]
100%|██████████| 11.4M/11.4M [00:01<00:00, 6.38MB/s]


In [3]:
# Extract the downloaded archive into the current working directory.
# The standard-library `zipfile` module is used instead of `!tar -xf` because only
# bsdtar (shipped with Windows 10+ and macOS) can read .zip archives; GNU tar on
# Linux cannot, so the shell command is not portable across platforms.
import zipfile

with zipfile.ZipFile("multilabel-classification-dataset.zip") as archive:
    archive.extractall()

In [1]:
# Take a first look at the raw training data: load the CSV and show its first five rows
import pandas as pd

df = pd.read_csv('train.csv')
df.head(5)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [1]:
# First of all, we have to create the dataset
import torch
from dataset import MultiLabelDataset
from torch.utils.data import random_split

# Fixed seed so the train/validation partition is identical after every kernel restart;
# without it, models trained in different sessions are validated on different samples.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

dataset = MultiLabelDataset(data_path="train.csv", split='train')

# Split the data in train and validation (90% / 10%); the validation part takes the
# remainder so that the two sizes always add up to len(dataset).
n_train = int(len(dataset) * TRAIN_FRACTION)
n_val = len(dataset) - n_train
train_dataset, val_dataset = random_split(
    dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

Data loaded successfully, total number of train samples: 20972


In [2]:
# Create the network. We will start with a simple RNN based model.
from model import SimpleRNNModel
import json

# Load the vocabulary (presumably a word -> index mapping; verify against utils).
# A context manager is used so the file handle is closed deterministically, and the
# encoding is pinned because the platform default (e.g. cp1252 on Windows) is not UTF-8.
with open('w2i.json', encoding='utf-8') as vocab_file:
    word_2_idx = json.load(vocab_file)

# HYPER PARAMETERS
embedding_dim = 100                       # size of each word embedding vector
hidden_dim = 256                          # size of the RNN hidden state
output_dim = dataset.__getnlabels__()     # number of labels reported by the dataset
pad_idx = 0                               # vocabulary index used for padding
vocab_size = len(word_2_idx)              # number of entries in the vocabulary

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import convert_texts_to_indices

# TRAIN LOOP
# HYPERPARAMETERS
EPOCHS = 10
BATCH_SIZE = 32
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

# Build the RNN classifier and move it to the selected device (GPU when available)
model = SimpleRNNModel(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    vocab_size=vocab_size,
    pad_idx=pad_idx,
)
model.to(DEVICE)

# Shuffled mini-batches over the training split
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

# Adam over every model parameter
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Binary cross-entropy on raw logits: one independent yes/no decision per label
criterion = nn.BCEWithLogitsLoss()

# Put the model in training mode
model.train()

for epoch in range(EPOCHS):
    epoch_loss = 0
    for titles, abst, labels in tqdm(train_loader):
        labels = labels.to(DEVICE)
        # Only the titles are fed to the model; turn them into a padded tensor of word indices
        title_ids = convert_texts_to_indices(texts=titles, word2idx=word_2_idx, pad_idx=pad_idx).to(DEVICE)

        optimizer.zero_grad()
        logits = model(title_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Average of the per-batch losses over the epoch
    mean_loss = round(epoch_loss / len(train_loader), 4)
    print("Training loss: {}".format(mean_loss))
    

100%|██████████| 590/590 [00:12<00:00, 47.33it/s]


Training loss: 0.4096


100%|██████████| 590/590 [00:11<00:00, 49.88it/s]


Training loss: 0.3439


100%|██████████| 590/590 [00:12<00:00, 47.23it/s]


Training loss: 0.3015


100%|██████████| 590/590 [00:12<00:00, 47.53it/s]


Training loss: 0.27


100%|██████████| 590/590 [00:16<00:00, 35.91it/s]


Training loss: 0.2451


100%|██████████| 590/590 [00:18<00:00, 32.14it/s]


Training loss: 0.2228


100%|██████████| 590/590 [00:17<00:00, 33.22it/s]


Training loss: 0.1983


100%|██████████| 590/590 [00:18<00:00, 32.65it/s]


Training loss: 0.1754


100%|██████████| 590/590 [00:17<00:00, 32.82it/s]


Training loss: 0.1524


100%|██████████| 590/590 [00:16<00:00, 36.80it/s]

Training loss: 0.1303





In [6]:
# Calculate the validation loss and accuracy.
# "Accuracy" here is exact-match accuracy: a sample counts as correct only when
# every one of its labels is predicted correctly.

# NOTE: this reassigns the global BATCH_SIZE that the training cell also uses.
BATCH_SIZE = 2
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

model.eval()
val_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(val_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        out = model(titles_batch)
        loss = criterion(out, labels)
        val_loss += loss.item()
        # Sigmoid turns logits into per-label probabilities; rounding gives hard 0/1 predictions
        out = torch.round(torch.sigmoid(out))
        # Vectorised exact match: compare all labels of each sample at once instead of
        # looping over the batch in Python and calling torch.equal per sample.
        n_samples = labels.size(0)
        exact_match = (out == labels).reshape(n_samples, -1).all(dim=1)
        correct += int(exact_match.sum().item())
        total += n_samples

# The loss is the mean of the per-batch mean losses
print("Validation loss: {}".format(round(val_loss/len(val_loader),4)))
print("Validation accuracy: {}".format(round(correct/total,4)))

100%|██████████| 1049/1049 [00:03<00:00, 346.52it/s]

Validation loss: 0.3798
Validation accuracy: 0.5214





In [2]:
from transformers import AutoModelForSequenceClassification

# Two-way mapping between class indices and label names, passed to the model below
target_labels = dataset.labels
id2label = dict(enumerate(target_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained DistilBERT checkpoint with a classification head configured for
# multi-label classification, with one output per target label
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(target_labels),
    id2label=id2label,
    label2id=label2id,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from utils import convert_texts_to_indices_bert
from tqdm import tqdm

# TRAIN LOOP
# HYPERPARAMETERS
EPOCHS = 10
BATCH_SIZE = 32
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

## FREEZE THE WHOLE NETWORK ASIDE THE CLASSIFICATION LAYER
# Substring match on the parameter name: anything containing 'classifier' stays
# trainable (so a layer named e.g. 'pre_classifier' would stay trainable too).
for name, param in model.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

# Collect the parameters that are still trainable; the count printed below refers to these only
trainable_params = [p for p in model.parameters() if p.requires_grad]
print("Model parameters: {}".format(sum(p.numel() for p in trainable_params)))

# Send the model to the selected device (GPU when available)
model.to(DEVICE)

# Create the dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

# Create the optimizer. Only the trainable parameters are handed over: the frozen
# backbone never receives gradients, so there is no reason for Adam to track it.
optimizer = optim.Adam(trainable_params, lr = LEARNING_RATE)

# Create the loss function.
# NOTE(review): not used in this loop (the loss comes from the model output below);
# kept in case another cell relies on `criterion` being defined.
criterion = nn.BCEWithLogitsLoss()

# Put the model in training mode
model.train()

for epoch in range(EPOCHS):
    train_loss = 0
    for batch in tqdm(train_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        # NOTE(review): if the helper pads every sequence to max_len, 512 is far longer
        # than a title needs and would slow each step considerably - confirm in utils.
        titles_batch = convert_texts_to_indices_bert(texts=titles,max_len=512)
        input_ids = titles_batch['input_ids'].to(DEVICE)
        attention_mask = titles_batch['attention_mask'].to(DEVICE)
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside its predictions
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out.loss
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    # Average of the per-batch losses over the epoch
    print("Training loss: {}".format(round(train_loss/len(train_loader),4)))

Model parameters: 4614


  1%|▏         | 8/590 [01:20<1:38:12, 10.12s/it]


KeyboardInterrupt: 