# MULTILABEL CLASSIFICATION

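A multilabel setting is one in which each sample can belong to more than one class at the same time. For example, in the dataset used below, a single document (title and abstract) can be tagged with several labels at once.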

In [None]:
# List the datasets exposed by the Kaggle CLI. This does not download anything
# yet (presumably a quick check that the `kaggle` command is set up — confirm).
!kaggle datasets list

In [None]:
# Download the dataset from Kaggle; the extraction cell below expects the
# archive to be named multilabel-classification-dataset.zip.
!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

In [None]:
# Extract the downloaded archive with the standard library. GNU `tar` (the
# default on most Linux systems) cannot read .zip files, so shelling out to
# `tar -xf` only works where a bsdtar-based `tar` is installed.
import zipfile

with zipfile.ZipFile('multilabel-classification-dataset.zip') as archive:
    archive.extractall()

In [None]:
# Let's start to explore the dataset
import pandas as pd

df = pd.read_csv('train.csv')

# Show one complete record (row index 2) as plain text
print(df.iloc[2])

# Keep the preview as the LAST expression of the cell so the notebook renders
# it; a bare `df.head()` in the middle of a cell is evaluated and discarded.
df.head()

In [None]:
# First of all, we have to create the dataset
import torch
from dataset import MultiLabelDataset
from torch.utils.data import random_split

dataset = MultiLabelDataset(data_path="train.csv", split='train')

# Use a dedicated, seeded generator so that train/val/test membership is the
# same on every run. With an unseeded random_split the test set changes on
# each execution, which makes the metrics reported below incomparable.
split_generator = torch.Generator().manual_seed(0)

# Hold out 10% of the samples for testing ...
n_train_val = int(len(dataset) * 0.9)
train_dataset, test_dataset = random_split(
    dataset,
    [n_train_val, len(dataset) - n_train_val],
    generator=split_generator,
)

# ... then hold out 10% of what is left for validation
n_train = int(len(train_dataset) * 0.9)
train_dataset, val_dataset = random_split(
    train_dataset,
    [n_train, len(train_dataset) - n_train],
    generator=split_generator,
)

In [None]:
import json

# Load the word -> index mapping. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('w2i.json') as vocab_file:
    word_2_idx = json.load(vocab_file)

# Define some quantities
# Used as the model's output dimension (presumably the number of labels)
output_dim = dataset.__getnlabels__()
# Index handed to both the model and the text-to-index conversion as padding
pad_idx = 0
# One more than the number of vocabulary entries
# (presumably reserves a slot for pad_idx — TODO confirm against w2i.json)
vocab_size = len(word_2_idx) + 1

In [None]:
# Try with the easiest model: an embedding-matrix model fed with the titles
# only (`abst` is unpacked from every batch but not used).

from model import EmbeddingMatrixModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm
from utils import convert_texts_to_indices

# HYPERPARAMETERS
EMBEDDING_DIM = 100
EPOCHS = 20
BATCH_SIZE = 32
# Must be a valid torch device string ('cuda:<index>'); anything else makes
# model.to(DEVICE) raise as soon as CUDA is available.
DEVICE = 'cuda:1' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

# Model
model = EmbeddingMatrixModel(embedding_dim=EMBEDDING_DIM, output_dim=output_dim, pad_idx=pad_idx, vocab_size=vocab_size)
# Send the model to the selected device
model.to(DEVICE)

# Create the dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

# Create the optimizer
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

# Create the loss function (one independent sigmoid/BCE term per label)
criterion = BCEWithLogitsLoss()

# Per-epoch mean batch losses, consumed by the plotting cell
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    # Switch back to training mode at the start of EVERY epoch: the validation
    # pass below puts the model in eval mode, so calling train() only once
    # before the loop would leave epochs 2..N training in eval mode.
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        optimizer.zero_grad()
        out, loss_mask = model(titles_batch)
        # Only samples whose mask entry is 1 contribute to the loss
        # (presumably the others have no usable tokens — verify in the model)
        keep = loss_mask.squeeze() == 1
        loss = criterion(out[keep], labels[keep])
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # EVALUATE ON THE VALIDATION SPLIT
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            titles, abst, labels = batch
            labels = labels.to(DEVICE)
            # Prepare the titles
            titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
            titles_batch = titles_batch.to(DEVICE)
            out, loss_mask = model(titles_batch)
            keep = loss_mask.squeeze() == 1
            loss = criterion(out[keep], labels[keep])

            val_loss += loss.item()

    print("Training loss epoch {}: {}".format(epoch, round(train_loss/len(train_loader),4)))
    print("Validation loss epoch {}: {}".format(epoch, round(val_loss/len(val_loader),4)))
    train_losses.append(train_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))
    

In [None]:
# Plot val losses and train losses
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(train_losses, label='train')
ax.plot(val_losses, label='val')
# Each point is the mean batch loss of one epoch; label the axes so the
# figure can be read without looking at the training cell.
ax.set(xlabel='epoch', ylabel='mean batch loss', title='Training vs. validation loss')
ax.legend()
plt.show()

In [None]:
# Calculate the test loss and accuracy

BATCH_SIZE = 128
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

model.eval()
test_loss = 0
correct = 0
total = 0

# Hard predictions / ground truth for every test sample, consumed by the
# classification-report cell
test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        # NOTE(review): unlike the training/validation loops, loss_mask is not
        # applied here, so every test sample is scored — confirm this is intended.
        out, loss_mask = model(titles_batch)
        loss = criterion(out, labels)
        test_loss += loss.item()
        # Turn logits into probabilities, then round to hard 0/1 predictions
        out = torch.round(torch.sigmoid(out))
        # Exact-match accuracy: a sample is correct only if ALL its labels match
        # (assumes out/labels are (batch, n_labels) — TODO confirm)
        correct += (out == labels).all(dim=1).sum().item()
        total += labels.size(0)

        test_predictions.extend(out.tolist())
        test_labels.extend(labels.tolist())

# These metrics come from the held-out TEST split, so label them as such
print("Test loss: {}".format(round(test_loss/len(test_loader),4)))
print("Test accuracy: {}".format(round(correct/total,4)))

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Convert the collected ground truth and predictions to arrays, then print
# scikit-learn's per-label precision / recall / F1 summary.
test_labels, test_predictions = np.array(test_labels), np.array(test_predictions)
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)

In [None]:
# Same embedding-matrix model as before, now trained with weight decay
# (weight_decay=1e-3 on Adam) as regularisation.

from model import EmbeddingMatrixModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm
from utils import convert_texts_to_indices

# HYPERPARAMETERS
EMBEDDING_DIM = 100
EPOCHS = 20
BATCH_SIZE = 32
DEVICE = 'cuda:1' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

# Model
model = EmbeddingMatrixModel(embedding_dim=EMBEDDING_DIM, output_dim=output_dim, pad_idx=pad_idx, vocab_size=vocab_size)
# Send the model to the selected device
model.to(DEVICE)

# Create the dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

# Create the optimizer
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE, weight_decay=1e-3)

# Create the loss function (one independent sigmoid/BCE term per label)
criterion = BCEWithLogitsLoss()

# Per-epoch mean batch losses, consumed by the plotting cell
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    # Switch back to training mode at the start of EVERY epoch: the validation
    # pass below puts the model in eval mode, so calling train() only once
    # before the loop would leave epochs 2..N training in eval mode.
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        optimizer.zero_grad()
        out, loss_mask = model(titles_batch)
        # Only samples whose mask entry is 1 contribute to the loss
        # (presumably the others have no usable tokens — verify in the model)
        keep = loss_mask.squeeze() == 1
        loss = criterion(out[keep], labels[keep])
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # EVALUATE ON THE VALIDATION SPLIT
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            titles, abst, labels = batch
            labels = labels.to(DEVICE)
            # Prepare the titles
            titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
            titles_batch = titles_batch.to(DEVICE)
            out, loss_mask = model(titles_batch)
            keep = loss_mask.squeeze() == 1
            loss = criterion(out[keep], labels[keep])

            val_loss += loss.item()

    print("Training loss epoch {}: {}".format(epoch, round(train_loss/len(train_loader),4)))
    print("Validation loss epoch {}: {}".format(epoch, round(val_loss/len(val_loader),4)))
    train_losses.append(train_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))
    

In [None]:
# Plot val losses and train losses
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(train_losses, label='train')
ax.plot(val_losses, label='val')
# Each point is the mean batch loss of one epoch; label the axes so the
# figure can be read without looking at the training cell.
ax.set(xlabel='epoch', ylabel='mean batch loss', title='Training vs. validation loss')
ax.legend()
plt.show()

In [None]:
# Calculate the test loss and accuracy

BATCH_SIZE = 128
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

model.eval()
test_loss = 0
correct = 0
total = 0

# Hard predictions / ground truth for every test sample, consumed by the
# classification-report cell
test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        # NOTE(review): unlike the training/validation loops, loss_mask is not
        # applied here, so every test sample is scored — confirm this is intended.
        out, loss_mask = model(titles_batch)
        loss = criterion(out, labels)
        test_loss += loss.item()
        # Turn logits into probabilities, then round to hard 0/1 predictions
        out = torch.round(torch.sigmoid(out))
        # Exact-match accuracy: a sample is correct only if ALL its labels match
        # (assumes out/labels are (batch, n_labels) — TODO confirm)
        correct += (out == labels).all(dim=1).sum().item()
        total += labels.size(0)

        test_predictions.extend(out.tolist())
        test_labels.extend(labels.tolist())

# These metrics come from the held-out TEST split, so label them as such
print("Test loss: {}".format(round(test_loss/len(test_loader),4)))
print("Test accuracy: {}".format(round(correct/total,4)))

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Convert the collected ground truth and predictions to arrays, then print
# scikit-learn's per-label precision / recall / F1 summary.
test_labels, test_predictions = np.array(test_labels), np.array(test_predictions)
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)

In [None]:
# Do a step, use a RNN based model.
from model import SimpleRNNModel
import json

# Load the word -> index mapping. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('w2i.json') as vocab_file:
    word_2_idx = json.load(vocab_file)

# HYPER PARAMETERS
embedding_dim = 100
hidden_dim = 256
# Used as the model's output dimension (presumably the number of labels)
output_dim = dataset.__getnlabels__()
pad_idx = 0
# One more than the number of vocabulary entries
# (presumably reserves a slot for pad_idx — TODO confirm against w2i.json)
vocab_size = len(word_2_idx)+1

# Echo the configuration that will be used to build the model
print(embedding_dim)
print(hidden_dim)
print(output_dim)
print(vocab_size)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from model import SimpleRNNModel
from utils import convert_texts_to_indices

# TRAIN LOOP
# HYPERPARAMETERS
EPOCHS = 10
BATCH_SIZE = 32
DEVICE = 'cuda:1' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 1e-3

# Model
model = SimpleRNNModel(embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       output_dim=output_dim,
                       vocab_size=vocab_size,
                       pad_idx=pad_idx)
# Send the model to the selected device
model.to(DEVICE)

# Create the dataloaders. val_loader is built here explicitly: the validation
# loop below needs it, and relying on a loader left over from an earlier cell
# makes this cell fail (or silently use stale settings) when run on its own.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

# Create the optimizer
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE, weight_decay=1e-3)

# Create the loss function (one independent sigmoid/BCE term per label)
criterion = nn.BCEWithLogitsLoss()

# Per-epoch mean batch losses
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    # Training mode for this epoch (the validation pass below switches to eval)
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        optimizer.zero_grad()
        out = model(titles_batch)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # EVALUATE ON THE VALIDATION SPLIT
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            titles, abst, labels = batch
            labels = labels.to(DEVICE)
            # Prepare the titles
            titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
            titles_batch = titles_batch.to(DEVICE)
            out = model(titles_batch)
            loss = criterion(out, labels)

            val_loss += loss.item()

    # Leave the model in training mode, as callers of this cell may expect
    model.train()

    print("Training loss epoch {}: {}".format(epoch, round(train_loss/len(train_loader),4)))
    print("Validation loss epoch {}: {}".format(epoch, round(val_loss/len(val_loader),4)))
    train_losses.append(train_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))

In [None]:
# Calculate the test loss and accuracy

BATCH_SIZE = 128
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

model.eval()
test_loss = 0
correct = 0
total = 0

# Hard predictions / ground truth for every test sample, consumed by the
# classification-report cell
test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        titles, abst, labels = batch
        labels = labels.to(DEVICE)
        # Prepare the titles
        titles_batch = convert_texts_to_indices(texts=titles,word2idx=word_2_idx,pad_idx=pad_idx)
        titles_batch = titles_batch.to(DEVICE)
        out = model(titles_batch)
        loss = criterion(out, labels)
        test_loss += loss.item()
        # Turn logits into probabilities, then round to hard 0/1 predictions
        out = torch.round(torch.sigmoid(out))
        # Exact-match accuracy: a sample is correct only if ALL its labels match
        # (assumes out/labels are (batch, n_labels) — TODO confirm)
        correct += (out == labels).all(dim=1).sum().item()
        total += labels.size(0)

        test_predictions.extend(out.tolist())
        test_labels.extend(labels.tolist())

# These metrics come from the held-out TEST split, so label them as such
print("Test loss: {}".format(round(test_loss/len(test_loader),4)))
print("Test accuracy: {}".format(round(correct/total,4)))

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Convert the collected ground truth and predictions to arrays, then print
# scikit-learn's per-label precision / recall / F1 summary.
test_labels, test_predictions = np.array(test_labels), np.array(test_predictions)
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)

In [None]:
# Load the dataset
from dataset import BERT_dataset
import torch
import random
from torch.utils.data import random_split

# FIX THE SEED
random.seed(45)
torch.manual_seed(0)

dataset = BERT_dataset(data_path="train.csv", split='train')

# First cut: 90% train+validation, 10% test
n_total = len(dataset)
n_train_val = int(n_total*0.9)
train_dataset, test_dataset = random_split(dataset, [n_train_val, n_total - n_train_val])

# Second cut: 90% of the train+validation part for training, the rest for validation
n_train_val = len(train_dataset)
n_train = int(n_train_val*0.9)
train_dataset, val_dataset = random_split(train_dataset, [n_train, n_train_val - n_train])

In [None]:
# Two-way lookup between the dataset's labels and their positional indices;
# both dicts are passed to the Hugging Face model configuration below.
target_labels = dataset.labels
id2label = dict(enumerate(target_labels))
label2id = {name: position for position, name in enumerate(target_labels)}

In [None]:
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from dataset import custom_collate

# HYPERPARAMETERS
EPOCHS = 10
BATCH_SIZE = 32
DEVICE = 'cuda:1' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 2e-5

# Model: pretrained DistilBERT with a multi-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(target_labels),
    id2label=id2label,
    label2id=label2id,
)

# Optional: uncomment to freeze every parameter whose name does not contain
# "classifier", so that only the classification head is trained.
# for name, param in model.named_parameters():
#     if "classifier" not in name:
#         param.requires_grad = False

# Number of trainable parameters
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Model parameters: {}".format(n_trainable))

# Send the model to the selected device
model.to(DEVICE)

# Create the dataloaders
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
    pin_memory=True, num_workers=8, collate_fn=custom_collate,
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False,
    pin_memory=True, num_workers=8, collate_fn=custom_collate,
)

# Create the optimizer
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

# Create the loss function
# (not used by the loops below, which read the loss computed by the model itself)
import torch.nn as nn
criterion = nn.BCEWithLogitsLoss()

# Put the model in training mode
model.train()

train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    train_loss = 0
    for batch in tqdm(train_loader):
        # Unpack the collated batch and move every tensor to the device
        input_ids, attention_mask, labels = (tensor.to(DEVICE) for tensor in batch)
        optimizer.zero_grad()
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Passing `labels` makes the model return its own loss
        loss = out.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # EVALUATE ON THE VALIDATION SPLIT
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids, attention_mask, labels = (tensor.to(DEVICE) for tensor in batch)
            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = out.loss
            val_loss += loss.item()

    # Back to training mode for the next epoch
    model.train()

    # Mean batch loss of this epoch on each split
    epoch_train_loss = train_loss/len(train_loader)
    epoch_val_loss = val_loss/len(val_loader)
    print("Training loss epoch {}: {}".format(epoch, round(epoch_train_loss,4)))
    print("Validation loss epoch {}: {}".format(epoch, round(epoch_val_loss,4)))
    train_losses.append(epoch_train_loss)
    val_losses.append(epoch_val_loss)

In [None]:
# Calculate the test loss and accuracy

BATCH_SIZE = 128
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, collate_fn=custom_collate)

model.eval()
test_loss = 0
correct = 0
total = 0

# Hard predictions / ground truth for every test sample, consumed by the
# classification-report cell
test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, labels = batch
        labels = labels.to(DEVICE)
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        # Passing `labels` makes the model return its own loss next to the logits
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out.loss
        test_loss += loss.item()
        # Turn logits into probabilities, then round to hard 0/1 predictions
        out = torch.round(torch.sigmoid(out["logits"]))
        # Exact-match accuracy: a sample is correct only if ALL its labels match
        # (assumes logits/labels are (batch, n_labels) — TODO confirm)
        correct += (out == labels).all(dim=1).sum().item()
        total += labels.size(0)

        test_predictions.extend(out.tolist())
        test_labels.extend(labels.tolist())

# These metrics come from the held-out TEST split, so label them as such
print("Test loss: {}".format(round(test_loss/len(test_loader),4)))
print("Test accuracy: {}".format(round(correct/total,4)))

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Convert the collected ground truth and predictions to arrays, then print
# scikit-learn's per-label precision / recall / F1 summary.
test_labels, test_predictions = np.array(test_labels), np.array(test_predictions)
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)