# **Toxic comment classification using LSTM model**

Import torch, numpy and other realted packages

In [1]:
import os
import torch
import torchtext
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from torch import nn
from torch.nn import functional as F

We also need a tokenizer to break the text data into tokens and then vectorize the data by building a vocabulary. This can be achieved using the torchtext package.

In [None]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

Configure our device for training and other computations.

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

Read the dataset and display the top 5 entries.

In [3]:
data = pd.read_csv("../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv")
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Separate the data and their labels

In [7]:
X = data["comment_text"]
y = data[data.columns[2:]].values

Convert the data into iterator

In [5]:
X_iter = iter(X)

Initialize the tokenizer and build the vocabulary by defining a pipeline.

In [6]:
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(X), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

Write a custom pytorch dataset class for loading the data to our model for training and validation. Each input is first vectorized and capped to a fixed maximum length of 1000 words and if the number of words is less than 1000 then it is zero padded to form a vector of length 1000. The dataset object implements a getitem method that returns a data instance and its associated label.

In [8]:
class comment_dataset(Dataset):
    def __init__(self, X, y, vocab, tokenizer, max_words = 1000):
        self.data = np.array(X)
        self.labels = np.array(y)
        self.max_words = max_words
    def __getitem__(self, index=0):
        text_data = self.data[index]
        labels = self.labels[index, :]
        text_data = vocab(tokenizer(text_data))
        text_data = text_data +([0]* (self.max_words-len(text_data))) if len(text_data)<self.max_words else text_data[:self.max_words]
        return torch.tensor(text_data, dtype = torch.int32), torch.tensor(labels, dtype = torch.float32)

    def __len__(self):
        return self.data.shape[0]

Initialize the dataset object defined above and also split the data into a train and a validation set using random_split and then define a train dataloader that will make batched from the dataset and feed it to the model during training.

In [11]:
dataset = comment_dataset(X, y, vocab = vocab, tokenizer = tokenizer)
val_size = int(0.2*len(dataset))
train_size = len(dataset) - val_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])
batch_size = 60
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size = batch_size, shuffle=True)

In [12]:
vocab_size = len(vocab)

Next, write the model definition. It consists of an embedding layer, followed by an LSTM layer with 128 units and then finally a dense layer as output. The model outputs a vector of length 6 corresponding to each of the 6 classes.

In [32]:
embed_len = 50
hidden_dim = 128
n_layers=2

class LSTMClassifier(nn.Module):
    def __init__(self):
        super(LSTMClassifier, self).__init__()
        self.seq = nn.Sequential(nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_len))
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_len)
        self.lstm = nn.LSTM(input_size = embed_len, hidden_size = hidden_dim, num_layers = n_layers, batch_first = True, bidirectional = True)
        self.linear = nn.Sequential(nn.Linear(2 * hidden_dim, 128),
                                    nn.ReLU(),
                                    nn.Linear(128, 6),
                                    nn.Sigmoid())
        

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.lstm(embeddings, (torch.randn(2 * n_layers, len(X_batch), hidden_dim, device = device), torch.randn(2 * n_layers, len(X_batch), hidden_dim, device = device)))
        return self.linear(output[:,-1])


Initialize the model object

In [33]:
model = LSTMClassifier()
model = model.to(device)

Initialize binary crossentropy loss since the task at hand is a multi-label classification problem where we may get multiple class outputs. We also use Adam optimizer with a learning rate of 0.01

In [34]:
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [35]:
from tqdm import tqdm

Train the model for 20 epochs

In [36]:
epochs = 20
for i in range(epochs):
    running_loss = 0.0
    c = 0
    for data, label in tqdm(train_dataloader):
        data = data.to(device)
        label = label.to(device)
        c += 1
        optimizer.zero_grad()
        pred = model(data)
        loss_value = loss(pred, label)
        running_loss += loss_value.item()
        loss_value.backward()
        optimizer.step()
    print("Loss: {}".format(running_loss/c))

100%|██████████| 2128/2128 [07:08<00:00,  4.97it/s]


Loss: 0.14374395976144178


100%|██████████| 2128/2128 [07:09<00:00,  4.96it/s]


Loss: 0.1415056114380871


100%|██████████| 2128/2128 [07:09<00:00,  4.95it/s]


Loss: 0.14170061135874654


100%|██████████| 2128/2128 [07:09<00:00,  4.95it/s]


Loss: 0.14133674850428296


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.14120925655168362


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.1411624792341801


100%|██████████| 2128/2128 [07:10<00:00,  4.95it/s]


Loss: 0.14113805281292452


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.14107228560317167


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.1142057116910171


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.06085391862329299


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.050403507525728024


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.04495086714505442


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.04075624751326667


100%|██████████| 2128/2128 [07:11<00:00,  4.93it/s]


Loss: 0.03712931446133915


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.03397335843470856


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.03136015334552951


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]


Loss: 0.029051036354600972


100%|██████████| 2128/2128 [07:09<00:00,  4.95it/s]


Loss: 0.026950459177869493


100%|██████████| 2128/2128 [07:10<00:00,  4.95it/s]


Loss: 0.02502029775255631


100%|██████████| 2128/2128 [07:10<00:00,  4.94it/s]

Loss: 0.023210494066314227





Save the trained model

In [20]:
torch.save(model.state_dict(), "model.pth")

Save the vocabulary

In [16]:
torch.save(vocab, "vocab.pt")

Initialize validation dataloader

In [37]:
valid_loader = torch.utils.data.DataLoader(val_ds, batch_size = batch_size, shuffle=False)

Import accuracy metric from pytorch-iginite for model evaluation

In [42]:
from ignite.metrics import Accuracy
acc = Accuracy(is_multilabel=True, device = device)

Validate the model

In [43]:
acc.reset()
for data, label in train_dataloader:
    data = data.to("cuda")
    label = label.to("cuda")
    with torch.no_grad():
        pred = model(data)
    pred = torch.where(pred<0.5, 0, 1)
    acc.update((pred, label))
print("Accuracy: ", acc.compute())

Accuracy:  0.9641696107538169
