<a href="https://colab.research.google.com/github/PremKumar-V/NLP_Projects/blob/main/Toxic_Comment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Kaggle command-line client used by the download cell below.
!pip install kaggle --quiet

In [None]:
import os

# Tell the Kaggle client to use the current directory as its config directory.
# NOTE(review): presumably kaggle.json must already be present here — confirm.
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [None]:
# Fetch the Jigsaw toxic-comment competition archive.
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

In [None]:
# Unpack the archive into the `data` directory; later cells read files from it.
!unzip jigsaw-toxic-comment-classification-challenge.zip -d data

In [None]:
import pandas as pd

In [None]:
train = pd.read_csv('/content/data/train.csv.zip')
test = pd.read_csv('/content/data/test.csv.zip')
sub = pd.read_csv('/content/data/sample_submission.csv.zip')

In [None]:
train.head(5)

In [None]:
targetCols = [i for i in train.columns.tolist()][2:]
targetCols

In [None]:
for values in targetCols:
    print(train[values].value_counts(normalize = True))

In [None]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

In [None]:
VOCAB_SIZE = 1500
unk_token = '<unk>'
pad_token = '<pad>'

In [None]:
commentsTokens = train['comment_text'].map(tokenizer)

In [None]:
vocab = build_vocab_from_iterator(commentsTokens, specials = [unk_token, pad_token], max_tokens = VOCAB_SIZE)
vocab.set_default_index(vocab[unk_token])

In [None]:
train.comment_text.sample(1000).map(tokenizer).map(len).plot(kind='hist');

In [None]:
MAX_LENGTH = 150


def padTokens(tokens):
    """Return a token list of exactly MAX_LENGTH entries.

    Shorter inputs are extended on the right with `pad_token`; inputs that
    are already MAX_LENGTH or longer are cut down to their first MAX_LENGTH
    entries. The input list itself is never modified.
    """
    shortfall = MAX_LENGTH - len(tokens)
    if shortfall > 0:
        return tokens + [pad_token] * shortfall
    return tokens[:MAX_LENGTH]

In [None]:
import torch
from torch.utils.data import Dataset, random_split

In [None]:
class ClassificationDataset(Dataset):
    """Map-style dataset yielding (token-index tensor, label tensor) pairs.

    Each comment is tokenized, padded/truncated by `padTokens`, and converted
    to vocabulary indices. Labels come from the `targetCols` columns of the
    frame; in test mode an all-zero placeholder of the same width is returned
    so that batches keep the same (input, target) structure.

    Args:
        df: DataFrame with a `comment_text` column and, unless `isTest` is
            set, the label columns named in `targetCols`.
        isTest: When True, labels are not read from `df`.
    """

    def __init__(self, df, isTest = False):
        self.df = df
        self.isTest = isTest
        # Extract the arrays once. Evaluating `df[targetCols].values` inside
        # __getitem__ rebuilt an array over the whole frame for every sample.
        self._comments = df['comment_text'].values
        self._targets = None if isTest else df[targetCols].values

    def __getitem__(self, index):
        """Return `(indices, target)` for the row at position `index`."""
        tokens = padTokens(tokenizer(self._comments[index]))
        indices = torch.tensor(vocab.lookup_indices(tokens))

        if self.isTest:
            # Placeholder labels, one zero per label column.
            target = torch.zeros(len(targetCols), dtype=torch.float32)
        else:
            target = torch.tensor(self._targets[index]).float()

        return indices, target

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

In [None]:
rawDataset = ClassificationDataset(train)

In [None]:
rawDataset[0]

In [None]:
VAL_FRAC = 0.25

In [None]:
trainDataset, valDataset = random_split(rawDataset, [1-VAL_FRAC, VAL_FRAC])

In [None]:
testDataset = ClassificationDataset(test, isTest = True)

testDataset[0]

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 256

In [None]:
trainDl = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=8, pin_memory=True)
valDl = DataLoader(valDataset, batch_size=BATCH_SIZE*2, num_workers=8, pin_memory=True)
testDl = DataLoader(testDataset, batch_size=BATCH_SIZE*2, num_workers=8, pin_memory=True)

In [None]:
for batch in trainDl:
    input, target = batch
    print(f"Inputs Shape: {input.shape}")
    print(f"Targets Shape: {target.shape}")
    break

In [None]:
import torch.nn as nn
import torch.functional as f

In [None]:
embLayer = nn.Embedding(VOCAB_SIZE, 256, 1)

In [None]:
rnnLayer = nn.RNN(256, 128, 1, batch_first=True)

In [None]:
for batch in trainDl:
    input, target = batch
    print('Input.shape', input.shape)
    print('Target.shape', target.shape)

    embOut = embLayer(input)
    print('Embedding shape', embOut.shape)

    rnnOut, hn = rnnLayer(embOut)
    print('RNN shape', rnnOut.shape)
    print('Hidden shape', hn.shape)

    break

In [None]:
# Install PyTorch Lightning, which the next cell imports as `pl`.
!pip install pytorch_lightning --quiet

In [None]:
import torch.nn.functional as F
import pytorch_lightning as pl
import numpy as np

In [None]:
class ExtractTensor(nn.Module):
    """Module that keeps only the final step of a sequence output."""

    def forward(self, x):
        """Unpack a two-element `x` and slice its first element.

        The first element is indexed as `[:, -1, :]`, i.e. the last position
        along the second axis; the second element is discarded.
        """
        sequence_output, _ = x
        last_step = sequence_output[:, -1, :]
        return last_step


In [None]:
class ClassificationModel(pl.LightningModule):
    """Embedding -> LSTM -> linear head producing six label logits.

    `forward` returns raw logits. Training and validation compute binary
    cross-entropy directly from those logits, which is numerically more
    stable than applying a sigmoid first; `predict_step` applies the sigmoid
    to return per-label probabilities.
    """

    def __init__(self):
        super().__init__()
        # padding_idx=1: that embedding row receives no gradient updates. It is
        # expected to match the '<pad>' index of the vocabulary.
        self.emb = nn.Embedding(VOCAB_SIZE, 256, padding_idx=1)
        self.lstm = nn.LSTM(256, 128, num_layers=1, batch_first=True)
        self.linear = nn.Linear(128, 6)
        self.learning_rate = 0.001
        # Per-batch validation losses, averaged and cleared at epoch end.
        self.validation_step_outputs = []

    def forward(self, x):
        """Return logits for a batch of token-index sequences `x`."""
        out = self.emb(x)
        out, _ = self.lstm(out)
        # Only the LSTM output at the last time step feeds the classifier.
        # NOTE(review): inputs are right-padded, so for short comments this
        # step corresponds to padding — confirm this is intended.
        out = F.relu(out[:, -1, :])
        return self.linear(out)

    def _loss(self, batch):
        """Compute binary cross-entropy from logits for one (inputs, targets) batch."""
        inputs, targets = batch
        return F.binary_cross_entropy_with_logits(self(inputs), targets)

    def training_step(self, batch, batch_idx):
        """Return the training loss for `batch`."""
        return self._loss(batch)

    def validation_step(self, batch, batch_idx):
        """Record and return the validation loss for `batch`.

        The loss is returned as a tensor rather than via `.item()` so that no
        host/device synchronisation is forced on every step.
        """
        loss = self._loss(batch)
        self.validation_step_outputs.append(loss.detach())
        return loss

    def on_validation_epoch_end(self):
        """Print the mean of the recorded validation losses and reset them."""
        if not self.validation_step_outputs:
            # Nothing was recorded; torch.stack would fail on an empty list.
            return
        loss = torch.stack(self.validation_step_outputs).mean()
        print("Epoch #{}; Loss: {:.4f} ".format(self.current_epoch, loss))
        self.validation_step_outputs.clear()  # free memory

    def predict_step(self, batch, batch_idx):
        """Return per-label probabilities for `batch`; its targets are ignored."""
        inputs, _ = batch
        return torch.sigmoid(self(inputs))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters at `self.learning_rate`."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [None]:
model = ClassificationModel()

# Run one batch through the untrained model to check shapes and the loss value.
for batch in trainDl:
    # Named `inputs`/`targets` so the builtin `input` is not shadowed.
    inputs, targets = batch
    print('Inputs.shape', inputs.shape)
    print('targets.shape', targets.shape)

    outputs = model(inputs)
    print('outputs shape', outputs.shape)

    # Probabilities are kept for inspection only. The loss is computed straight
    # from the logits, which is numerically more stable than a sigmoid followed
    # by binary_cross_entropy.
    probs = torch.sigmoid(outputs)
    loss = F.binary_cross_entropy_with_logits(outputs, targets)
    print('Loss', loss)
    break

In [None]:
path = './model.pth'

In [None]:
if os.path.exists(path):
    model = torch.load('model.pth')
else:
    trainer = pl.Trainer(max_epochs=3, accelerator='gpu')
    trainer.fit(model, trainDl, valDl)

In [None]:
import torch
torch.save(model, 'model.pth')

# Load the entire model
# model = torch.load('model.pth')
# model.eval()  # Put the model in evaluation mode after loading