In [1]:
import os
import pandas as pd
import tqdm
!pip install transformers --quiet
import transformers
from transformers import DistilBertModel, DistilBertConfig,DistilBertTokenizerFast, BertTokenizer, BertModel,AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
!pip install pytorch-lightning --quiet
import pytorch_lightning as py_light
from pytorch_lightning.metrics.functional import accuracy, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

[K     |████████████████████████████████| 2.1MB 5.3MB/s 
[K     |████████████████████████████████| 3.3MB 17.6MB/s 
[K     |████████████████████████████████| 901kB 38.9MB/s 
[K     |████████████████████████████████| 849kB 5.8MB/s 
[K     |████████████████████████████████| 276kB 36.9MB/s 
[K     |████████████████████████████████| 829kB 28.9MB/s 
[K     |████████████████████████████████| 112kB 38.8MB/s 
[K     |████████████████████████████████| 184kB 36.5MB/s 
[K     |████████████████████████████████| 1.3MB 34.6MB/s 
[K     |████████████████████████████████| 296kB 41.3MB/s 
[K     |████████████████████████████████| 143kB 39.6MB/s 
[?25h  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
  Building wheel for future (setup.py) ... [?25l[?25hdone


In [2]:
#switch the working directory so the relative file names used below resolve inside this folder
DATASET_DIR = '/content/drive/MyDrive/Colab Notebooks/Dataset'
os.chdir(DATASET_DIR)

In [3]:
#!unzip jigsaw-toxic-comment-classification-challenge.zip

In [4]:
#read the zipped training CSV (path is relative to the current working directory)
TRAIN_CSV = 'train.csv.zip'
dataset = pd.read_csv(TRAIN_CSV)

In [5]:
#preview the first ten rows: id, comment text and the six 0/1 label columns
dataset.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [6]:
#preview the last ten rows of the frame
dataset.tail(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159561,ffd2e85b07b3c7e4,"""\nNo he did not, read it again (I would have ...",0,0,0,0,0,0
159562,ffd72e9766c09c97,"""\n Auto guides and the motoring press are not...",0,0,0,0,0,0
159563,ffe029a7c79dc7fe,"""\nplease identify what part of BLP applies be...",0,0,0,0,0,0
159564,ffe897e7f7182c90,Catalan independentism is the social movement ...,0,0,0,0,0,0
159565,ffe8b9316245be30,The numbers in parentheses are the additional ...,0,0,0,0,0,0
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [7]:
#count missing values per column (all zeros means nothing needs imputing)
dataset.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [8]:
#(rows, columns) of the raw frame; each row holds one comment
dataset.shape

(159571, 8)

In [9]:
#label columns of the dataset; summing each 0/1 column gives its number of positive comments
classes = [
  'toxic',
  'severe_toxic',
  'obscene',
  'threat',
  'insult',
  'identity_hate',
]
dataset[classes].sum()

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

# Observation 
The classes are imbalanced: `toxic` has far more positive samples than any of the other labels.
Let's compare the number of clean comments (no label set) with the number of toxic comments (at least one of the labels set).

In [10]:
#a comment is "toxic" when at least one label column is set, "clean" when none is
labels_per_comment = dataset[classes].sum(axis=1)
toxic_comments = dataset[labels_per_comment > 0]
clean_comments = dataset[labels_per_comment == 0]

print("Number of toxic comments", len(toxic_comments))
print("Number of clean comments", len(clean_comments))

Number of toxic comments 16225
Number of clean comments 143346


# Observation
Clean comments outnumber toxic comments by roughly nine to one (143,346 vs. 16,225).
To balance the data, we sample as many clean comments as there are toxic comments.

In [11]:
#balance the data: keep every toxic comment and an equally sized random subset of the clean ones.
#a fixed random_state makes the sampled subset (and everything derived from it) reproducible.
uniform_dataset = pd.concat([
  toxic_comments,
  clean_comments.sample(n=len(toxic_comments), random_state=42),
])

In [12]:
#recompute both groups on the balanced frame to confirm they now have the same size
balanced_label_counts = uniform_dataset[classes].sum(axis=1)
toxic_comments = uniform_dataset[balanced_label_counts > 0]
clean_comments = uniform_dataset[balanced_label_counts == 0]

print("Number of toxic comments", len(toxic_comments))
print("Number of clean comments", len(clean_comments))

Number of toxic comments 16225
Number of clean comments 16225


In [13]:
#80/20 train/validation split of the balanced data.
#random_state pins the shuffle so both splits are identical on every run.
train_dataset, val_dataset = train_test_split(
  uniform_dataset,
  test_size=0.2,
  shuffle=True,
  random_state=42,
)

In [14]:
#inspect the first row of the training split
train_dataset.iloc[0]

id                    ab042d8df42b0b92
comment_text     YOUR MOM's Va jay jay
toxic                                1
severe_toxic                         0
obscene                              0
threat                               0
insult                               0
identity_hate                        0
Name: 154244, dtype: object

In [15]:
#report (rows, columns) of both splits
train_shape, val_shape = train_dataset.shape, val_dataset.shape
print("shape of training dataset", train_shape)
print("shape of validation dataset", val_shape)

shape of training dataset (25960, 8)
shape of validation dataset (6490, 8)


In [16]:
#names of the pretrained checkpoints loaded in this cell
DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
BERT_CHECKPOINT = 'bert-base-uncased'

#DistilBERT tokenizer and encoder
tokenizer = DistilBertTokenizerFast.from_pretrained(DISTILBERT_CHECKPOINT)
model = DistilBertModel.from_pretrained(DISTILBERT_CHECKPOINT)
#BERT tokenizer and encoder
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model1 = BertModel.from_pretrained(BERT_CHECKPOINT)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [17]:
#defining the data class which inherits the Dataset class
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one comment per item.

  Args:
    dataframe: frame with a `comment_text` column and one column per label.
    tokenizer: object exposing `encode_plus(text, **kwargs)` that returns
      `input_ids` and `attention_mask` tensors.
    max_token_len: every encoded comment is padded/truncated to this length.
    label_columns: names of the label columns. When None, the notebook-level
      `classes` list is looked up at item access time (the previous behaviour).
  """

  def __init__(self, dataframe, tokenizer, max_token_len=128, label_columns=None):
    self.dataframe = dataframe
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    self.label_columns = label_columns

  def __len__(self):
    """Number of comments in the wrapped frame."""
    return len(self.dataframe)

  def __getitem__(self, idx):
    """Return the raw text, its encoded ids/mask and the float label vector for row `idx`."""
    comment = self.dataframe.iloc[idx]
    text = comment.comment_text

    columns = classes if self.label_columns is None else self.label_columns
    # a row mixing strings and ints is an object-dtype Series, so convert the
    # selected labels to float32 explicitly instead of relying on the legacy
    # torch.FloatTensor constructor to coerce them
    label_values = comment[list(columns)].to_numpy(dtype="float32")

    encoded_text = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return dict(
        input_text=text,
        # the tokenizer returns a leading batch axis of size 1; flatten drops it
        input_ids=encoded_text['input_ids'].flatten(),
        attention_mask=encoded_text['attention_mask'].flatten(),
        label=torch.as_tensor(label_values, dtype=torch.float32),
    )

In [18]:
#build the dataset over the training split to check that items can be produced
train_data = CustomDataset(
  dataframe=train_dataset,
  tokenizer=bert_tokenizer,
  max_token_len=128,
)

In [19]:
import random
#draw one random item and show both the item and the fields it contains
idx = random.randrange(len(train_data))
sample_data = train_data[idx]
for shown in (sample_data, sample_data.keys()):
  print(shown)

{'input_text': '"\n\n Ban of ""Bryansee"" from Wikipediocracy. \n\nHey, you are Zoloft. The one who banned me from Wikipediocracy with a threat that I die. ""Well"" means dead. ""Recover"" means ""die"". You are wanting me to die by a medication increase or meet my maker. Check this out:  103.244.189.136  "', 'input_ids': tensor([  101,  1000,  7221,  1997,  1000,  1000,  8527, 19763,  1000,  1000,
         2013, 15536,  3211,  5669,  3695, 26775, 15719,  1012,  4931,  1010,
         2017,  2024,  1062, 12898,  6199,  1012,  1996,  2028,  2040,  7917,
         2033,  2013, 15536,  3211,  5669,  3695, 26775, 15719,  2007,  1037,
         5081,  2008,  1045,  3280,  1012,  1000,  1000,  2092,  1000,  1000,
         2965,  2757,  1012,  1000,  1000,  8980,  1000,  1000,  2965,  1000,
         1000,  3280,  1000,  1000,  1012,  2017,  2024,  5782,  2033,  2000,
         3280,  2011,  1037, 14667,  3623,  2030,  3113,  2026,  9338,  1012,
         4638,  2023,  2041,  1024,  9800,  1012, 24

In [20]:
#creating datamodule (train agnostic)
class TrainDataModule(py_light.LightningDataModule):
  """Lightning data module serving the train and validation comment frames.

  Args:
    train_data: frame with the training comments.
    val_data: frame with the validation comments.
    tokenizer: tokenizer handed to every `CustomDataset`.
    batch: batch size of both data loaders.
    max_token_len: token length every comment is padded/truncated to.
  """

  def __init__(self, train_data, val_data,tokenizer, batch=12, max_token_len=128):
    #initialising the parent class
    super().__init__()
    # keep the raw inputs under their own names so setup() can be run again
    self._train_frame = train_data
    self._val_frame = val_data
    # until setup() runs these still expose the raw inputs, as before
    self.train_data = train_data
    self.val_data = val_data
    self.batch = batch
    self.max_token_len = max_token_len
    self.tokenizer = tokenizer

  def setup(self, stage=None):
    """Wrap both frames in `CustomDataset`.

    Always built from the stored raw frames: setup() may run more than once
    (e.g. called by hand and again by the trainer), and re-wrapping an already
    wrapped dataset would break item access.
    """
    self.train_data = CustomDataset(self._train_frame, self.tokenizer, self.max_token_len)
    self.val_data = CustomDataset(self._val_frame, self.tokenizer, self.max_token_len)

  #defining the train data loader method
  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(self.train_data, batch_size=self.batch, shuffle=True)

  #defining val data loader method
  def val_dataloader(self):
    """Loader over the validation dataset, in fixed order."""
    return DataLoader(self.val_data, batch_size=self.batch, shuffle=False)

In [21]:

#instantiate the data module and build its datasets once to verify it works
data_module = TrainDataModule(
  train_data=train_dataset,
  val_data=val_dataset,
  tokenizer=bert_tokenizer,
  batch=12,
  max_token_len=128,
)
data_module.setup()

In [22]:
#defining the Lightning module that fine-tunes a pretrained BERT encoder for multi-label classification
class Model_Finetune(py_light.LightningModule):
  """`bert-base-uncased` encoder followed by a linear multi-label head.

  Args:
    num_classes: number of independent binary labels predicted per comment.
    steps_per_epoch: size used for the warmup/decay schedule. When this or
      `epochs` is None, only the optimizer is configured.
    epochs: number of epochs the schedule should span.
  """

  def __init__(self, num_classes, steps_per_epoch=None, epochs=None):
    super().__init__()
    #self.save_hyperparameters()

    self.model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
    # take the head's input width from the encoder config instead of hard-coding 768
    self.final_layer = nn.Linear(self.model.config.hidden_size, num_classes)
    self.steps_per_epoch = steps_per_epoch
    self.epochs  = epochs
    self.loss_function = nn.BCELoss()

  #defining the forward pass
  def forward(self, input_ids, attention_mask, labels=None):
    """Return `(loss, probabilities)`; loss is 0 when `labels` is None."""
    encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
    probabilities = torch.sigmoid(self.final_layer(encoded.pooler_output))
    loss = 0
    if labels is not None:
      loss = self.loss_function(probabilities, labels)
    return loss, probabilities

  def _shared_step(self, batch):
    """Run the forward pass on one batch dict and return (loss, probabilities, labels)."""
    labels = batch['label']
    loss, out = self(batch['input_ids'], batch['attention_mask'], labels)
    return loss, out, labels

  #defining the training step
  def training_step(self,batch, batch_number):
    """Log the training loss and hand predictions/labels to the epoch-end hook."""
    loss, out, labels = self._shared_step(batch)
    self.log("training loss" , loss, prog_bar = True, logger=True)
    # only the loss needs gradients; detach the predictions so the collected
    # step outputs do not carry the autograd graph around
    return {"loss": loss, "preds": out.detach(), "labels": labels}

  #defining the validation step
  def validation_step(self,batch, batch_number):
    """Log and return the validation loss of one batch."""
    loss, _, _ = self._shared_step(batch)
    self.log("val_loss", loss, prog_bar = True, logger=True)
    return loss

  #calculating ROC-AUC on epoch end:
  def training_epoch_end(self, output):
    """Log one ROC-AUC scalar per label from all step outputs of the epoch.

    `output` is iterated as a sequence of the dicts returned by training_step.
    """
    if not output:
      return
    # read each step's own dict (indexing the list itself with a string fails)
    labels = torch.cat([step["labels"].detach().cpu() for step in output]).int()
    preds = torch.cat([step["preds"].detach().cpu() for step in output])

    for idx, name in enumerate(classes):
      roc_score = auroc(preds[:, idx], labels[:, idx])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", roc_score, self.current_epoch)

  # method for configuring optimizer
  def configure_optimizers(self):
    """Return AdamW, plus a linear warmup/decay scheduler when schedule sizes are known."""
    optimizer = AdamW(self.parameters(), lr=1e-4)
    if self.steps_per_epoch is None or self.epochs is None:
      # e.g. a model constructed without schedule sizes: nothing to size the scheduler with
      return optimizer

    warmup_steps = self.steps_per_epoch // 4
    # total length of the schedule (warmup included): steps per epoch times epochs
    total_steps = self.steps_per_epoch * self.epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # NOTE(review): a scheduler returned in this form is stepped once per epoch
    # by Lightning's default. If steps_per_epoch becomes a real batch count,
    # return {"scheduler": scheduler, "interval": "step"} instead. Left as is
    # because the visible caller passes placeholder sizes - TODO confirm.
    return [optimizer], [scheduler]


In [33]:
#build the model for the six labels
#NOTE(review): steps_per_epoch=1 / epochs=1 look like placeholders for the scheduler
#sizes (see the commented hint) - revisit together with configure_optimizers
finetune_model = Model_Finetune(
  num_classes=6,
  steps_per_epoch=1,  #len(train_dataset)//8
  epochs=1,
)

In [37]:
 #sanity check: push one sample through the freshly built model and score it with BCE
 loss_function = nn.BCELoss()
 sample_ids = sample_data['input_ids'].unsqueeze(dim=0)
 sample_mask = sample_data['attention_mask'].unsqueeze(dim=0)
 sample_target = sample_data['label'].unsqueeze(dim=0)
 _, predictions = finetune_model(sample_ids, sample_mask)
 loss_function(predictions, sample_target)

tensor(0.6496, grad_fn=<BinaryCrossEntropyBackward>)

In [38]:
#keep only the single checkpoint with the lowest monitored "val_loss"
from pytorch_lightning.callbacks import ModelCheckpoint
checkpoint_options = dict(
  dirpath="saved_model",
  filename="trained_model",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min",
)
checkpointer = ModelCheckpoint(**checkpoint_options)




In [39]:
#single-epoch trainer.
#the ModelCheckpoint instance is registered through `callbacks`; `checkpoint_callback`
#is a boolean switch in Lightning 1.x and is not meant to receive the callback object.
#use one GPU when CUDA is available and fall back to CPU otherwise.
trainer = py_light.Trainer(
  max_epochs=1,
  callbacks=[checkpointer],
  gpus=1 if torch.cuda.is_available() else 0,
  progress_bar_refresh_rate=30,
)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
#run training and validation using the loaders supplied by the data module
trainer.fit(model=finetune_model, datamodule=data_module)

In [42]:
#restore the best checkpoint recorded by the checkpoint callback, then put the model in eval mode and freeze it
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = Model_Finetune.load_from_checkpoint(
  best_checkpoint_path,
  num_classes=len(classes),
)
trained_model.eval()
trained_model.freeze()

# Evaluating the model


In [43]:
#toxic
text1 = "Please go away, you're no longer needed here. Run!!!"
#non-toxic
text2 = "Hi There, welcome to the demo"

def encode_for_inference(text, max_length=128):
  """Tokenize `text` with the same settings the training dataset uses.

  Uses `bert_tokenizer` (the tokenizer the training data was encoded with) and
  truncates to `max_length`, so longer inputs still produce fixed-length
  `input_ids` / `attention_mask` tensors with a leading batch axis of 1.
  """
  return bert_tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=max_length,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
  )

#token ids and attention masks for both example texts
embedding_text1 = encode_for_inference(text1)
embedding_text2 = encode_for_inference(text2)

In [44]:
#score the first example and print one sigmoid output per label
_, pred1 = trained_model(
  embedding_text1["input_ids"],
  embedding_text1["attention_mask"],
)
pred1 = pred1.flatten().numpy()

for class_name, score in zip(classes, pred1):
  print(f"{class_name}: {score}")

toxic: 0.7098094820976257
severe_toxic: 0.004845251329243183
obscene: 0.020413745194673538
threat: 0.036327604204416275
insult: 0.09445533156394958
identity_hate: 0.010295387357473373


In [45]:
#score the second example and print one sigmoid output per label
_, pred2 = trained_model(
  embedding_text2["input_ids"],
  embedding_text2["attention_mask"],
)
pred2 = pred2.flatten().numpy()

for class_name, score in zip(classes, pred2):
  print(f"{class_name}: {score}")

toxic: 0.011011285707354546
severe_toxic: 0.001138195744715631
obscene: 0.003250479232519865
threat: 0.0007463436340913177
insult: 0.0028311400674283504
identity_hate: 0.0012092568213120103
