In [None]:
# Make Google Drive visible to this Colab runtime so later cells can read
# files stored under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive/"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install transformers

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys   
from sklearn.model_selection import train_test_split     
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the full labelled training set from the mounted Drive folder.
TRAIN_CSV_PATH = "/content/drive/MyDrive/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Hold out 5% of the rows for validation. A fixed seed keeps the split
# identical across "Restart & Run All" so results are comparable.
SPLIT_SEED = 42
train_df, val_df = train_test_split(df, test_size=0.05, random_state=SPLIT_SEED)
train_df.shape, val_df.shape

In [None]:
# Peek at the first five training rows (head() defaults to n=5).
train_df.head()

In [None]:
# Names of the target columns; their order defines the order of the
# values in every target vector built from a data row.
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Column-wise totals: the sum of each label column over the training split.
train_df[LABEL_COLUMNS].sum(axis=0)

In [None]:
# Wide-form bar plot: one bar per label column, bar height is the column mean.
# Pass the frame as `data=` explicitly; older seaborn releases interpret the
# first positional argument as `x`, not as the dataset.
ax = sns.barplot(data=train_df[LABEL_COLUMNS])
ax.set(
    title="Label prevalence in the training split",
    xlabel="Label",
    ylabel="Mean label value",
);

In [None]:
# Sum the label columns per row once, then partition the training rows into
# those with at least one positive label and those with none.
label_counts = train_df[LABEL_COLUMNS].sum(axis=1)
train_toxic = train_df[label_counts > 0]
train_clean = train_df[label_counts == 0]

In [None]:
# Rebalance the training set: keep every row with a positive label and
# down-sample the rows without one.
# - min(...) prevents sample() from raising when fewer clean rows exist than
#   requested.
# - random_state makes the chosen subset reproducible across runs.
N_CLEAN_SAMPLES = 15000
CLEAN_SAMPLE_SEED = 42
train_df = pd.concat([
    train_toxic,
    train_clean.sample(
        n=min(N_CLEAN_SAMPLES, len(train_clean)),
        random_state=CLEAN_SAMPLE_SEED,
    ),
])

In [None]:
# Drop the 'id' column from both splits. errors='ignore' keeps this cell
# idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
train_df = train_df.drop(columns='id', errors='ignore')
val_df = val_df.drop(columns='id', errors='ignore')

In [None]:
# Show the first ten validation rows.
val_df.head(n=10)

In [None]:
# Training configuration.
MAX_LEN = 256            # length every tokenized comment is padded/truncated to
TRAIN_BATCH_SIZE = 32    # batch size of the training DataLoader
VALID_BATCH_SIZE = 32    # batch size of the validation DataLoader
EPOCHS = 2               # number of epochs passed to train_model
LEARNING_RATE = 1e-5     # learning rate given to the optimizer

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint.
TOKENIZER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: DataFrame with a ``comment_text`` column and one column per label.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is padded or truncated to.
        label_columns: Names of the target columns, in output order. Defaults
            to the notebook-level ``LABEL_COLUMNS`` when omitted.
    """

    def __init__(self, df, tokenizer, max_len, label_columns=None):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len
        # Resolve the global lazily so an explicit list never touches it.
        self.label_columns = (
            LABEL_COLUMNS if label_columns is None else list(label_columns)
        )

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index: int):
        """Return the encoded comment and float target vector for row ``index``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (the tokenizer's batch dimension is flattened
            away) and a float32 tensor ``targets`` with one entry per label
            column.
        """
        data_row = self.df.iloc[index]
        comment_text = data_row["comment_text"]

        # A row mixing text and numeric columns is an object-dtype Series
        # indexed by column name. Convert it to a float32 array explicitly
        # instead of handing the Series to torch.FloatTensor, which walks it
        # with positional integer keys (deprecated for label-indexed Series).
        target_values = data_row[self.label_columns].to_numpy(dtype="float32")

        inputs = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(target_values, dtype=torch.float)
        }

In [None]:
# Wrap both splits so each item is tokenized on access.
train_dataset = CustomDataset(df=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
valid_dataset = CustomDataset(df=val_df, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Batch iterators over the two datasets. Only the training loader shuffles;
# num_workers=0 loads batches in the main process.
DataLoader = torch.utils.data.DataLoader

train_data_loader = DataLoader(
    dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = DataLoader(
    dataset=valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
class BERTClass(nn.Module):
    """'bert-base-uncased' encoder followed by dropout and a 768 -> 6 linear layer.

    ``forward`` returns the raw output of the linear layer; no sigmoid is
    applied inside the module.
    """

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(in_features=768, out_features=6)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Run the encoder and project its pooled output to six logits."""
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

# Build the classifier and move its parameters to the selected device
# (Module.to returns the module itself, so `model` stays the same object).
model = BERTClass()
model = model.to(device)
model

In [None]:
def loss_fn(outputs, targets):
    """Return the BCE-with-logits loss between raw ``outputs`` and ``targets``."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

# Adam over every model parameter, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Module-level buffers that train_model fills with validation targets and
# sigmoid outputs.
val_targets, val_outputs = [], []

In [None]:
def _unpack_batch(data):
    """Move one DataLoader batch dict onto the notebook-level ``device``."""
    ids = data['input_ids'].to(device, dtype=torch.long)
    mask = data['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.float)
    return ids, mask, token_type_ids, targets


def train_model(n_epochs, training_loader, validation_loader, model, optimizer):
    """Train ``model`` for ``n_epochs`` epochs, validating once after each epoch.

    Each epoch runs a full pass over ``training_loader`` in train mode, then a
    full pass over ``validation_loader`` in eval mode under ``torch.no_grad()``.
    The loss is computed with the notebook-level ``loss_fn`` and tensors are
    moved to the notebook-level ``device``.

    Side effects:
        The notebook-level lists ``val_targets`` and ``val_outputs`` are
        cleared at the start of every validation pass and refilled, so after
        the call they hold the targets and sigmoid outputs of the last epoch
        only, in validation-loader order.

    Args:
        n_epochs: Number of epochs to run.
        training_loader: Iterable of batch dicts used for optimisation.
        validation_loader: Iterable of batch dicts used for evaluation.
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The same ``model`` object, after training.
    """
    for epoch in range(1, n_epochs + 1):
        print("Epoch {}".format(epoch))

        # ---- training pass -------------------------------------------------
        # Re-enter train mode every epoch: the validation pass below switches
        # the model to eval mode, which would otherwise disable dropout for
        # all subsequent training.
        model.train()
        train_loss = 0.0
        for batch_idx, data in enumerate(training_loader):
            ids, mask, token_type_ids, targets = _unpack_batch(data)
            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean of the per-batch losses; it is already an
            # average, so it must not be divided by the loader length again.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print("Trainng end epoch {}".format(epoch))

        # ---- validation pass (once per epoch, after all training batches) ---
        print("Validation Start epoch {}".format(epoch))
        model.eval()
        valid_loss = 0.0
        # Clear in place so the module-level names keep pointing at these lists
        # and never mix predictions from different epochs.
        val_targets.clear()
        val_outputs.clear()
        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids, mask, token_type_ids, targets = _unpack_batch(data)
                outputs = model(ids, mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        print("Validation end epoch {}".format(epoch))

        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))
    return model

In [None]:
# Launch fine-tuning with the configured epoch count and data loaders.
trained_model = train_model(
    n_epochs=EPOCHS,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
)

In [None]:
# Score one hand-written comment with the model and report its top label.
example="You are one of the best person I ever seen in my life that became devastating to my all expectations. Next time if I see your beautiful face I will frighen hell by your death."
encodings = tokenizer.encode_plus(
    example,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    # Nested list with one row per input; there is a single input here.
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    # Index row 0 explicitly so argmax yields a scalar, and look the name up in
    # LABEL_COLUMNS (the order the targets were built in) instead of relying on
    # train_df's column layout, which depends on which earlier cells have run.
    top_label_index = int(np.argmax(final_output[0]))
    print(example)
    print("Emotion",":",LABEL_COLUMNS[top_label_index])