In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
from torchtext.data.utils import get_tokenizer
# from nltk import tokenizer
from torch.utils.data import DataLoader,Dataset,TensorDataset
import torchtext
import re
import random
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from torchtext.data import Field,BucketIterator,TabularDataset,LabelField

In [2]:
data=pd.read_csv("../Scripts/NLP_Toxic-comment-classification-challenge/rnn_train.csv")

In [3]:
tokenizer=get_tokenizer("basic_english")

In [4]:
# def spacy_tokenizer(text):
#     return [tok.text for tok in tokenize(text)]

In [5]:
# text=Field(tokenize=tokenizer,lower=True,include_lengths=True,init_token="<sos>",eos_token="<eos>")
# label=LabelField(dtype=torch.float)

In [6]:
TEXT=Field(tokenize=tokenizer,lower=True,include_lengths=True,batch_first=True, fix_length=200,use_vocab=True)
LABEL=LabelField(dtype=torch.float)

In [7]:


class DataFrameDataset(Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [8]:
fields=[("text",TEXT),("label",LABEL)]

In [9]:
training_data=TabularDataset(path="../Scripts/NLP_Toxic-comment-classification-challenge/rnn_train.csv",
                            format="csv",fields=fields,skip_header=True)

print(vars(training_data[4]))

{'text': ['sr', 'cl', 'you', 'bud', 'bin', 'a', 'way', '.', 'like', 'long', 'time', 'like', 'time', 'outa', 'mind', 'long', '.', 'gooood', 'shit', 'but', 'mo', "'", 'latah', 'an', 'dat', '.', 'soon', 'az', 'ah', 'gitz', 'back', 'heah', 'sum', 'a', 'de', 'brotherz', "'", 'n', 'sistahs', 'sez', 'hance', 'dead', 'or', 'alive', '?', 'we', 'are', 'about', 'to', 'send', 'a', 'zion', 'ranger', 'to', 'check', 'on', 'his', 'condition', '.', 'ah', 'sez', 'whoa', 'led', 'me', 'gib', 'him', 'a', 'holler', '.', 'dis', 'me', 'hollerin', 'bro', '.', 'de', 'zion', 'rangerz', 'be', 'dem', 'dat', 'do', 'most', 'a', "'", 'de', 'walkin', 'tru', 'wallz', '.', 'not', 'all', 'but', 'most', '.', 'de', 'wall', 'walkin', 'dude', 'ain', 'going', 'to', 'do', 'r', 'bro', 'no', 'harm', ',', 'he', 'jes', 'be', 'checkin', '.', 'ee', 'ben', 'so', 'a', 'gentle', 'remindah', 'kum', 'fuhst', '.', 'you', 'be', 'readin', 'i', 'd', '.', 'a', 'leedle', 'postin', 'a', 'dis', "'", 'n', 'dat', 'go', 'down', 'good', '.', 'doan',

In [10]:
# train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=valid_df)

In [11]:
train_data,valid_data=training_data.split(split_ratio=0.7,random_state=random.seed(2020))

In [12]:
len(train_data),len(valid_data)

(1400, 600)

In [13]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)


In [15]:
len(TEXT.vocab)

3121

In [16]:
TEXT.vocab.stoi["is"]

10

In [17]:
train_iter,valid_iter=BucketIterator.splits((train_data,valid_data),
                                         sort_key=lambda x:len(x.text),
                                         batch_size=64,
                                         sort_within_batch=True,
                                         device=device)

In [18]:
for i in train_iter:
    print(i.text[0][0])
    break

tensor([ 73,  23,  12, 910, 193,   3,  27,   4,  84,  28,  17,  78,   7,   0,
          0,   0,   2,  33,   4, 441,   7,  23,  41,   3,   4,  37,  29, 223,
         54, 171,   2,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  

In [19]:
TEXT.vocab.itos[5],TEXT.vocab.itos[0],TEXT.vocab.itos[83],TEXT.vocab.itos[31],TEXT.vocab.itos[44]

('the', '<unk>', 'people', "'", 's')

In [20]:
TEXT.vocab.itos[1]

'<pad>'

In [21]:
i=next(iter(train_iter))

In [22]:
b=[TEXT.vocab.itos[x] for x in i.text[0][0]]
b

['scum',
 'of',
 'the',
 'earth',
 'vandal',
 '.',
 'do',
 'not',
 'change',
 'my',
 'properly',
 'sourced',
 'edits',
 'you',
 'stupid',
 'prick',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '

In [23]:
print(TEXT.vocab.freqs.most_common(10))

[('.', 6665), (',', 3687), ('you', 3496), ('the', 3349), ('i', 2954), ('to', 2373), ('a', 1835), ('and', 1758), ('is', 1735), ('of', 1686)]


In [24]:
len(TEXT.vocab)

3121

In [25]:
class LSTM(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,n_layers,output_dim,dropout):
        #Constructor
        super().__init__()          
        
        self.dropout=nn.Dropout(0.3)
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=True, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        text=self.dropout(text)
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=F.relu(self.fc(hidden))

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs
        

    
    

In [26]:
vocab_size=len(TEXT.vocab)
embedding_dim=100
hidden_dim=32
n_layers=2
output_dim=1
dropout=0.2
model=LSTM(vocab_size,embedding_dim,hidden_dim,n_layers,output_dim,dropout)
model

LSTM(
  (dropout): Dropout(p=0.3, inplace=False)
  (embedding): Embedding(3121, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)

In [27]:
#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')
pretrained_embeddings=TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

The model has 371,557 trainable parameters
torch.Size([3121, 100])


In [28]:
optimizer=torch.optim.Adam(model.parameters())
criterion=nn.BCEWithLogitsLoss()

def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
    
model=model.to(device)
criterion=criterion.to(device)

In [None]:

for batch in train_iter:
    text,text_len=batch.text
    label=batch.label
    
    output=model(text,text_len)
    rounded_preds = torch.round(torch.sigmoid(output))
    
    correct = (rounded_preds == label).float()
    acc = correct.sum() / len(correct)
    
    print(output)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze(1)  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze(1)
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '../Scripts/NLP_Toxic-comment-classification-challenge/saved_weights2.pt')
    
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [106]:
path='../Scripts/NLP_Toxic-comment-classification-challenge/saved_weights2.pt'
model.load_state_dict(torch.load(path));
model.eval();

#inference 
import spacy
nlp =spacy.load("en_core_web_sm")

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()      

In [110]:
print(predict(model,"fuck you people . the information of that can be found here"))

0.02778860181570053


# Adding DataLoader

In [100]:
for epoch in range(5):
    epoch_loss=0
    epoch_acc=0
    model.train()
    for batch in train_iter:
        optimizer.zero_grad()
        
        text,text_length=batch.text
        
        output=model(text,text_length)
        
        loss=criterion(output,batch.label)
        
        loss.backward()
        
        optimizer.step()
        
#         acc=((output.argmax(1) == batch.label).float().mean())
        
#         epoch_acc+=acc / len(acc)
        rounded_preds = torch.round(output)
    
        correct = (rounded_preds == batch.label).float() 
        acc = correct.sum() / len(correct)
        
        epoch_loss+=loss / len(train_iter)
        
    print('Epoch : {}, train accuracy : {}, train loss : {},'.format(epoch+1, acc,epoch_loss,))

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch : 1, train accuracy : 40.25, train loss : 0.6873973608016968,
Epoch : 2, train accuracy : 34.5, train loss : 0.6586592197418213,
Epoch : 3, train accuracy : 33.09375, train loss : 0.5151001214981079,
Epoch : 4, train accuracy : 33.0, train loss : 0.4264102876186371,
Epoch : 5, train accuracy : 38.90625, train loss : 0.336434006690979,
