In [None]:
import pandas as pd 
import re 
import spacy 
from torchtext.vocab import FastText
from collections import Counter
import torch 
from torch.utils.data import Dataset 

#from model import ToxicClassifier 


# Load the raw training CSV. The absolute Windows path is machine-specific and
# has to be adapted when the code runs anywhere else.
data = pd.read_csv(r'C:\Users\ritth\code\Strive\toxic-detection-challenge\train.csv\train.csv')
# print(data.head(10))
# print(data.columns)
# Discard the 'id' column; nothing below reads it.
data = data.drop(columns='id')
# print(data)

def data_engineering(data):
    """Count the 0 and 1 entries of every label column.

    Every column of ``data`` except ``'comment_text'`` is treated as a label
    column holding 0/1 values.

    Args:
        data: DataFrame with a ``'comment_text'`` column plus label columns.

    Returns:
        A ``(column_name, no, yes)`` tuple of three parallel lists: the label
        column names, the number of rows equal to 0 and the number of rows
        equal to 1 in each of those columns.
    """
    column_name, no, yes = [], [], []
    for column in data.columns:
        if column == 'comment_text':
            continue
        counts = data[column].value_counts()
        column_name.append(column)
        # .get() reports a class that never occurs as 0 instead of raising
        # KeyError (e.g. a label column without a single positive row).
        no.append(int(counts.get(0, 0)))
        yes.append(int(counts.get(1, 0)))
    return column_name, no, yes

# column_name, no, yes = data_engineering(data)

# print(column_name)
# print(no)
# print(yes)

# print('BEFORE')
# print(data['comment_text'][3])

# Ordered (compiled pattern, replacement) pairs applied to every comment.
# The order matters: each substitution sees the output of the previous one.
_COMMENT_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'\n', ' '),
        (r',', ' '),
        (r'\.', ' '),  # escaped: a bare '.' matches every character and blanks the text
        (r"'", ''),
        (r'"', ''),
        (r'! ', ''),
        (r'/', ''),
        (r'-', ' '),
        (r'=', ' '),
        (r'\w*\d\w*\*', ' '),  # digit-containing word followed by a literal '*'
        (r'[^\x00-\x7f]', ' '),  # anything outside 7-bit ASCII
    )
)


def _clean_comment(text):
    """Lower-case and trim ``text``, then apply ``_COMMENT_SUBSTITUTIONS`` in order."""
    text = text.lower().strip()
    for pattern, replacement in _COMMENT_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text.strip()


def preprocessing(data):
    """Normalise the ``'comment_text'`` column of ``data`` in place.

    Each comment is lower-cased and stripped; newlines, commas, dots, hyphens
    and equals signs become spaces; quotes, slashes and the sequence ``'! '``
    are removed; digit-containing words followed by ``*`` and non-ASCII
    characters become spaces; finally the result is stripped again.

    Args:
        data: DataFrame with a string column named ``'comment_text'``.

    Returns:
        The same DataFrame object, with ``'comment_text'`` overwritten.

    NOTE(review): a later definition in this file reuses the name
    ``preprocessing`` for the spaCy tokenizer, so this function is only
    reachable before that definition runs.
    """
    data['comment_text'] = data['comment_text'].apply(_clean_comment)
    return data

# print()
# print('AFTER')
# Clean the comment text of the loaded frame, then print its number of rows.
data = preprocessing(data)
# print(data['comment_text'][3])

print(len(data))

# print(data.head())

def shuffle_data(dataset, random_state=None):
    """Return a row-shuffled copy of ``dataset`` with a fresh 0..n-1 index.

    Args:
        dataset: DataFrame to shuffle.
        random_state: Optional seed (any value ``DataFrame.sample`` accepts)
            that makes the permutation reproducible. ``None`` (the default)
            keeps the unseeded behaviour.

    Returns:
        A new DataFrame holding every row of ``dataset`` in random order.
    """
    shuffled = dataset.sample(frac=1, random_state=random_state)
    # Drop the old index so positional and label-based access agree again.
    return shuffled.reset_index(drop=True)

# Shuffle the rows; no seed is passed, so the row order differs from run to run.
data = shuffle_data(data)
# print(data.head())



def split_data(data):
    """Build one two-column frame (comment text + label) per toxicity label.

    Args:
        data: DataFrame containing ``'comment_text'`` and the six label
            columns named below.

    Returns:
        A 6-tuple of DataFrames in the order toxic, severe_toxic, obscene,
        threat, insult, identity_hate; each one keeps only ``'comment_text'``
        and that single label column.
    """
    labels = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
    return tuple(data[['comment_text', label]] for label in labels)


# data_toxic, data_severe_toxic, data_obscene, data_threat, data_insult, data_identity_hate = split_data(data) 

# print()
# print(data_toxic.head(5))
# print()
# print(data_severe_toxic.head(5))
# print()
# print(data_obscene.head(5))
# print()
# print(data_threat.head(5))
# print()
# print(data_insult.head(5))
# print()
# print(data_identity_hate.head(5))



# Shared spaCy pipeline; preprocessing(sentence) below runs every comment through it.
nlp = spacy.load("en_core_web_sm")

def preprocessing(sentence):
    """Tokenize ``sentence`` with spaCy and return the lemmas worth keeping.

    Punctuation tokens and stop words are skipped; every remaining token
    contributes its lemma, in document order.

    NOTE(review): this definition reuses the name of the DataFrame-cleaning
    ``preprocessing(data)`` defined earlier in the file and replaces it from
    this point on.
    """
    lemmas = []
    for token in nlp(sentence):
        if token.is_punct or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas




# data_toxic_few['comment_text'] = data_toxic_few['comment_text'].apply(lambda text: preprocessing(text))
# print(data_toxic_few)



def token_encoder(token, vec):
    """Map a single token to its integer index in ``vec``'s vocabulary.

    Args:
        token: Token string to look up.
        vec: Object exposing a ``stoi`` mapping from token string to index.

    Returns:
        ``1`` for the literal ``"<pad>"`` token, ``vec.stoi[token]`` for a
        token present in the vocabulary, and ``0`` for a token missing from it.
    """
    if token == "<pad>":
        return 1
    try:
        return vec.stoi[token]
    except KeyError:
        # Only an out-of-vocabulary token maps to the "unknown" index; any
        # other failure (for instance a wrong ``vec`` object) must surface
        # instead of silently encoding everything as 0.
        return 0

def encoder(tokens, vec):
    """Encode every token of ``tokens`` with ``token_encoder``, keeping the order.

    Returns:
        A list with one integer index per input token.
    """
    indexes = []
    for token in tokens:
        indexes.append(token_encoder(token, vec))
    return indexes


def front_padding(list_of_indexes, max_seq_len, padding_index=0):
    """Left-pad or truncate a list of indexes to exactly ``max_seq_len`` items.

    Shorter lists get ``padding_index`` prepended until they are long enough;
    longer lists keep only their first ``max_seq_len`` entries.

    Returns:
        A new list; ``list_of_indexes`` itself is not modified.
    """
    shortfall = max_seq_len - len(list_of_indexes)
    padding = [padding_index] * shortfall  # empty when nothing is missing
    return (padding + list_of_indexes)[:max_seq_len]


# Module-level FastText "simple" vectors.
# NOTE(review): nothing in the visible code reads this object (TrainData loads
# its own copy in __init__) - confirm it is still needed.
fasttext = FastText("simple")


# Default sequence length used by TrainData when no max_seq_len is passed.
max_seq_length = 50

class TrainData(Dataset):
    """Dataset of fixed-length token-index sequences paired with one label.

    Every comment is tokenized with ``preprocessing``, encoded to vocabulary
    indexes with ``encoder`` and front-padded / truncated to ``max_seq_len``
    with ``front_padding``.

    Args:
        data: DataFrame with a ``'comment_text'`` column and the label column.
        data_target: Name of the label column used as the target.
        max_seq_len: Length every encoded comment is padded or cut to.
    """

    def __init__(self, data, data_target, max_seq_len=max_seq_length):
        self.max_seq_len = max_seq_len

        self.vec = FastText("simple")
        # Row 1 (returned by token_encoder for "<pad>") becomes a vector of -1
        # and row 0 (returned for unknown tokens) becomes all zeros.
        # NOTE(review): whatever embeddings these two rows held before are
        # overwritten, and front_padding is called with its default
        # padding_index (0), so padded positions receive the zero vector
        # rather than the -1 one - confirm both are intended.
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0])
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0])

        self.target = data[data_target]
        self.features = [
            front_padding(encoder(preprocessing(comment), self.vec), max_seq_len)
            for comment in data['comment_text'].tolist()
        ]

    def vectorizer(self, x):
        """Return the row(s) of ``self.vec.vectors`` selected by ``x``.

        A regular method (rather than a lambda stored on the instance) keeps
        the dataset picklable.
        """
        return self.vec.vectors[x]

    def __len__(self):
        """Return the number of encoded comments."""
        return len(self.features)

    def __getitem__(self, i):
        """Return ``(token_indexes, label)`` for the sample at position ``i``."""
        assert len(self.features[i]) == self.max_seq_len
        # .iloc is positional, so the lookup stays aligned with self.features
        # even when ``data`` does not carry a 0..n-1 index.
        return self.features[i], self.target.iloc[i]

In [None]:
from torch import nn
import torch.nn.functional as F
# Width of one token embedding; ToxicClassifier's input layer takes
# max_seq_len * emb_dim features.
emb_dim = 300

class ToxicClassifier(nn.Module):
    """Feed-forward two-class classifier over a flattened embedding sequence.

    Layers: Linear(max_seq_len * emb_dim -> hidden), two Linear(hidden ->
    hidden) and Linear(hidden -> 2). ReLU follows the first three layers,
    dropout (p=0.2) follows the two hidden layers, and a softmax over
    dimension 1 produces the final output.

    NOTE(review): ``forward`` returns probabilities, not logits. A loss such
    as ``nn.CrossEntropyLoss`` applies log-softmax itself, so pairing the two
    normalises twice - confirm this is intended.

    Args:
        max_seq_len: Number of tokens per input sequence.
        emb_dim: Size of each token embedding.
        hidden: Width of the hidden layers.
    """

    def __init__(self, max_seq_len, emb_dim, hidden=32):
        super().__init__()
        flat_features = max_seq_len * emb_dim
        self.input_layer = nn.Linear(flat_features, hidden)
        self.first_hidden = nn.Linear(hidden, hidden)
        self.second_hidden = nn.Linear(hidden, hidden)
        self.third_hidden = nn.Linear(hidden, 2)
        self.output = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, inputs):
        """Return per-class probabilities for a batch of flattened sequences.

        A size-1 dimension at position 1 of ``inputs`` is squeezed away and
        the tensor is cast to float before it enters the first layer.
        """
        flat = inputs.squeeze(1).float()
        activations = F.relu(self.input_layer(flat))
        for layer in (self.first_hidden, self.second_hidden):
            activations = self.dropout(F.relu(layer(activations)))
        scores = self.third_hidden(activations)
        return self.output(scores)

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression: a notebook cell displays the chosen device; in a plain
# script this line has no effect.
device

In [None]:
import torch 
from torch.utils.data import DataLoader, Dataset 
from data_handler import split_data, TrainData, data 
from model import ToxicClassifier 
from sklearn.metrics import accuracy_score 
import matplotlib.pyplot as plt 


# ---- experiment configuration ------------------------------------------------
length_of_data = 159571
max_seq_length = 64
# NOTE(review): idx (70 % of length_of_data) is computed but never used; the
# split below takes a fixed 100 rows for training - confirm which is intended.
idx = int(0.7 * length_of_data)

# ---- data --------------------------------------------------------------------
columns_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
(
    data_toxic,
    data_severe_toxic,
    data_obscene,
    data_threat,
    data_insult,
    data_identity_hate,
) = split_data(data)

# First 100 rows train, every remaining row tests; both get a fresh 0-based index.
train_data = data_threat.head(100).reset_index(drop=True)
test_data = data_threat.iloc[100:].reset_index(drop=True)

dataset_train = TrainData(
    train_data,
    data_target='threat',
    max_seq_len=max_seq_length,
)
dataset_test = TrainData(
    test_data,
    data_target='threat',
    max_seq_len=max_seq_length,
)


#####################################################################################################################################################
#####################################################################################################################################################


def collation_train(batch, vectorizer=dataset_train.vectorizer):
    """Collate ``(token_indexes, label)`` samples into one training batch.

    Every token index is mapped to a vector with ``vectorizer``; the vectors
    of one sentence are stacked, and the sentences are stacked in turn.

    Returns:
        ``(inputs, target)``: the stacked vectors and a ``LongTensor`` holding
        the labels in batch order.
    """
    embedded_sentences = []
    labels = []
    for sample in batch:
        token_vectors = [vectorizer(token) for token in sample[0]]
        embedded_sentences.append(torch.stack(token_vectors))
        labels.append(sample[1])
    inputs = torch.stack(embedded_sentences)
    target = torch.LongTensor(labels)
    return inputs, target

def collation_test(batch, vectorizer=dataset_test.vectorizer):
    """Collate ``(token_indexes, label)`` samples into one evaluation batch.

    Every token index is mapped to a vector with ``vectorizer``; the vectors
    of one sentence are stacked, and the sentences are stacked in turn.

    Returns:
        ``(inputs, target)``: the stacked vectors and a ``LongTensor`` holding
        the labels in batch order.
    """
    embedded_sentences = []
    labels = []
    for sample in batch:
        token_vectors = [vectorizer(token) for token in sample[0]]
        embedded_sentences.append(torch.stack(token_vectors))
        labels.append(sample[1])
    inputs = torch.stack(embedded_sentences)
    target = torch.LongTensor(labels)
    return inputs, target

# Training batches are reshuffled by the loader; evaluation keeps dataset order.
train_loader = DataLoader(
    dataset_train,
    batch_size=32,
    shuffle=True,
    collate_fn=collation_train,
)
test_loader = DataLoader(
    dataset_test,
    batch_size=32,
    collate_fn=collation_test,
)

#####################################################################################################################################################
#####################################################################################################################################################



from torch import nn
import torch.optim as optim 

# Embedding width; together with max_seq_length it fixes the size of the
# classifier's flattened input.
emb_dim = 300

model = ToxicClassifier(
    max_seq_len=max_seq_length,
    emb_dim=emb_dim,
    hidden=32,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-3)



epochs = 30
all_train_losses, all_test_losses, all_accuracies = [], [], []

# Every batch is moved to `device` below, so the parameters must live there too.
model.to(device)

for e in range(epochs):
    train_losses, test_losses, running_accuracy = 0, 0, 0

    # ---- training pass -------------------------------------------------------
    model.train()
    for sentences_train, labels_train in train_loader:
        # The collate function stacks one vector per token per sentence;
        # flatten everything after the batch dimension for the linear input layer.
        sentences_train = sentences_train.to(device).flatten(start_dim=1)
        labels_train = labels_train.to(device)

        optimizer.zero_grad()
        prediction_train = model(sentences_train)
        # ToxicClassifier as defined in this file ends in a softmax, while
        # CrossEntropyLoss applies log-softmax itself. Feeding it
        # log-probabilities turns it into the plain negative log-likelihood
        # instead of normalising twice. The clamp avoids log(0).
        loss_train = criterion(torch.log(prediction_train.clamp_min(1e-12)), labels_train)
        loss_train.backward()
        optimizer.step()

        train_losses += loss_train.item()

    avg_train_loss = train_losses / len(train_loader)
    all_train_losses.append(avg_train_loss)

    # ---- evaluation pass -----------------------------------------------------
    model.eval()
    with torch.no_grad():
        for sentences_test, labels_test in test_loader:
            sentences_test = sentences_test.to(device).flatten(start_dim=1)
            labels_test = labels_test.to(device)

            prediction_test = model(sentences_test)
            loss_test = criterion(torch.log(prediction_test.clamp_min(1e-12)), labels_test)
            test_losses += loss_test.item()

            prediction_class = torch.argmax(prediction_test, dim=1)
            # scikit-learn works on host memory, so hand it CPU tensors.
            running_accuracy += accuracy_score(labels_test.cpu(), prediction_class.cpu())

    # Both metrics are means over batches, matching the training-loss average.
    avg_test_loss = test_losses / len(test_loader)
    all_test_losses.append(avg_test_loss)

    avg_running_accuracy = running_accuracy / len(test_loader)
    all_accuracies.append(avg_running_accuracy)

    print(f'Epoch  : {e+1:3}/{epochs}    |   Train Loss:  : {avg_train_loss:.8f}     |  Test Loss:  : {avg_test_loss:.8f}  |  Accuracy  :   {avg_running_accuracy:.4f}')

# Persist the CPU copy of the weights together with the hyper-parameters needed
# to rebuild the network. 'max_seq_len' and 'emb_dim' are read from the
# variables the model was built with, so the stored metadata cannot drift from
# the actual architecture.
torch.save(
    {
        "model_state": model.to('cpu').state_dict(),
        'max_seq_len': max_seq_length,
        'emb_dim': emb_dim,
        'hidden1': 32,
        'hidden2': 32,
    },
    'trained_model_THREAT2',
)

# Draw the per-epoch training loss, test loss and accuracy on one shared axis.
for series, label in (
    (all_train_losses, 'Train Loss'),
    (all_test_losses, 'Test Loss'),
    (all_accuracies, 'Accuracy'),
):
    plt.plot(series, label=label)

plt.legend()
plt.show()
