In [None]:
#Imports
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Import data
# Read the raw training CSV (path is relative to the notebook's working
# directory) into a DataFrame. Later cells filter it on the columns 'toxic',
# 'severe_toxic', 'obscene', 'threat', 'insult' and 'identity_hate', and drop
# its 'id' column. The bare name on the last line displays the frame.
toxic_train = pd.read_csv('data/train.csv')
toxic_train

In [None]:
# Clean data
# Keep only the comments that carry at least one toxicity label, then split the
# result into the text column (`toxic_data`) and the label matrix
# (`toxic_classes`).
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Derive new objects instead of rebinding `toxic_train`: the raw frame stays
# intact, so this cell can be re-run at any time without reloading the CSV.
has_any_label = toxic_train[LABEL_COLUMNS].eq(1).any(axis=1)

# errors='ignore' tolerates a frame that has no 'id' column without hiding
# unrelated failures the way a bare `except:` would.
toxic_rows = toxic_train.loc[has_any_label].drop(columns=['id'], errors='ignore')

# With 'id' removed, column 0 is taken as the comment text and the remaining
# columns as the labels (assumes the CSV column order is id, text, labels --
# TODO confirm against the data file).
toxic_array = toxic_rows.to_numpy()
toxic_data, toxic_classes = toxic_array[:, 0], toxic_array[:, 1:]
toxic_data

In [None]:
# Vectorize data
# NOTE: this cell rebinds `toxic_data` from raw text to encoded id arrays, so it
# is meant to run once after the cleaning cell; re-run the cleaning cell first
# if it needs to be executed again.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data):
    """Yield the token list of every text in `data`, one text at a time."""
    for text in data:
        yield tokenizer(text)


# Build the vocabulary from the cleaned comments; lookups of tokens that are
# not in it fall back to the index of "<unk>".
vocab = build_vocab_from_iterator(yield_tokens(toxic_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenize the string `x` and return its list of vocabulary indices."""
    return vocab(tokenizer(x))


# Encode into a fresh object array instead of writing back into `toxic_data`:
# `toxic_data` is a slice (view) of the cleaned array, so in-place assignment
# would also overwrite the text column of that upstream array.
encoded_texts = np.empty(len(toxic_data), dtype=object)
for row, text in enumerate(toxic_data):
    # Explicit int64 keeps the id arrays integer-typed even for an empty token
    # list (NumPy would otherwise default to float64) and independent of the
    # platform's default integer width.
    encoded_texts[row] = np.array(text_pipeline(text), dtype=np.int64)

toxic_data = encoded_texts
toxic_data

In [None]:
from numpy import float64

class CustomDataset(Dataset):
    """Pairs each encoded text with its label row for use with a DataLoader.

    `data` is an indexable collection of integer id sequences and `classes` an
    indexable collection of label rows that support `.astype(...)`.
    """

    def __init__(self, data, classes):
        self.data, self.classes = data, classes

    def __len__(self):
        # One sample per encoded text.
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(token_ids, labels)` as a LongTensor and a FloatTensor."""
        token_ids = torch.LongTensor(self.data[idx])
        # Label rows are cast to float64 first, then converted to a float32
        # tensor.
        label_row = self.classes[idx].astype(float64)
        labels = torch.tensor(label_row).type(torch.FloatTensor)
        return token_ids, labels

In [None]:
class Model(nn.Module):
    """Multi-label text classifier producing six sigmoid scores per input.

    Pipeline: EmbeddingBag -> bidirectional LSTM -> tanh -> two ReLU linear
    layers -> linear + sigmoid.

    NOTE(review): EmbeddingBag reduces each bag of token ids to one vector
    before the LSTM, so the LSTM presumably never sees a per-token sequence --
    confirm this is the intended architecture.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        lstm_hidden = 20
        # The embedding table has one more row than `vocab_size`.
        self.embed_layer = nn.EmbeddingBag(vocab_size + 1, embed_dim)
        self.lstm = nn.LSTM(embed_dim, lstm_hidden, bidirectional=True)
        # Bidirectional LSTM output concatenates both directions: 2 * 20 = 40.
        self.linear_1 = nn.Linear(2 * lstm_hidden, 32)
        self.linear_2 = nn.Linear(32, 16)
        self.linear_3 = nn.Linear(16, 6)
        # Parameter-free activations used in forward().
        self.t = nn.Tanh()
        self.r = nn.ReLU()
        self.s = nn.Sigmoid()

    def forward(self, x):
        """Map a tensor of token ids `x` to sigmoid scores (last dim = 6)."""
        pooled = self.embed_layer(x)
        recurrent, _ = self.lstm(pooled)
        hidden = self.r(self.linear_1(self.t(recurrent)))
        hidden = self.r(self.linear_2(hidden))
        scores = self.s(self.linear_3(hidden))
        return scores


In [None]:
# Hyperparameters
EMBED_DIM = 64
VOCAB_SIZE = len(vocab)
epochs = 15

# Data: one sample per batch (batch_size=1 is the DataLoader default, spelled
# out here), reshuffled on every pass over the loader. Presumably needed
# because the encoded comments differ in length -- TODO confirm.
dataset = CustomDataset(toxic_data, toxic_classes)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Model, loss and optimizer. BCELoss is applied to the model's sigmoid outputs.
model = Model(VOCAB_SIZE, EMBED_DIM)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the model for `epochs` passes over `dataloader`, printing the running
# exact-match accuracy and the most recent batch loss every `log_interval`
# batches.
model.train()
total_acc, total_count = 0, 0
log_interval = 1600
start_time = time.time()

for epoch in range(epochs):
    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        optimizer.step()

        # A sample is correct only when every thresholded output matches its
        # label. Counting matches per sample keeps the numerator consistent
        # with `total_count` (which counts samples) for any batch size; with
        # one sample per batch this equals the previous per-batch check.
        exact_match = (predicted_label > 0.5).float().eq(label).all(dim=-1)
        total_acc += int(exact_match.sum().item())
        total_count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f} | loss {:8.3f}".format(
                    # loss.item() yields a plain Python float for formatting.
                    epoch, idx, len(dataloader), total_acc / total_count, loss.item()
                )
            )
            # Start a new logging window. The counters are not reset at epoch
            # boundaries, so a window may span the end of one epoch and the
            # start of the next.
            total_acc, total_count = 0, 0
            # Timestamp of the current window's start; kept as a notebook-level
            # variable, not currently included in the log line.
            start_time = time.time()




In [90]:
# Persist the trained model object and the vocabulary with torch.save.
# The whole `model` object is saved here, not just its state_dict.
for _artifact, _path in ((model, 'saved_model.pt'), (vocab, 'saved_vocab.pth')):
    torch.save(_artifact, _path)

In [88]:
test = "wagwan broski I love you"


def GetPrediction(text, threshold=0.8):
    """Score `text` with the trained model and list the classes it triggers.

    Args:
        text: Raw input string; it is encoded with `text_pipeline`.
        threshold: A class is reported when its score is strictly greater
            than this value. Defaults to 0.8, the cut-off used previously.

    Returns:
        A tuple `(response, pred)`: `response` is the list of class names
        whose score exceeds `threshold`, and `pred` is the tensor of scores,
        one per entry of `classes` in the same order.
    """
    classes = ['Toxic', 'Severe Toxic', 'Obscene', 'Threat', 'Insult', 'Identity Hate']
    # Add a leading dimension so the model receives a single-sample batch.
    token_ids = torch.LongTensor(text_pipeline(text))[None, :]
    # Inference only: disable autograd so the returned scores carry no
    # gradient history.
    with torch.no_grad():
        pred = model(token_ids)[0]
    flagged = (pred > threshold).nonzero(as_tuple=True)[0].tolist()
    response = [classes[i] for i in flagged]
    return response, pred


GetPrediction(test)

(['Toxic', 'Obscene'],
 tensor([9.8353e-01, 2.5800e-06, 8.9814e-01, 2.2236e-12, 1.3120e-03, 9.0744e-07],
        grad_fn=<SelectBackward0>))

In [93]:
# Inspect the class of the vocabulary object that was saved to
# 'saved_vocab.pth'.
type(vocab)

torchtext.vocab.vocab.Vocab