In [1]:
#Imports
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
#Import data
toxic_train = pd.read_csv('data/train.csv')
toxic_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [3]:
#Clean data
toxic_train_c = toxic_train.loc[(toxic_train['toxic'] == 0) &
                    (toxic_train['severe_toxic'] == 0)&
                    (toxic_train['obscene'] == 0)&
                    (toxic_train['threat'] == 0)&
                    (toxic_train['insult'] == 0)&
                    (toxic_train['identity_hate'] == 0)]

toxic_train_t = toxic_train.loc[(toxic_train['toxic'] == 1) | 
                    (toxic_train['severe_toxic'] == 1)|
                    (toxic_train['obscene'] == 1)|
                    (toxic_train['threat'] == 1)|
                    (toxic_train['insult'] == 1)|
                    (toxic_train['identity_hate'] == 1)]

frames = [toxic_train_c[:30000-toxic_train_t.shape[0]], toxic_train_t]
toxic_train = pd.concat(frames)

try:
    toxic_train = toxic_train.drop(['id'], axis=1)
    toxic_train = toxic_train.to_numpy()
except:
    print('ID column already dropped.')
finally:
    toxic_data, toxic_classes = toxic_train[:,0], toxic_train[:,1:]
toxic_data

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       "Your absurd edits \n\nYour absurd edits on great white shark was total vandalism and was very sexual. All you edit here is fucking bullshit like spam all over this useful encyclopedia so stop all your bullshit. The admins have you everywhere. The only choice for you is to stop this bullshit or else you'll be blocked permanently. User:F

In [4]:
#Vectorize data
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data):
    for text in data:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(toxic_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
text_pipeline = lambda x: vocab(tokenizer(x))

for i, x in np.ndenumerate(toxic_data):
    toxic_data[i] = np.array(text_pipeline(toxic_data[i]))

toxic_data = np.array(toxic_data)
toxic_data

array([array([  887,    77,     3,   144,   159,   268,    30,   890,  4270,
              10073,  1252,   110,   433,    16,    60,  2111,     9,    26,
               8930,     2,    55,  9089,    22,    81,  3761,   194,     6,
               4334,    52,   170,  1285, 15376,  4082,     1,    10,    69,
                 56,     9,    26,   314,     3,   565,    50,     3,    59,
                 42,   192,     6,     9,    76,  3704,   101,     1,  2603,
                  1,  3346,     1,  2492,     1,  1338])                    ,
       array([  177,     9, 11526,     4,    54,  3128,    17,  1923,  6586,
                  6,     9,    76,  4309,  2327,    32,     1,   135,     1,
                 27,    59,    23,   990,  3099,     2,  1496,   898,     2,
               8383,    27,   241,    23])                                  ,
       array([ 238,  293,    2,    6,    9,   76,  137,   21,  265,    7,   99,
               395,    1,   14,    9,   28,   55,   13,   17,  438,   1

In [5]:
class CustomDataset(Dataset):
    def __init__(self, data, classes):
        self.data = data
        self.classes = classes

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.LongTensor((self.data[idx])), torch.tensor(self.classes[idx].astype(np.float64)).type(torch.FloatTensor)

In [6]:
class Model(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embed_layer =  nn.EmbeddingBag(vocab_size+1, embed_dim)
        self.lstm = nn.LSTM(embed_dim, 20, bidirectional=True) # nn.Tanh()
        self.linear_1 = nn.Linear(40, 32) # nn.ReLU()
        self.linear_2 = nn.Linear(32, 16) # nn.ReLU()
        self.linear_3 = nn.Linear(16, 6) # nn.Sigmoid()
        self.t = nn.Tanh()
        self.r = nn.ReLU()
        self.s = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embed_layer(x)
        out, _ = self.lstm(embedded)
        out = self.t(out)
        out = self.r(self.linear_1(out))
        out = self.r(self.linear_2(out))
        out = self.s(self.linear_3(out))
        return out


In [7]:
VOCAB_SIZE = len(vocab)
EMBED_DIM = 64
model = Model(VOCAB_SIZE, EMBED_DIM)
dataset = CustomDataset(toxic_data, toxic_classes)
dataloader = DataLoader(dataset, shuffle=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
epochs = 15

In [8]:
model.train()
total_acc, total_count = 0, 0
log_interval = 3000
start_time = time.time()


for epoch in range(epochs):
    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        l = loss
        loss.backward()
        optimizer.step()
        if torch.all((predicted_label>0.5).float().eq(label)):
            total_acc+=1
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f} | loss {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count, l
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()




| epoch   0 |  3000/30000 batches | accuracy    0.452 | loss    0.895
| epoch   0 |  6000/30000 batches | accuracy    0.528 | loss    0.202
| epoch   0 |  9000/30000 batches | accuracy    0.566 | loss    0.023
| epoch   0 | 12000/30000 batches | accuracy    0.567 | loss    0.058
| epoch   0 | 15000/30000 batches | accuracy    0.593 | loss    0.761
| epoch   0 | 18000/30000 batches | accuracy    0.568 | loss    0.387
| epoch   0 | 21000/30000 batches | accuracy    0.602 | loss    0.011
| epoch   0 | 24000/30000 batches | accuracy    0.617 | loss    0.053
| epoch   0 | 27000/30000 batches | accuracy    0.591 | loss    0.037
| epoch   1 |  3000/30000 batches | accuracy    0.624 | loss    0.015
| epoch   1 |  6000/30000 batches | accuracy    0.622 | loss    0.014
| epoch   1 |  9000/30000 batches | accuracy    0.635 | loss    0.006
| epoch   1 | 12000/30000 batches | accuracy    0.647 | loss    0.002
| epoch   1 | 15000/30000 batches | accuracy    0.653 | loss    0.175
| epoch   1 | 18000/

In [9]:
torch.save(model,'saved_model.pt')
torch.save(vocab, 'saved_vocab.pth')

In [89]:
test = "how can u be this fkn dumb lol?"
def GetPrediction(text):
    response = []
    classes = ['Toxic','Severe Toxic', 'Obscene', 'Threat', 'Insult', 'Identity Hate']
    text = text_pipeline(text)
    text = torch.LongTensor(text)
    text = text[None,:]
    pred = model(text)[0]
    results = (pred>0.9).nonzero()
    for r in results:
        response.append(classes[r[0].item()])
    return response, pred

GetPrediction(test)

(['Toxic'],
 tensor([9.9852e-01, 1.4484e-02, 5.4608e-05, 3.8599e-06, 2.2561e-03, 1.0767e-03],
        grad_fn=<SelectBackward0>))