In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import io
import itertools
import re

import dill as pickle
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.symbols import ORTH

import torch
import torch.nn as nn
from torch.autograd import Variable
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

In [3]:
SEED = 123456

In [4]:
!ls data

sample_submission.csv  train.csv  trn_txt.csv  val_txt.csv
test.csv	       trn.csv	  val.csv


In [5]:
# Look at training data
cmt = pd.read_csv('data/train.csv')
cmt.shape

(95851, 8)

In [6]:
cmt.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [7]:
labels = cmt.columns[2:]

In [8]:
cmt[labels].describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492
std,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
cmt['num_labels'] = cmt[labels].sum(axis=1)
cmt['num_labels'].describe()

count    95851.000000
mean         0.221124
std          0.750533
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          6.000000
Name: num_labels, dtype: float64

In [10]:
# Define tokenizer
# https://github.com/fastai/fastai/blob/dc61200b18f25a42b8e803c5ca7be48509f562ef/fastai/nlp.py#L16-L21
# Matches HTML line-break tags such as <br>, <br/> or < BR />, case-insensitively
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)

def sub_br(x):
    """Return `x` with every HTML <br> tag and every newline replaced by a space."""
    without_tags = re_br.sub(' ', x)
    return without_tags.replace('\n', ' ')

def sub_quote(x):
    """Return `x` with every double-quote character replaced by a space."""
    return x.replace('"', ' ')

# Load the spaCy pipeline registered under the 'en' shortcut; only its tokenizer is used below
my_tok = spacy.load('en')
# Register '<eos>' as a tokenizer special case made of a single token
# (presumably so the marker is not split into '<', 'eos', '>' -- confirm against the spaCy version in use)
my_tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])

def spacy_tok(x):
    """Tokenise `x` with the spaCy tokenizer and return the token strings.

    Line breaks (<br> tags and newlines) and double quotes are replaced by
    spaces before tokenising.
    """
    cleaned = sub_quote(sub_br(x))
    token_texts = []
    for token in my_tok.tokenizer(cleaned):
        token_texts.append(token.text)
    return token_texts



    Only loading the 'en' tokenizer.



In [11]:
# Clean text
cmt['comment_text_cleaned'] = [' '.join(spacy_tok(txt)).strip() for txt in cmt['comment_text']]
cmt[['comment_text', 'comment_text_cleaned']].head()

Unnamed: 0,comment_text,comment_text_cleaned
0,"Nonsense? kiss off, geek. what I said is true...","Nonsense ? kiss off , geek . what I said is ..."
1,"""\n\n Please do not vandalize pages, as you di...","Please do not vandalize pages , as you did wit..."
2,"""\n\n """"Points of interest"""" \n\nI removed the...",Points of interest I removed the point...
3,Asking some his nationality is a Racial offenc...,Asking some his nationality is a Racial offenc...
4,The reader here is not going by my say so for ...,The reader here is not going by my say so for ...


In [12]:
# Split into training and validation set
cmt_trn, cmt_val, cmt_trn_y, cmt_val_y = train_test_split(cmt[['id', 'comment_text_cleaned']], cmt[labels], random_state=SEED)
cmt_trn.shape, cmt_val.shape, cmt_trn_y.shape, cmt_val_y.shape

((71888, 2), (23963, 2), (71888, 6), (23963, 6))

In [13]:
cmt_trn = cmt_trn.join(cmt_trn_y)
cmt_val = cmt_val.join(cmt_val_y)

In [14]:
# Save only the text content
# Written without index or header: these files are read back below by
# data.TabularDataset.splits with a single 'text' field per row
cmt_trn['comment_text_cleaned'].to_csv('data/trn_txt.csv', index=False, header=False)
cmt_val['comment_text_cleaned'].to_csv('data/val_txt.csv', index=False, header=False)

## Create language model

### Create text field and splits

In [15]:
# Create a torchtext field
TEXT = data.Field(lower=True, eos_token='<eos>')

In [16]:
# Create splits
trn, val = data.TabularDataset.splits(
    path='data',
    train='trn_txt.csv',
    validation='val_txt.csv',
    format='csv',
    fields=[('text', TEXT)])

In [17]:
len(trn), len(val)

(71888, 23963)

In [18]:
# Build vocabulary
TEXT.build_vocab(trn, min_freq=10)

In [19]:
# Vocabulary size
vocab_size = len(TEXT.vocab)
vocab_size

16629

In [20]:
# Vocabulary dict
# Most frequent words
TEXT.vocab.itos[:10]

['<unk>', '<pad>', '<eos>', '.', 'the', ',', 'to', 'i', 'of', 'and']

In [21]:
# Least frequent words
TEXT.vocab.itos[-10:]

['zurich', 'zzuuzz', '|b', '~*~', '\x93', '—aco', '•talk•', '☠', '✄', '😉']

In [22]:
# Check out the first 20 tokens of the first comment in the training set
trn.examples[0].text[:20]

['tag',
 'brewcrewer',
 ',',
 'if',
 'you',
 'do',
 'nt',
 'want',
 'to',
 'edit',
 'war',
 'over',
 'something',
 'like',
 'this',
 'why',
 'are',
 'you',
 'reverting',
 '?']

In [23]:
# Check out comment lengths
pd.Series([len(txt.text) for txt in trn.examples]).describe()

count    71888.000000
mean        78.877003
std        119.938922
min          1.000000
25%         20.000000
50%         42.000000
75%         87.000000
max       4948.000000
dtype: float64

### Create iterators

In [24]:
# https://github.com/fastai/fastai/blob/dc61200b18f25a42b8e803c5ca7be48509f562ef/fastai/nlp.py#L139
class LanguageModelIterator():
    """Iterate over a tokenised dataset as (input, target) batches for language modelling.

    The tokens of every example in `ds` are concatenated into one stream,
    converted to vocabulary indices with the dataset's 'text' field and laid
    out as `bs` parallel columns. Each iteration yields a slice of roughly
    `bptt` rows together with the same slice shifted forward by one token.

    Args:
        ds: dataset whose examples expose a `.text` token list and whose
            `fields['text']` provides `numericalize`.
        bs: batch size, i.e. the number of parallel token columns.
        bptt: nominal number of time steps (rows) per batch.

    Raises:
        ValueError: if `ds` holds fewer than `bs` tokens.
    """
    def __init__(self, ds, bs, bptt):
        self.bs = bs
        self.bptt = bptt

        # Concatenate the tokens of every example into one flat list
        toks = list(itertools.chain.from_iterable(ex.text for ex in ds))

        # Convert tokens to vocabulary indices
        fld = ds.fields['text']
        idx = fld.numericalize([toks])

        # Split into `bs` columns; rows are consecutive time steps
        self.data = self.batchify(idx)
        self.n_toks_per_batch = len(self.data)

        # Row index of the next batch's first token
        self.i = 0

        # Number of batches served in the current pass
        self.iter = 0

    def __iter__(self):
        # Restart from the beginning of the data on every new pass
        self.i = 0
        self.iter = 0
        return self

    def __len__(self):
        # Upper bound on the iterations of one epoch; clamped because
        # len() must never return a negative number
        return max(0, self.n_toks_per_batch // self.bptt - 1)

    def __next__(self):
        # Stop when no row has a successor left to serve as target, or when
        # the per-epoch iteration budget is used up
        if self.i >= self.n_toks_per_batch - 1 or self.iter >= len(self):
            raise StopIteration

        # Halve `bptt` 5% of the time
        bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.

        # Add variability to sequence length (never shorter than 5 rows)
        seq_len = max(5, int(np.random.normal(bptt, 5)))

        res = self.get_batch(self.i, seq_len)

        # Update pointers
        self.i += seq_len
        self.iter += 1
        return res

    def batchify(self, toks):
        """Reshape the index tensor `toks` into shape (n_toks_per_batch, bs).

        Trailing tokens that do not fill a complete row are discarded. The
        result is moved to the GPU when CUDA is available and left on the CPU
        otherwise.
        """
        # Use the instance's batch size, not a notebook-level global
        n_toks_per_batch = len(toks) // self.bs
        if n_toks_per_batch == 0:
            raise ValueError('need at least bs={} tokens, got {}'.format(self.bs, len(toks)))

        # Discard the trailing tokens
        toks = toks[:n_toks_per_batch * self.bs]

        # Reshape to (n_toks_per_batch, bs): each column is one contiguous
        # stretch of the token stream
        toks_reshaped = toks.view(self.bs, -1).t().contiguous()
        return toks_reshaped.cuda() if torch.cuda.is_available() else toks_reshaped

    def get_batch(self, i, seq_len):
        """Return `(inputs, targets)` for up to `seq_len` rows starting at row `i`.

        `inputs` is the 2-D slice of rows; `targets` is the same slice shifted
        forward by one row and flattened to 1-D.
        """
        data = self.data
        # Leave one row at the end so every input row has a target row
        seq_len = min(seq_len, self.n_toks_per_batch - i - 1)

        # Offset output by 1 token
        return data[i:i + seq_len], data[i + 1:i + 1 + seq_len].view(-1)

In [25]:
bs = 64
bptt = 30

In [26]:
trn_dl, val_dl = [LanguageModelIterator(ds, bs, bptt) for ds in (trn, val)]

In [27]:
# Number of batches (or iterations)
len(trn_dl), len(val_dl)

(2952, 977)

In [29]:
# Check out a batch
X, y = next(iter(trn_dl))
X, y.view(-1, bs)

(Variable containing:
    261  16221     29  ...    2403      6    700
      0     46     75  ...       6     23    179
      5    470      0  ...       4    630     16
         ...            ⋱           ...         
     24     38     71  ...     170      6    290
    206   2600   7019  ...       3     23     88
     21     22     15  ...      15    630      5
 [torch.cuda.LongTensor of size 37x64 (GPU 0)], Variable containing:
      0     46     75  ...       6     23    179
      5    470      0  ...       4    630     16
     34    107     27  ...     489      3     14
         ...            ⋱           ...         
    206   2600   7019  ...       3     23     88
     21     22     15  ...      15    630      5
      7   5060      4  ...       4      3      7
 [torch.cuda.LongTensor of size 37x64 (GPU 0)])

### Encoder

In [42]:
# https://github.com/fastai/fastai/blob/5c9ea4c3533a132b45c1ec577c2cbb72ab24d4c0/fastai/lm_rnn.py#L18
def repackage_var(h):
    """Detach hidden state(s) from their autograd history.

    Args:
        h: a single Variable/tensor, or an arbitrarily nested iterable of them
            (e.g. the `(hidden, cell)` pair returned by an LSTM).

    Returns:
        A new Variable wrapping the same data when `h` is a Variable,
        otherwise a tuple with every element repackaged recursively.
    """
    # isinstance rather than an exact type comparison: it also accepts
    # Variable subclasses, and plain tensors on PyTorch versions where
    # Tensor and Variable are the same thing
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_var(v) for v in h)

In [54]:
# Loosely based on https://github.com/fastai/fastai/blob/5c9ea4c3533a132b45c1ec577c2cbb72ab24d4c0/fastai/lm_rnn.py#L23
class RNN_Encoder(nn.Module):
    """Embedding + multi-layer LSTM encoder that carries its hidden state across calls.

    Args:
        bs: batch size used to size the initial hidden state.
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden state size.
        n_layers: number of stacked LSTM layers.
        padding_idx: vocabulary index passed to `nn.Embedding` as `padding_idx`.
        dropout_embedding: dropout probability applied to the embedded input.
        dropout_lstm: dropout value passed to `nn.LSTM`.
    """
    def __init__(self, bs, vocab_size, embedding_size, hidden_size, n_layers, padding_idx,
                 dropout_embedding, dropout_lstm):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)

        # Embedding dropout
        self.embedding_dropout = nn.Dropout(p=dropout_embedding)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout=dropout_lstm)

        self.bs, self.hidden_size, self.n_layers = bs, hidden_size, n_layers

        # Created lazily (or by reset()) so it is allocated next to the
        # parameters after the module has been moved to its final device
        self.hidden = None

    def forward(self, input):
        """Encode `input` and return the LSTM output for every time step.

        The final (hidden, cell) state is stored, detached from the graph, and
        used as the initial state of the next call.
        """
        if self.hidden is None:
            self.reset()
        input_embedded = self.embedding_dropout(self.embedding(input))  # (seq_len, bs, embedding_size)
        output, new_hidden = self.lstm(input_embedded, self.hidden)  # (seq_len, bs, hidden_size)
        # Keep the state values but drop their history so back-propagation
        # does not reach into previous batches
        self.hidden = repackage_var(new_hidden)
        return output

    def reset(self):
        """Set the stored (hidden, cell) state to zeros.

        The zero tensors are created from the first parameter's data via
        `.new(...)`, with shape (n_layers, bs, hidden_size).
        """
        weight = next(self.parameters()).data
        self.hidden = (Variable(weight.new(self.n_layers, self.bs, self.hidden_size).zero_()),
                       Variable(weight.new(self.n_layers, self.bs, self.hidden_size).zero_()))

    def init_hidden(self):
        """Alias of `reset()`, kept for callers that use this name."""
        self.reset()

In [34]:
embedding_size = 200  # width of each token embedding vector
padding_idx = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token
dropout_embedding = 0.02  # dropout probability applied to the embedded input
dropout_lstm = 0.05  # dropout value passed to nn.LSTM
hidden_size = 500  # LSTM hidden state size
n_layers = 3  # number of stacked LSTM layers

In [36]:
# Create encoder instance
rnn_encoder = RNN_Encoder(bs, vocab_size, embedding_size, hidden_size, n_layers, padding_idx, dropout_embedding, dropout_lstm)
rnn_encoder

RNN_Encoder(
  (embedding): Embedding(16629, 200, padding_idx=1)
  (embedding_dropout): Dropout(p=0.02)
  (lstm): LSTM(200, 500, num_layers=3, dropout=0.05)
)

In [46]:
# Test encoder
# Move to the GPU first, then zero the hidden state so it is created on the
# same device as the parameters
rnn_encoder_test = rnn_encoder.cuda()
rnn_encoder_test.reset()  # RNN_Encoder defines reset(); it has no init_hidden()
rnn_encoder_test(X).shape

torch.Size([37, 64, 500])

### Decoder

In [55]:
# https://github.com/fastai/fastai/blob/5c9ea4c3533a132b45c1ec577c2cbb72ab24d4c0/fastai/lm_rnn.py#L131
class RNN_Decoder_LM(nn.Module):
    """Language-model head: one linear projection from `hidden_size` to `vocab_size`."""
    def __init__(self, hidden_size, vocab_size):
        super(RNN_Decoder_LM, self).__init__()

        # Maps features of width `hidden_size` to one output per vocabulary entry
        self.decoder = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, input):
        """Apply the linear layer to the last dimension of `input` and return the result."""
        projected = self.decoder(input)
        return projected

In [56]:
# Create Decoder instance
rnn_decoder_lm = RNN_Decoder_LM(hidden_size, vocab_size)
rnn_decoder_lm

RNN_Decoder_LM(
  (decoder): Linear(in_features=500, out_features=16629)
)

In [62]:
# Test decoder
rnn_decoder_lm_test = rnn_decoder_lm.cuda()
rnn_decoder_lm_test(rnn_encoder_test(X)).shape

torch.Size([37, 64, 16629])