In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

from sklearn.externals import joblib
import nltk
import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset, Iterator

SEED = 42
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

In [None]:
spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(text):
    return [tok.lemma_ for tok in spacy_en.tokenizer(text)]

In [None]:
dataframe = pd.read_csv('../input/train-balanced-sarcasm.csv')
dataframe = dataframe.dropna()
dataframe = dataframe[['label', 'comment']]
dataframe.to_csv('data.csv', index=False)

In [None]:
classes={'0':0,
         '1':1}

TEXT = Field(include_lengths=True,
             batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

LABEL = LabelField(dtype=tt.int64,
                   use_vocab=True,
                   preprocessing=lambda x: classes[x])

dataset = TabularDataset('data.csv',
                         format='csv', 
                         fields=[('label', LABEL),('comment', TEXT)], 
                         skip_header=True)

In [None]:
TEXT.build_vocab(dataset, min_freq=5, vectors="glove.6B.300d")
LABEL.build_vocab(dataset)

embed_matrix = TEXT.vocab.vectors

In [None]:
train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.9, stratified=True)

In [39]:
class RNNModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, embed_matrix):
        
        super(RNNModel, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embed_matrix, freeze=False)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True
                          )
        
        self.fc = nn.Linear(hidden_size * 2 * 2, 2)
        
        
    def forward(self, batch):
        
        x, x_len = batch.comment
        
        x = x.cuda()
        batch.label = batch.label.cuda()
        
        x = self.embedding(x)

        if x_len is not None:
            x_len = x_len.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        hidden, cell = hidden.transpose(0,1), cell.transpose(0,1)

        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)

        return x

In [46]:
tt.cuda.empty_cache()

batch_size = 600

model = RNNModel(len(TEXT.vocab.itos),
                embed_size=300,
                embed_matrix=embed_matrix,
                hidden_size=600)

model = model.cuda()

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.comment),
    sort_within_batch=True
)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [47]:
def calculate_accuracy(predictions, Y):
    
    values, indices = tt.max(predictions.data, 1)
    acc = (indices == Y).sum().data.cpu().numpy()
    
    return acc / indices.size()[0]


def evaluate_test_accuracy(model, test_iterator, criterion):
    
    model.eval()
    
    epoch_accuracy, n_batches = 0, len(test_iterator)
    
    with tt.no_grad():
        for batch in test_iterator:
            
            pred = model(batch)
            
            accuracy = calculate_accuracy(pred, batch.label)
            epoch_accuracy += accuracy.item()
        
    return epoch_accuracy / n_batches

In [48]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):
    model.train()

    running_loss, epoch_acc = 0, list()

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator,
                             total=n_batches,
                             desc='Current epoch – %d' % (curr_epoch),
                             leave=True)

    for i, batch in enumerate(iterator):
        
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)
        
        accuracy = calculate_accuracy(pred, batch.label)
        epoch_acc.append(accuracy.item())

    return running_loss, np.mean(epoch_acc)

def _test_epoch(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss, epoch_acc = 0, list()
    
    n_batches = len(iterator)
    
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()
            
            accuracy = calculate_accuracy(pred, batch.label)
            epoch_acc.append(accuracy.item())

    return epoch_loss / n_batches, np.mean(epoch_acc)


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, test_iterator, n_epochs=100,
             scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss, train_acc = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss, valid_acc = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('Validation loss – %.5f | Accuracy score – %.5f' % (valid_loss, valid_acc))

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                line = 'Training early stopping!\nBest epoch – %d\nVal – %.5f'
                print(line % (best_epoch['epoch'],
                              best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [49]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, test_iterator, scheduler=scheduler, 
         n_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='Current epoch – 0', max=1213, style=ProgressStyle(description…

Validation loss – 0.54862 | Accuracy score – 0.71653


HBox(children=(IntProgress(value=0, description='Current epoch – 1', max=1213, style=ProgressStyle(description…

Validation loss – 0.54533 | Accuracy score – 0.72184


HBox(children=(IntProgress(value=0, description='Current epoch – 2', max=1213, style=ProgressStyle(description…

Validation loss – 0.55840 | Accuracy score – 0.71908


HBox(children=(IntProgress(value=0, description='Current epoch – 3', max=1213, style=ProgressStyle(description…

Validation loss – 0.60437 | Accuracy score – 0.71022
Training early stopping!
Best epoch – 1
Val – 0.54533


In [None]:
print('Final Test Accuracy – %.5f' % evaluate_test_accuracy(model,
                                                     test_iterator,
                                                     criterion))