# Assignment 6

Develop an RNN model in PyTorch to solve the following problem:  
    
1. Detect sarcasm 
Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit  
Your quality metric = accuracy  
Randomly select 20% of your data for the test set. You can use it only for the final performance estimation.   
 

Remember, you can use GPU resources in Kaggle kernels.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the entries of the "../input" directory to see which data files are available.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the sarcasm CSV into a DataFrame for manual inspection; the modelling
# pipeline further down reads the same file again through TabularDataset.
data = pd.read_csv("../input/sarcastic-comments-on-reddit/train-balanced-sarcasm.csv")

In [None]:
# Preview the first rows to check the column layout.
data.head()

In [None]:
import spacy


# Load spaCy's English pipeline and drop the tagger and NER components;
# the `tokenizer` function below only calls `spacy_en.tokenizer`.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy 3+ — confirm the
# installed version, or switch to a full model name such as 'en_core_web_sm'.
spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(text):
    """Tokenize ``text`` with spaCy and return the lemmas of its alphabetic tokens.

    Tokens whose text is not purely alphabetic (``str.isalpha`` is false) are
    dropped; for every remaining token its ``lemma_`` is returned, in order.
    """
    lemmas = []
    for token in spacy_en.tokenizer(text):
        if not token.text.isalpha():
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
from sklearn.externals import joblib
import nltk
import gensim
import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset



import random

SEED = 42
# Seed every random number generator the notebook relies on, not only NumPy:
# PyTorch draws the initial model weights from its own generator, and
# NOTE(review): torchtext's Dataset.split appears to shuffle with Python's
# `random` state by default — confirm against the installed torchtext version.
random.seed(SEED)
np.random.seed(SEED)
tt.manual_seed(SEED)

In [None]:
# Field definitions for the three columns that are read from the CSV.
# TEXT: the comment, tokenized with the spaCy-based `tokenizer`, lower-cased, English
# stop words removed, an '<eos>' token appended; batches carry sequence lengths.
TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')
            )
LABEL = LabelField(dtype=tt.int64, use_vocab=True)
SCORE = Field(dtype=tt.int64, use_vocab=True)
# CSV columns: label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
# `fields` is matched to the CSV columns by position, so every skipped column needs its own
# (None, None) placeholder. 'score' is the fifth column; binding it in third position would
# silently read the `author` column instead.
dataset = TabularDataset('../input/sarcastic-comments-on-reddit/train-balanced-sarcasm.csv', format='csv', 
                         fields=[('label', LABEL),      # label
                                 ('comment', TEXT),     # comment
                                 (None, None),          # author
                                 (None, None),          # subreddit
                                 ('score', SCORE),      # score
                                 (None, None),          # ups
                                 (None, None),          # downs
                                 (None, None),          # date
                                 (None, None),          # created_utc
                                 (None, None)],         # parent_comment
                         skip_header=True)

In [None]:
# Build the token vocabulary, keeping only tokens that occur at least 5 times,
# and display its size.
# NOTE(review): the vocabulary is built from the full dataset, before the
# train/test split further down — consider building it from the training split only.
TEXT.build_vocab(dataset, min_freq=5)
len(TEXT.vocab.itos)

In [None]:
# Peek at the first ten entries of the index-to-token table.
TEXT.vocab.itos[:10]

In [None]:
# Build the vocabulary for the `label` field from the dataset.
LABEL.build_vocab(dataset)

In [None]:
# Build the vocabulary for the `score` field from the dataset.
SCORE.build_vocab(dataset)

In [None]:
# Hold out 20% of the data as the test set, then split the remaining 80%
# into 70% train / 30% validation. Both splits are stratified.
train, test = dataset.split(0.8, stratified=True)  # 20% for test
train, valid = train.split(0.7, stratified=True)

In [None]:
# Label values and their counts in the training split.
np.unique([x.label for x in train.examples], return_counts=True)

In [None]:
# Label values and their counts in the validation split.
np.unique([x.label for x in valid.examples], return_counts=True)

In [None]:
# Label values and their counts in the test split.
np.unique([x.label for x in test.examples], return_counts=True)

In [None]:
class MyModel(nn.Module):
    """Bidirectional LSTM classifier over token-id sequences.

    The tokens are embedded with a trainable ``nn.Embedding``, run through a
    single-layer bidirectional LSTM, and the final hidden and cell states of
    both directions are concatenated and fed to a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: embedding dimension (LSTM input size).
        hidden_size: LSTM hidden size per direction.
        n_classes: number of output logits. Defaults to 3 for backward
            compatibility. NOTE(review): the sarcasm labels look binary —
            pass ``n_classes=2`` if that is confirmed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, n_classes=3):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)

        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )

        # 2 directions * 2 states (hidden + cell) are concatenated before the classifier.
        self.fc = nn.Linear(hidden_size * 2 * 2, n_classes)

    @staticmethod
    def _flatten_state(state):
        """Reshape an LSTM state from (directions, batch, hidden) to (batch, directions * hidden)."""
        state = state.transpose(0, 1).contiguous()
        return state.view(state.size(0), -1)

    def forward(self, batch):
        """Return logits of shape (batch, n_classes).

        ``batch.comment`` must be a ``(token_ids, lengths)`` pair with
        ``token_ids`` laid out batch-first. ``lengths`` may be ``None``, in
        which case the padded tensor is fed to the LSTM as is; otherwise the
        sequences are packed, which requires them to be sorted by decreasing
        length (the default ``enforce_sorted=True`` of ``pack_padded_sequence``).
        """
        x, x_lengths = batch.comment

        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)

        _, (hidden, cell) = self.rnn(x)

        features = tt.cat([self._flatten_state(hidden), self._flatten_state(cell)], dim=1)
        return self.fc(features)

In [None]:
# Baseline experiment setup: model, data iterators, optimizer, scheduler and loss.
# Release cached CUDA memory left over from earlier cells.
tt.cuda.empty_cache()

batch_size = 32

# Embedding table sized to the TEXT vocabulary.
model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

# One iterator per split, all with the same batch size. Examples are keyed by
# comment length and sorted within each batch; MyModel.forward packs the batch
# with pack_padded_sequence, which by default expects length-sorted input.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.comment),
    sort_within_batch=True,
)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
# Alternative scheduler, kept for reference:
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [None]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    For every batch the gradients are reset, ``criterion(model(batch), batch.label)``
    is back-propagated and the optimizer takes one step. Progress is shown with a
    notebook progress bar labelled with ``curr_epoch``; its postfix displays the
    running mean of the batch losses, which is also the return value.
    """
    model.train()

    progress = tqdm_notebook(iterator, total=len(iterator), desc='epoch %d' % (curr_epoch), leave=True)

    mean_loss = 0
    for step, batch in enumerate(progress):
        optimizer.zero_grad()

        batch_loss = criterion(model(batch), batch.label)
        batch_loss.backward()
        optimizer.step()

        # Incremental mean: after `step + 1` batches the previous mean has weight step / (step + 1).
        weight = step / (step + 1)
        mean_loss = weight * mean_loss + (1 - weight) * batch_loss.item()

        progress.set_postfix(loss='%.5f' % mean_loss)

    return mean_loss

def _test_epoch(model, iterator, criterion):
    """Return the mean of ``criterion(model(batch), batch.label)`` over ``iterator``.

    The model is switched to evaluation mode and no gradients are tracked.
    The mean is taken over batches (``len(iterator)``), not over examples.
    """
    model.eval()
    batch_count = len(iterator)

    total_loss = 0
    with tt.no_grad():
        for batch in iterator:
            total_loss += criterion(model(batch), batch.label).item()

    return total_loss / batch_count


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
             scheduler=None, early_stopping=0):
    """Train ``model`` for up to ``n_epochs`` epochs with optional early stopping.

    Each epoch runs ``_train_epoch`` on ``train_iterator`` and ``_test_epoch``
    on ``valid_iterator`` and prints the validation loss.

    Args:
        model: module to train.
        train_iterator: batches used for optimisation.
        valid_iterator: batches used to compute the validation loss.
        criterion: loss called as ``criterion(model(batch), batch.label)``.
        optimizer: optimizer stepping ``model``'s parameters.
        n_epochs: maximum number of epochs.
        scheduler: optional learning-rate scheduler, stepped once per epoch.
            ``ReduceLROnPlateau`` is stepped with the validation loss; any
            other scheduler is stepped without arguments.
        early_stopping: if > 0, stop once the validation loss has been worse
            than the best value seen so far for this many consecutive epochs.

    Returns:
        pandas.DataFrame with one row per completed epoch and the columns
        ``epoch``, ``train_loss`` and ``valid_loss``.
    """
    best_loss = float('inf')
    es_epochs = 0
    # Rows are collected in a list: DataFrame.append no longer exists in pandas >= 2.0.
    records = []

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        print('validation loss %.5f' % valid_loss)

        records.append({'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss})

        # The scheduler used to be accepted but never advanced.
        if scheduler is not None:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(valid_loss)
            else:
                scheduler.step()

        if early_stopping > 0:
            if valid_loss > best_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                # min() returns the first record with the lowest validation loss.
                best_epoch = min(records, key=lambda rec: rec['valid_loss'])
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            best_loss = min(best_loss, valid_loss)

    return pd.DataFrame(records)

In [None]:
#nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler,
#         n_epochs=10, early_stopping=2)

In [None]:
#for batch in test_iterator:
#    pred = model(batch)
#    pred = tt.softmax(pred, dim=1)
#    pred = tt.argmax(pred, dim=1)
    
def eval(model, iterator, criterion):
    """Score ``model`` on ``iterator`` with a metric of the form ``criterion(y_true, y_pred)``.

    The predicted class of every example is the argmax of the model output
    along dimension 1. Labels and predictions of all batches are gathered and
    the metric is computed once over the whole set, so the result is not
    skewed by batches of different sizes (as a mean of per-batch scores is).

    Note: the name shadows the builtin ``eval``; it is kept because the
    notebook calls it under this name.

    Args:
        model: module called as ``model(batch)``, returning (batch, classes) scores.
        iterator: iterable of batches exposing a ``label`` tensor.
        criterion: metric called with two 1-D NumPy arrays, e.g.
            ``sklearn.metrics.accuracy_score``.

    Returns:
        The metric value as a Python float.

    Raises:
        ValueError: if ``iterator`` yields no batches.
    """
    model.eval()
    y_true, y_pred = [], []
    with tt.no_grad():
        for batch in iterator:
            scores = model(batch)
            # softmax is monotonic, so argmax over raw scores picks the same class.
            # .cpu() keeps this working when the model lives on an accelerator.
            y_pred.append(scores.argmax(dim=1).cpu().numpy())
            y_true.append(batch.label.cpu().numpy())

    if not y_true:
        raise ValueError('eval() received an iterator without batches')

    # float() accepts both NumPy scalars and plain Python floats returned by the metric.
    return float(criterion(np.concatenate(y_true), np.concatenate(y_pred)))

In [None]:
# test
from sklearn.metrics import accuracy_score

# Report accuracy of `model` on the held-out test split.
# NOTE(review): the nn_train call in the preceding cell is commented out, so `model`
# appears to be untrained at this point — confirm before trusting this number.
print('Starting predicting...')
print('test', eval(model, test_iterator, accuracy_score))

## With embeddings

In [None]:
import gensim.models.keyedvectors as word2vec

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Convert the 25-dimensional GloVe Twitter vectors to word2vec format and write
# the result to "gensim_glove_vectors.txt" in the working directory.
glove2word2vec(glove_input_file='../input/glove-twitter/glove.twitter.27B.25d.txt',
               word2vec_output_file="gensim_glove_vectors.txt")

In [None]:
from gensim.models.keyedvectors import KeyedVectors
# The file produced by glove2word2vec is a plain-text word2vec file, so it has to be
# read with binary=False; binary=True makes the loader parse it as the binary format.
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False, encoding='utf-8', unicode_errors='ignore')

In [None]:
#model0 = gensim.models.KeyedVectors.load_word2vec_format("../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin",
#                                                         binary=True, encoding='utf-8', unicode_errors='ignore')

In [None]:
# Build the embedding matrix in TEXT.vocab order: row i holds the GloVe vector of
# TEXT.vocab.itos[i]. The model looks embeddings up by TEXT vocabulary ids, so using the
# raw GloVe matrix (ordered by GloVe's own vocabulary) would pair tokens with unrelated
# vectors. Tokens without a GloVe vector (including special tokens) keep an all-zero row.
embed_matrix = tt.zeros(len(TEXT.vocab.itos), glove_model.vector_size)
for token_id, token in enumerate(TEXT.vocab.itos):
    if token in glove_model:
        # .copy() gives torch a writable array to wrap.
        embed_matrix[token_id] = tt.from_numpy(glove_model[token].copy())

In [None]:
class MyModel2(nn.Module):
    """Bidirectional LSTM classifier on top of frozen pre-trained embeddings.

    Same architecture as ``MyModel`` except that the embedding layer is
    created from a fixed matrix and is not trained.

    Args:
        vocab_size: unused; kept so the constructor matches ``MyModel``. The
            number of embedding rows is taken from the embedding matrix.
        embed_size: embedding dimension; must equal the matrix's second dimension.
        hidden_size: LSTM hidden size per direction.
        embeddings: 2-D float tensor whose row ``i`` is the vector for token id
            ``i``. Defaults to the module-level ``embed_matrix``.
        n_classes: number of output logits (default 3, as before).

    Raises:
        ValueError: if ``embed_size`` does not match the embedding matrix.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, embeddings=None, n_classes=3):
        super(MyModel2, self).__init__()
        if embeddings is None:
            # Backward-compatible default: the notebook-level pre-trained matrix.
            embeddings = embed_matrix
        if embeddings.size(1) != embed_size:
            # Fail here with a clear message instead of inside the LSTM on the first batch.
            raise ValueError('embed_size=%d does not match embedding matrix width %d'
                             % (embed_size, embeddings.size(1)))
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)

        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )

        # 2 directions * 2 states (hidden + cell) are concatenated before the classifier.
        self.fc = nn.Linear(hidden_size * 2 * 2, n_classes)

    @staticmethod
    def _flatten_state(state):
        """Reshape an LSTM state from (directions, batch, hidden) to (batch, directions * hidden)."""
        state = state.transpose(0, 1).contiguous()
        return state.view(state.size(0), -1)

    def forward(self, batch):
        """Return logits of shape (batch, n_classes).

        ``batch.comment`` must be a ``(token_ids, lengths)`` pair with
        ``token_ids`` laid out batch-first. ``lengths`` may be ``None``;
        otherwise the sequences are packed and must be sorted by decreasing
        length (the default ``enforce_sorted=True`` of ``pack_padded_sequence``).
        """
        x, x_lengths = batch.comment

        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)

        _, (hidden, cell) = self.rnn(x)

        features = tt.cat([self._flatten_state(hidden), self._flatten_state(cell)], dim=1)
        return self.fc(features)

In [None]:
# Setup for the pre-trained-embedding experiment: model, iterators, optimizer, scheduler, loss.
# Release cached CUDA memory left over from earlier cells.
tt.cuda.empty_cache()

batch_size = 200

# embed_size=25 corresponds to the 25-dimensional GloVe file converted above.
model2 = MyModel2(len(TEXT.vocab.itos),
                  embed_size=25,
                  hidden_size=128)

# Rebuild the iterators with the new batch size; examples are keyed by comment
# length and sorted within each batch.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.comment),
    sort_within_batch=True,
)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model2.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
# Alternative scheduler, kept for reference:
#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [None]:
#nn_train(model2, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler,
#         n_epochs=2, early_stopping=2)

In [None]:
# test
from sklearn.metrics import accuracy_score

# Report accuracy of `model2` on the held-out test split.
# NOTE(review): the nn_train call for model2 in the preceding cell is commented out —
# confirm the model was trained before trusting this number.
print('Starting predicting...')
print('test', eval(model2, test_iterator, accuracy_score))
# Recorded results:
# 0.678 for model with pretrained word2vec
# ? for model with pretrained glove

## With gradient clipping

In [None]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch, clip_norm=0.25):
    """Train ``model`` for one pass over ``iterator`` with gradient-norm clipping.

    This definition replaces the earlier ``_train_epoch`` of the notebook; the
    only difference is that gradients are clipped before the optimizer step.

    Args:
        model: module to train; called as ``model(batch)``.
        iterator: sized iterable of batches exposing a ``label`` tensor.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(prediction, batch.label)``.
        curr_epoch: epoch number, used only in the progress-bar label.
        clip_norm: maximum total gradient norm. Defaults to 0.25, the value
            that used to be hard-coded; ``None`` disables clipping.

    Returns:
        The mean of the batch losses over the epoch.
    """
    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        if clip_norm is not None:
            # Rescale gradients in place so their total norm does not exceed clip_norm.
            tt.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        optimizer.step()

        curr_loss = loss.item()

        # Incremental mean: after i + 1 batches the previous mean has weight i / (i + 1).
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

In [None]:
# Setup for the gradient-clipping experiment: a fresh MyModel, iterators, optimizer, scheduler, loss.
# Release cached CUDA memory left over from earlier cells.
tt.cuda.empty_cache()

batch_size = 32
# NOTE(review): LR is defined but never passed to the optimizer below, so Adam runs with
# its default learning rate — confirm whether `lr=LR` was intended.
LR = 0.005

# Re-create the model so training starts from new weights.
model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

# Rebuild the iterators; examples are keyed by comment length and sorted within each batch.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.comment),
    sort_within_batch=True,
)

# Adam with AMSGrad enabled, beta2 = 0.98 and eps = 1e-9.
optimizer = optim.Adam(model.parameters(), amsgrad=True, betas=(0.9,0.98), eps=1e-9)
# Alternative scheduler, kept for reference:
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for at most 3 epochs; stop early after 2 consecutive epochs whose validation loss
# is worse than the best one seen so far.
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler,
         n_epochs=3, early_stopping=2)

In [None]:
# test
from sklearn.metrics import accuracy_score

# Report accuracy of the freshly trained `model` on the held-out test split.
print('Starting predicting...')
print('test', eval(model, test_iterator, accuracy_score))
# Recorded results:
# 0.688 for model with gradient clipping
# 0.683 with gradient clipping & amsgrad=True and B2=0.98 and eps=1e-9
# 0.681 with gradient clipping & amsgrad=True and B2=0.98 and eps=1e-9 and adding score column 

Самый лучший результат (0.688) получился с gradient clipping и БЕЗ предобученных векторов, без усложнения Adam (amsgrad=True, изменение β2 на 0.98 и eps=1e-9) и без добавления колонки score.