Develop RNN model in pytorch to solve the following problem:

Detect sarcasm Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit Your quality metric = accuracy Randomly select 20% of your data for test set. You can use it only for final perfomance estimation. Remember, you can use GPU resourses in kaggle kernels.

In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset

from string import punctuation
from nltk.corpus import stopwords
import warnings
from tqdm import tqdm_notebook

warnings.filterwarnings('ignore')

SEED = 42
np.random.seed(SEED)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Данные

Сначала посмотрим на данные:

In [5]:
df = pd.read_csv("/Users/Stoneberry/Desktop/Uni/Прога/4 курс/Комп/train-balanced-sarcasm.csv", sep=',')

In [6]:
df.label.value_counts()

1    505413
0    505413
Name: label, dtype: int64

In [7]:
df = df.dropna(subset=['comment'])

In [8]:
df.label.value_counts()

0    505405
1    505368
Name: label, dtype: int64

In [9]:
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


Есть теория, что parent_comment и subreddit тоже могут влиять на качество классификации, поэтому создадим дополнительную колонку, в которую соединим 3 столбца

In [None]:
c = []

for i in df.values:
    item = i[1].split()
    item += i[9].split()
    item += i[3].split()
    item = ' '.join(item)
    c.append(item)

df['text'] = c
df.to_csv('train-balanced-sarcasm_demo3.csv', index=False)

## Основные функции

In [4]:
stops = set(stopwords.words('english'))
punct = punctuation+'«»—…“”*№–'

import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')


def tokenizer1(text):
    return [tok.lemma_ for tok in spacy_en.tokenizer(text)]
    #words = [word for word in text.lower().split()]
    #return words

def tokenizer2(text):
    words = [word.strip(punct) for word in text.lower().split()]
    return words

ModuleNotFoundError: No module named 'spacy'

In [4]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, w2v=False, drop=False, mod='text'):
        super(MyModel, self).__init__()
        
        if w2v is True:
            global weights 
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size)
           # tt.nn.init.uniform_(self.embedding.weight)
        
        self.mod = mod
        
        if drop is True:
            self.drop_en = nn.Dropout(p=0.6)
        else:
            self.drop_en = False
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        self.fc = nn.Linear(hidden_size * 2 *2, 2)
        
    def forward(self, batch):
        
        if self.mod == 'text':
            x, x_lengths = batch.text
        if self.mod == 'comment':
            x, x_lengths = batch.comment
        if self.mod == 'parent_comment':
            x, x_lengths = batch.parent_comment
        
        x = self.embedding(x)
        if self.drop_en is not False:
            x = self.drop_en(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [5]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0, cri=False):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)
        
        if cri is True:
            scheduler.step(valid_loss)

In [6]:
def predict(batch, model, proba=True):
    
    global TEXT
    prediction = tt.softmax(model.forward(batch), dim=-1)
    prediction = prediction.detach().numpy()
    
    if proba is True: return prediction
    else: return prediction.argmax(axis=1)

In [7]:
def text_formaion(path, tok='tokenizer1', field='comment', max_size=30000):
    
    if tok == 'tokenizer1':
        TEXT = Field(include_lengths=True, batch_first=True, 
                 tokenize=tokenizer1,
                 eos_token='<eos>',
                 lower=True,
                 stop_words=nltk.corpus.stopwords.words('english')
                )
    else:
        TEXT = Field(include_lengths=True, batch_first=True, 
                 tokenize=tokenizer2,
                 eos_token='<eos>',
                 lower=True,
                 stop_words=nltk.corpus.stopwords.words('english')
                )

    LABEL = LabelField(dtype=tt.int64, use_vocab=False)

    dataset = TabularDataset(path, format='csv', 
                         fields=[('label', LABEL), (field, TEXT)], 
                         skip_header=True)
    
    TEXT.build_vocab(dataset, min_freq=5, max_size=max_size)
    print(len(TEXT.vocab.itos))
    LABEL.build_vocab(dataset)
    train, test = dataset.split(0.8, stratified=True)
    train, valid = train.split(0.7, stratified=True)
    
    return TEXT, train, test, valid

In [18]:
def model_train(TEXT, train, valid, test, batch_size=100, w2v=False, drop=False, mod='text', n_epochs=10, embed_size=100, cri=False):
    
    batch_size = batch_size 

    model = MyModel(len(TEXT.vocab.itos),
                embed_size=embed_size,
                hidden_size=128,
                w2v=w2v, drop=drop, mod=mod)

    if mod == 'text':
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, valid, test),
        batch_sizes=(batch_size, batch_size, batch_size), shuffle=True, sort_key=lambda x: len(x.text),
        sort_within_batch=True,)
    if mod == 'comment':
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, valid, test),
        batch_sizes=(batch_size, batch_size, batch_size), shuffle=True, sort_key=lambda x: len(x.comment),
        sort_within_batch=True,)
    if mod == 'parent_comment':
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, valid, test),
        batch_sizes=(batch_size, batch_size, batch_size), shuffle=True, sort_key=lambda x: len(x.parent_comment),
        sort_within_batch=True,)
        

    optimizer = optim.Adam(model.parameters())
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=7)
    criterion = nn.CrossEntropyLoss()
    
    nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=n_epochs, early_stopping=2, cri=cri)
    
    return model, test_iterator

In [9]:
def acc_score(model, test_iterator, proba=False):
    res = []
    t_par = tqdm_notebook(test_iterator, desc='i', leave=True)

    for i in t_par:
        pred = predict(i, model, proba=proba)
        res.append(accuracy_score(np.array(i.label), pred))

    return np.mean(res)

In [11]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin', binary=True)

weights = tt.FloatTensor(w2v_model.vectors)

In [12]:
path = '../input/my-sarcasm/train-balanced-sarcasm2.csv'

## Обучение

Колонка - comment, с удалением пунктуации, первоначальный код

In [15]:
TEXT2, train2, test2, valid2 = text_formaion(path, tok='tokenizer2', field='comment', max_size=30000)
model2, test_iterator2 = model_train(TEXT2, train2, valid2, test2, batch_size=100, w2v=False, drop=False, mod='comment', n_epochs=10, embed_size=100, cri=False)
acc_score(model2, test_iterator2, proba=False)

30003


HBox(children=(IntProgress(value=0, description='epoch 0', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.58251


HBox(children=(IntProgress(value=0, description='epoch 1', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.57621


HBox(children=(IntProgress(value=0, description='epoch 2', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.58867


HBox(children=(IntProgress(value=0, description='epoch 3', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.62901
Early stopping! best epoch: 1 val 0.57621


HBox(children=(IntProgress(value=0, description='i', max=2022, style=ProgressStyle(description_width='initial'…




0.6842496178401223

Если посмотреть, на тексты, можно увидеть, что там знаки препинания могут тоже нести какую-то информацию.

In [13]:
df.label[233], df.comment[233]

(0,
 'For those wondering, those are not my pictures :P. However, I did want to pick epic landscapes given the artist name and the epicness of this mix :D')

In [12]:
df.label[434244], df.comment[434244]

(1, '***ACKCHUALLY*** THE TERM IS LEVICORPUS WHEN TALKING ABOUT BODIES')

In [35]:
df.label[305], df.comment[305]

(1, 'Language, Cap... such a hypocrite')

In [34]:
df.comment[305]

'Language, Cap... such a hypocrite'

In [None]:
df.label[305]

Проверим, улучшится ли результат, если не удалять пунктуацию

Колонка - comment, без удаления пунктуации, первоначальный код

In [16]:
TEXT, train, test, valid = text_formaion(path, tok='tokenizer1', field='comment', max_size=30000)
model, test_iterator = model_train(TEXT, train, valid, test, batch_size=100, w2v=False, drop=False, mod='comment', n_epochs=10, embed_size=100, cri=False)
acc_score(model, test_iterator, proba=False)

30003


HBox(children=(IntProgress(value=0, description='epoch 0', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.56358


HBox(children=(IntProgress(value=0, description='epoch 1', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.55803


HBox(children=(IntProgress(value=0, description='epoch 2', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.56765


HBox(children=(IntProgress(value=0, description='epoch 3', max=5661, style=ProgressStyle(description_width='in…


validation loss 0.59915
Early stopping! best epoch: 1 val 0.55803


HBox(children=(IntProgress(value=0, description='i', max=2022, style=ProgressStyle(description_width='initial'…




0.7018712346012048

#### Удалось побить бейзлайн! Попробуем еще несколько вариантов. 

Так как в Kaggle сессия длится ограниченное время, ноутбуки постоянно вылетали, поэтому разные вещи я считала в нескольких разных кернелах. Если успеет досчитаться, выложу отдельными файлами.

Колонка - comment, без удаления пунктуации, первоначальный код, w2v

In [None]:
model, test_iterator = model_train(TEXT, train, valid, test, batch_size=100, w2v=True, drop=False, mod='comment', n_epochs=10, embed_size=300, cri=False)
acc_score(model, test_iterator, proba=False)

HBox(children=(IntProgress(value=0, description='epoch 0', max=5661, style=ProgressStyle(description_width='in…

Колонка - comment, без удаления пунктуации, измененный код

In [None]:
model, test_iterator = model(TEXT, train, valid, test, batch_size=100, w2v=False, drop=True, mod='comment', n_epochs=10, embed_size=100, cri=False)
acc_score(model, test_iterator, proba=False)

Колонка - comment, без удаления пунктуации, измененный код, в2в

In [None]:
model, test_iterator = model(TEXT, train, valid, test, batch_size=100, w2v=True, drop=True, mod='comment', n_epochs=10, embed_size=300, cri=False)
acc_score(model, test_iterator, proba=False)

Колонка - parent+subr+comment, без удаления пунктуации, первоначальный код

In [None]:
model, test_iterator = model(TEXT, train, valid, test, batch_size=100, w2v=False, drop=False, mod='text', n_epochs=10, embed_size=100, cri=False)
acc_score(model, test_iterator, proba=False)

#### Возможно, также влияет регистр текста

In [40]:
def text_formaion2(path, tok='tokenizer1', field='comment', max_size=30000):
    
    if tok == 'tokenizer1':
        TEXT = Field(include_lengths=True, batch_first=True, 
                 tokenize=tokenizer1,
                 eos_token='<eos>',
                 lower=False,
                 stop_words=nltk.corpus.stopwords.words('english')
                )
    else:
        TEXT = Field(include_lengths=True, batch_first=True, 
                 tokenize=tokenizer2,
                 eos_token='<eos>',
                 lower=False,
                 stop_words=nltk.corpus.stopwords.words('english')
                )

    LABEL = LabelField(dtype=tt.int64, use_vocab=False)

    dataset = TabularDataset(path, format='csv', 
                         fields=[('label', LABEL), (field, TEXT)], 
                         skip_header=True)
    
    TEXT.build_vocab(dataset, min_freq=5, max_size=max_size)
    print(len(TEXT.vocab.itos))
    LABEL.build_vocab(dataset)
    train, test = dataset.split(0.8, stratified=True)
    train, valid = train.split(0.7, stratified=True)
    
    return TEXT, train, test, valid

Колонка - comment, с удалением пунктуации, сохранение капса, первоначальный код

In [39]:
TEXT2, train2, test2, valid2 = text_formaion2(path, tok='tokenizer2', field='comment', max_size=30000)
model2, test_iterator2 = model_train(TEXT2, train2, valid2, test2, batch_size=100, w2v=False, drop=False, mod='comment', n_epochs=10, embed_size=100, cri=False)
acc_score(model2, test_iterator2, proba=False)

NameError: name 'text_formaion' is not defined

In [None]:
Колонка - comment, без удаления пунктуации, сохранение капса, первоначальный код

In [None]:
TEXT, train, test, valid = text_formaion2(path, tok='tokenizer1', field='comment', max_size=30000)
model, test_iterator = model_train(TEXT, train, valid, test, batch_size=100, w2v=False, drop=False, mod='comment', n_epochs=10, embed_size=100, cri=False)
acc_score(model, test_iterator, proba=False)