# Simple CNN and RNN

In [7]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator



SEED = 42
np.random.seed(SEED)

In [8]:
!head Tweets.csv

"head" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


# 1 TorchText

In [9]:
import spacy


spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]            

In [10]:
classes={
    'negative':0,
    'neutral':1,
    'positive':2
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

dataset = TabularDataset('Tweets.csv', format='csv', 
                         fields=[(None, None),('label', LABEL), (None, None),(None, None),('text', TEXT)], 
                         skip_header=True)

In [11]:
# TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")
TEXT.build_vocab(dataset, min_freq=5)
len(TEXT.vocab.itos)

2748

In [12]:
TEXT.vocab.itos[:10]

['<unk>',
 '<pad>',
 '<eos>',
 'flight',
 'get',
 'thanks',
 'cancelled',
 'service',
 'help',
 'time']

In [13]:
LABEL.build_vocab(dataset)

In [14]:
train, test = dataset.split(0.7, stratified=True)
train, valid = train.split(0.7, stratified=True)

In [15]:
np.unique([x.label for x in train.examples], return_counts=True)

(array([0, 1, 2]), array([4498, 1518, 1158], dtype=int64))

In [16]:
np.unique([x.label for x in valid.examples], return_counts=True)

(array([0, 1, 2]), array([1927,  651,  496], dtype=int64))

In [17]:
np.unique([x.label for x in test.examples], return_counts=True)

(array([0, 1, 2]), array([2753,  930,  709], dtype=int64))

# 3 Convolutional NN for text classification

Formal definition of convolution of functions $f$ and $g$
$$ (f∗g)(t)= \int_0^{\infty} f(\tau)g(t−\tau) d{\tau} $$

![img](http://www.stokastik.in/wp-content/uploads/2016/09/convolution_ilustration.png)

In [18]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        
        self.fc = nn.Linear(hidden_size * len(kernels), 3)
        
    def forward(self, x, x_lengths):
        
        x = self.embedding(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = tt.cat(concatenated, 1)
        x = self.fc(x)
        return x

In [19]:
tt.cuda.empty_cache()

batch_size = 32

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
                kernels=[2,3,4,5]
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
#     sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
criterion = nn.CrossEntropyLoss()

In [22]:
size = train_iterator.shape[0]

AttributeError: 'BucketIterator' object has no attribute 'shape'

In [20]:
import numpy as np

In [None]:
# train ...
#model = MyModel(vocab_size, embed_size, hidden_size, kernels)
n_epochs = 1000
min_loss = 0
early = 0
for epoch in range(n_epochs):  
    loss = 0
    for batch in train_iterator:
        optimizer.zero_grad()
        pred = model.forward(batch.text[0], None)
        criterion(pred, batch.label).backward()
        
        optimizer.step()
        
    for batch in valid_iterator:
        pred_valid = model.forward(batch.text[0], None)
        loss += criterion(pred_valid, batch.label)
    
    print('Epoch [%d/%d], Loss: %.4f'
          %(epoch+1, n_epochs, loss))
        
    # early stopping
    if loss < min_loss:
        min_loss = loss
        early = 0
    else:
        early += 1
    if early > 10:
        break

Epoch [1/1000], Loss: 82.4154
Epoch [2/1000], Loss: 82.3120
Epoch [3/1000], Loss: 74.9462
Epoch [4/1000], Loss: 84.9559
