# Assignment 5

Explore embeddings and CNN

Using tweeter dataset ( https://www.kaggle.com/utathya/sentiment-analysis-of-imdb-reviews/data) and already existing splitting on train/test sets, develop a model for sentiment analysis.  
Your quality metric is crossentropy (`sklearn.metrics.log_loss`, `torch.nn.CrossEntropy`)  

Your model should be based on CNN and written in pytorch `torch.nn.Conv1d`.

Explore 3 ways, which of them has better perfomance on test set?:
1. Using pretrained word embeddings with frozen weights. `torch.nn.Embedding.from_pretrained(embed_matrix, frozen=True)`.
You can download any pretrained embeddings you like.
2. Train embeddings yourself, initializing from pretrained weights. `torch.nn.Embedding.from_pretrained(embed_matrix, frozen=False)`
3. Train embeddings yourself with random initialization `torch.nn.init.uniform_(nn.Embeddings.weights)`

Experiment with model arhitecture (as bonus):
1. activation functions RELU, ELU, etc `torch.nn.functional.relu, torch.nn.functional.elu`  
1. Stack layers
1. max or average pooling (`torch.nn.functional.max_pool1d, torch.nn.functional.avg_pool1d`)
1. Dropout


Other requirements:
1. Your training routine should use any learning scheduler (torch.nn.optim.lr_scheduler).  
Plot current learning rate against number of epochs, e.g. `tt.optim.Adam.param_groups[0]['lr']`.  

1. Your training routine should use early stopping on validation dataset
Use hold out validation scheme in your experiments.

1. You should use torchtext for parsing 

**Note**:
This assignment has no baseline. Instead, **validation curves** for explored architectures are expected.  
Validation curve is plot of model test score against model hyperparameter value.  
example (https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html) . 
Though, using `sklearn.model_selection.validation_curve` is not necessary.  






In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator



SEED = 42
np.random.seed(SEED)



In [2]:
import spacy


spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]            

In [3]:
classes={
    'negative':0,
    'neutral':1,
    'positive':2
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

dataset = TabularDataset('Tweets.csv', format='csv', 
                         fields=[(None, None),('label', LABEL), (None, None),(None, None),('text', TEXT)], 
                         skip_header=True)

In [4]:
# TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")
TEXT.build_vocab(dataset, min_freq=5)
len(TEXT.vocab.itos)

2748

In [5]:
TEXT.vocab.itos[:10]

['<unk>',
 '<pad>',
 '<eos>',
 'flight',
 'get',
 'thanks',
 'cancelled',
 'service',
 'help',
 'time']

In [6]:
LABEL.build_vocab(dataset)

In [7]:
train, test = dataset.split(0.7, stratified=True)
train, valid = train.split(0.7, stratified=True)

In [8]:
model0 = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, encoding='utf-8', unicode_errors='ignore')

In [9]:
embed_matrix = tt.FloatTensor(model0.syn0)

  """Entry point for launching an IPython kernel.


In [11]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        # self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding = nn.Embedding.from_pretrained(embed_matrix, freeze=True)
        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        
        self.fc = nn.Linear(hidden_size * len(kernels), 3)
        
    def forward(self, x, x_lengths):
        
        x = self.embedding(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = tt.cat(concatenated, 1)
        x = self.fc(x)
        return x
    
    def predict(self, X):
        xt = tt.from_numpy(X.toarray())
        pred = model.forward(xt)
        pred = tt.softmax(pred, dim=-1)
        pred = pred.detach().numpy()
        predicted_y = np.argmax(pred, axis=1)
        return predicted_y

In [12]:
tt.cuda.empty_cache()

batch_size = 32

model = MyModel(len(TEXT.vocab.itos),
                embed_size=300,
                hidden_size=128,
                kernels=[2,3,4,5]
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
#     sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
criterion = nn.CrossEntropyLoss()

In [13]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    model.eval()
    with tt.no_grad():
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [14]:
len(train_iterator)

225

In [None]:
# train ...
n_epochs = 100
min_loss = 300
early = 0
for epoch in range(n_epochs):  
    loss_t = 0
    loss_v = 0
    for batch in train_iterator:
        optimizer.zero_grad()
        pred = model.forward(batch.text[0], None)
        loss = criterion(pred, batch.label)
        loss.backward()
        
        optimizer.step()
        loss_t += loss.data.detach().item()
        
    loss_t /= len(train_iterator)
        
    # early stopping
    if loss_t < min_loss:
        min_loss = loss_t
        early = 0
    else:
        early += 1
    if early > 10:
        break
        
    for batch in valid_iterator:
        pred_valid = model.forward(batch.text[0], None)
        loss_v += criterion(pred_valid, batch.label)
    
    print('Epoch [%d/%d], Loss: %.4f'
          %(epoch+1, n_epochs, loss_t))
        

Epoch [1/100], Loss: 0.8452
Epoch [2/100], Loss: 0.7838


In [23]:
model.predict([x.text for x in test])

NameError: name 'model' is not defined

In [22]:
tt.nn.CrossEntropy([x.label for x in test.examples], model.predict([x.text for x in test]))

AttributeError: module 'torch.nn' has no attribute 'CrossEntropy'

In [24]:
[x.text for x in test]

[['pm', 'still', 'bag'],
 ['pls',
  'find',
  'bag',
  'need',
  'get',
  'home',
  'frustrating',
  'hope',
  'bag',
  'somewhere',
  'leaving',
  'jfk',
  'luggage'],
 ['ca', 'speak', 'customer', 'service', 'still', 'hold', 'guys'],
 ['looks',
  'like',
  'already',
  'delayed',
  'miss',
  'connecting',
  'flight',
  'philadelphia',
  'un',
  'delay',
  'flights'],
 ['plane',
  'land',
  'identical',
  'worse',
  'conditions',
  'grk',
  'according',
  'metars'],
 ['omg',
  'bag',
  'enough',
  'shenanigans',
  'already',
  'getting',
  'real',
  'old',
  'real',
  'quick'],
 ['rebooked',
  'split',
  'take',
  'different',
  'flights',
  'pilot',
  'ran',
  'hours',
  'frustrating'],
 ['completely',
  'let',
  'tonight',
  'gate',
  'agents',
  'ambivalence',
  'much',
  'sad',
  'amp'],
 ['found',
  'midnight',
  'last',
  'night',
  'would',
  'think',
  'agent',
  'lax',
  'could',
  'relayed',
  'info',
  'bag',
  'flt'],
 ['stunned',
  'bags',
  'hartford',
  'ord',
  'ground'