# Prepare

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import WordPunctTokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split, ConcatDataset

from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import BucketIterator

In [2]:
TRAIN_FILE = '../data/processed_train.csv'
TEST_FILE = '../data/processed_test.csv'

train_data = pd.read_csv(TRAIN_FILE)
test_data = pd.read_csv(TEST_FILE)
word_tokenizer = WordPunctTokenizer()

In [3]:
text_field = data.Field(lower=False, tokenize=word_tokenizer.tokenize, batch_first=True)
sentiment_field = data.Field(sequential=False, use_vocab=False)

In [4]:
class CustomDataset(data.Dataset):
    
    def __init__(self, text_field, sentiment_field, pd_data, **kwargs):
        fields = [('id', None), ('sentiment', sentiment_field), ('text', text_field)]
        
        examples = []
        
        def sentimentMapping(sentiment):
            if sentiment == 'negative':
                return 0
            elif sentiment == 'neutral':
                return 1
            elif sentiment == 'positive':
                return 2
            
        for i in range(len(pd_data)):
            e = pd_data.iloc[i]
            examples.append(data.Example.fromlist([e['id'], sentimentMapping(e['sentiment']), e['text']], fields))
            
        
        super(CustomDataset, self).__init__(examples, fields, **kwargs)
        
    @classmethod
    def splits(cls, text_field, sentiment_field, pd_data_1, pd_data_2, **kwargs):
        return (cls(text_field, sentiment_field, pd_data_1),
               cls(text_field, sentiment_field, pd_data_2))

In [5]:
train_dataset, test_dataset = CustomDataset.splits(text_field, sentiment_field, train_data, test_data)

In [6]:
cache = '../data/vector_cache'

if not os.path.exists(cache):
    os.mkdir(cache)
    
glove_vectors = Vectors(name='glove.840B.300d.txt', cache=cache)

In [7]:
def myUniform(w):
    return nn.init.uniform_(w, a=-0.25, b=0.25)

glove_vectors.unk_init = myUniform

In [8]:
text_field.build_vocab(train_dataset, test_dataset, vectors=glove_vectors)

In [9]:
def getDataIter(dataset_1, dataset_2, sort_within_batch):
    
    iter_1 = BucketIterator(dataset_1, 
                                batch_size=batch_size, 
                                sort_key=lambda x: len(x.text), 
                                device=device, 
                                sort=False,
                                sort_within_batch=sort_within_batch, 
                                repeat=False,
                                train=True)

    iter_2 = BucketIterator(dataset_2, 
                                batch_size=batch_size, 
                                sort_key=lambda x: len(x.text), 
                                device=device, 
                                sort=False,
                                sort_within_batch=sort_within_batch, 
                                repeat=False,
                                train=False)
    
    return iter_1, iter_2

# TextCNN

In [119]:
class TextCNN(nn.Module):
    def __init__(self, text_field, kernel_num, kernel_sizes, dropout):
        
        super(TextCNN, self).__init__()
        
        vocab_size = text_field.vocab.vectors.size()[0]
        embed_dim = text_field.vocab.vectors.size()[1]
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # self.embedding.weight.data.copy_(text_field.vocab.vectors)
        
        self.convs = nn.ModuleList(
            nn.Conv1d(
                in_channels=embed_dim,
                out_channels=kernel_num,
                kernel_size=kernel_size
            ) for kernel_size in kernel_sizes

        )
        
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, 3)
        
    def forward(self, text):
        text = self.embedding(text).transpose(1, 2)
        
        texts = [conv(text) for conv in self.convs]
        texts = [F.max_pool1d(t, t.size(2)).squeeze(2) for t in texts]
        
        text = torch.cat(texts, dim=1)
        
        text = self.dropout(text)
        
        text = self.fc(F.sigmoid(text))
        
        return text

In [138]:
lr = 1e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 5
batch_size = 16
kernel_num = 50
kernel_sizes = [2, 2, 2, 2, 3, 3, 3]
dropout = 0.3

device = torch.device("cuda:1")
log_interval = 100

In [139]:
train_iter, test_iter = getDataIter(train_dataset, test_dataset, True)

def find_min_len(title, iterator):
    print(title)
    minLenOfText = 1000
    for i in iterator:
        if len(i.text.size()) == 1:
            minLenOfText = 1
        else:
            minLenOfText = i.text.size()[1] if i.text.size()[1] < minLenOfText else minLenOfText
            
    print('Min length of text: {}'.format(minLenOfText))
    
find_min_len('TRAIN: ', train_iter)
find_min_len('TEST: ', test_iter)

TRAIN: 
Min length of text: 3
TEST: 
Min length of text: 3


In [140]:
def weight_bias_reset(model, use_glove):

    for m in clf_CNN.modules():
        if isinstance(m, nn.Linear):
            mean, std = 0, 0.01 
            nn.init.normal_(m.weight, mean, std)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Conv1d):
            mean, std = 0, 0.01 
            nn.init.normal_(m.weight, mean, std)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Embedding):
            if not use_glove:
                myUniform(m.weight)

In [141]:
clf_CNN = TextCNN(text_field, kernel_num, kernel_sizes, dropout)
clf_CNN.to(device)
weight_bias_reset(clf_CNN, False)
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(clf_CNN.parameters(), lr=lr, weight_decay=weight_decay)

In [142]:
def train(train_iter, model, loss_fn, optimizer):
    
    steps = 0
    
    model.train()
    
    for batch in train_iter:
        text, sentiment = batch.text, batch.sentiment
        
        optimizer.zero_grad()
        outputs = model(text)
        _, predicts = outputs.max(dim=1)
        
        loss = loss_fn(outputs, sentiment)
        loss.backward()
        
        # nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        
        steps += 1
        
        if steps % log_interval == 0:
            corrects = (predicts == sentiment).sum()
            acc = 100.0 * float(corrects) / float(batch.batch_size)
            print('\rTrain: Batch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps,
                                                                                 loss.item(), 
                                                                                 acc,
                                                                                 corrects,
                                                                                 batch.batch_size))

In [143]:
def val(val_iter, model, loss_fn):
    model.eval()
    
    total_loss = 0
    corrects = 0
    
    with torch.no_grad():
        for batch in val_iter:
            text, sentiment = batch.text, batch.sentiment
            
            outputs = model(text)
            _, predicts = outputs.max(dim=1)
            
            loss = loss_fn(outputs, sentiment)
            total_loss += loss.item()
            
            corrects += (predicts == sentiment).sum()
    
    avg_loss = total_loss / len(val_iter)
    acc = 100.0 * float(corrects) / float(len(val_iter.dataset))
    
    return avg_loss, acc

In [144]:
def fit(model, loss_fn, optimizer, num_epochs, cross_val_fold, sort_within_batch):
    
    valset_len = len(train_data) // cross_val_fold
    splitedsets = random_split(train_data, tuple([valset_len] * (cross_val_fold - 1) + [len(train_data) - valset_len * (cross_val_fold - 1)]))
    
    train_accs = []
    train_losses = []
    
    val_accs = []
    val_losses = []
    
    test_accs = []
    test_losses = []
    
    for fid in range(cross_val_fold):
        
        train_indices = []
        for i, x in enumerate(splitedsets):
            if i != fid:
                train_indices.extend(x.indices.numpy())
        
        val_indices = splitedsets[fid].indices
        
        train_dataset, val_dataset = CustomDataset.splits(text_field, sentiment_field, 
                                                         train_data.iloc[train_indices], train_data.iloc[val_indices])
        
        print(f'\nFold:{fid}')
        
        train_iter, val_iter = getDataIter(train_dataset, val_dataset, sort_within_batch)
        
        for epoch in range(num_epochs):
            print(f'\rEpoch: {epoch}')
            
            train(train_iter, model, loss_fn, optimizer)
            train_loss, train_acc = val(train_iter, model, loss_fn)
            train_losses.append(train_loss)
            train_accs.append(train_acc)
            
            # --------------------------------------
            
            val_loss, val_acc = val(val_iter, model, loss_fn)
            val_losses.append(val_loss)
            val_accs.append(val_acc)
            
            print('\rEvaluation: loss: {:.6f} acc: {:.4f}%'.format(val_loss, val_acc))
            
            # --------------------------------------
            
            test_loss, test_acc = val(test_iter, model, loss_fn)
            test_losses.append(test_loss)
            test_accs.append(test_acc)
            
            print('\rTest: loss: {:.6f} acc: {:.4f}%'.format(test_loss, test_acc))
            
    return train_accs, train_losses, val_accs, val_losses, test_accs, test_losses
            
            

In [145]:
def show_curve(ys, title):
    x = np.array(range(len(ys)))
    y = np.array(ys)
    plt.plot(x, y, c='b')
    plt.axis()
    
    plt.title('{} Curve:'.format(title))
    plt.xlabel('Epoch')
    plt.ylabel('{} Value'.format(title))
    plt.show() 

In [146]:
train_accs, train_losses, val_accs, val_losses, test_accs, test_losses = fit(clf_CNN, loss_fn, optimizer, num_epochs, cross_val_fold, True)


Fold:0
Epoch: 0
Train: Batch[100] - loss: 1.167614 acc: 37.5000%(6/16)
Train: Batch[200] - loss: 1.065162 acc: 37.5000%(6/16)
Train: Batch[300] - loss: 0.584525 acc: 81.2500%(13/16)
Train: Batch[400] - loss: 1.017393 acc: 68.7500%(11/16)
Train: Batch[500] - loss: 0.539509 acc: 87.5000%(14/16)
Evaluation: loss: 0.572919 acc: 76.9001%
Test: loss: 0.576430 acc: 77.1516%
Epoch: 1
Train: Batch[100] - loss: 0.501204 acc: 81.2500%(13/16)
Train: Batch[200] - loss: 0.516090 acc: 81.2500%(13/16)
Train: Batch[300] - loss: 0.520114 acc: 75.0000%(12/16)
Train: Batch[400] - loss: 0.311539 acc: 87.5000%(14/16)
Train: Batch[500] - loss: 0.244083 acc: 87.5000%(14/16)
Evaluation: loss: 0.531991 acc: 79.7182%
Test: loss: 0.553216 acc: 78.4153%

Fold:1
Epoch: 0
Train: Batch[100] - loss: 0.352532 acc: 93.7500%(15/16)
Train: Batch[200] - loss: 0.537588 acc: 81.2500%(13/16)
Train: Batch[300] - loss: 0.301353 acc: 87.5000%(14/16)
Train: Batch[400] - loss: 0.194670 acc: 93.7500%(15/16)
Train: Batch[500] - los

KeyboardInterrupt: 

with Glove 0.81 ~ 0.82
```python
lr = 1e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 5
batch_size = 32
kernel_num = 50
kernel_sizes = [2, 2, 2, 2, 3, 3, 3]
dropout = 0.3
```
without GloVe 0.79 ~ 0.795
```python
lr = 1e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 5
batch_size = 16
kernel_num = 50
kernel_sizes = [2, 2, 2, 2, 3, 3, 3]
dropout = 0.3
```

# LSTM

In [377]:
class TextLSTM(nn.Module):
    def __init__(self, text_field, hidden_size, num_layers=1, dropout=0.3, bidirectional=False):
        super(TextLSTM, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        
        vocab_size = text_field.vocab.vectors.size()[0]
        embed_dim = text_field.vocab.vectors.size()[1]
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # self.embedding.weight.data.copy_(text_field.vocab.vectors)
        
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers, bidirectional=bidirectional, dropout=dropout)
        
        self.h2o = nn.Linear(self.num_directions * hidden_size, 3)
        
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, text):
        
        hidden, cell = self.init_hidden(text.shape[0])
        # text: (batch_size, seq_len)
        text = self.embedding(text).transpose(0, 1)
        # text: (seq_len, batch_size, embed_dim)
        outputs, (hidden, cell) = self.lstm(text, (hidden, cell))

        outputs = self.h2o(outputs[text.size(0)-1])
        outputs = self.softmax(outputs)
        
        return outputs
        
        
    def init_hidden(self, batch_size, is_cuda=True):
        if is_cuda:
            cell = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
            hidden = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        else:
            cell = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
            hidden = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
        
        return hidden, cell
        

In [383]:
lr = 2e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 3
batch_size = 16
hidden_size = 200
num_layers = 2
dropout = 0.5
bidirectional = True

device = torch.device("cuda:1")
log_interval = 100

In [384]:
train_iter, test_iter = getDataIter(train_dataset, test_dataset, True)

In [385]:
clf_LSTM = TextLSTM(text_field, hidden_size, num_layers, dropout, bidirectional)
clf_LSTM.to(device)
weight_bias_reset(clf_LSTM, False)
loss_fn = nn.NLLLoss()

optimizer = torch.optim.Adam(clf_LSTM.parameters(), lr=lr, weight_decay=weight_decay)

In [386]:
train_accs, train_losses, val_accs, val_losses, test_accs, test_losses = fit(clf_LSTM, loss_fn, optimizer, num_epochs, cross_val_fold, True)


Fold:0
Epoch: 0
Train: Batch[100] - loss: 0.736431 acc: 75.0000%(12/16)
Train: Batch[200] - loss: 0.512340 acc: 81.2500%(13/16)
Train: Batch[300] - loss: 1.263847 acc: 50.0000%(8/16)
Train: Batch[400] - loss: 0.572216 acc: 87.5000%(14/16)
Evaluation: loss: 0.587934 acc: 78.0994%
Test: loss: 0.638417 acc: 75.6831%
Epoch: 1
Train: Batch[100] - loss: 0.256647 acc: 93.7500%(15/16)
Train: Batch[200] - loss: 0.284155 acc: 100.0000%(16/16)
Train: Batch[300] - loss: 0.725111 acc: 68.7500%(11/16)
Train: Batch[400] - loss: 0.284908 acc: 93.7500%(15/16)
Evaluation: loss: 0.528131 acc: 79.6619%
Test: loss: 0.588631 acc: 77.1516%

Fold:1
Epoch: 0
Train: Batch[100] - loss: 0.651106 acc: 81.2500%(13/16)
Train: Batch[200] - loss: 0.134523 acc: 100.0000%(16/16)
Train: Batch[300] - loss: 0.322426 acc: 87.5000%(14/16)
Train: Batch[400] - loss: 0.319292 acc: 93.7500%(15/16)
Evaluation: loss: 0.310634 acc: 88.6527%
Test: loss: 0.548495 acc: 78.4495%
Epoch: 1
Train: Batch[100] - loss: 0.308281 acc: 93.7500

with GloVe 0.81 ~ 0.82
```python
lr = 1e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 3
batch_size = 32
hidden_size = 200
num_layers = 2
dropout = 0.5
bidirectional = True

device = torch.device("cuda:1")
log_interval = 100
```
without Glove 0.79 ~ 0.80
```python
lr = 2e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 3
batch_size = 16
hidden_size = 200
num_layers = 2
dropout = 0.5
bidirectional = True

device = torch.device("cuda:1")
log_interval = 100
```

# GRU

In [338]:
class TextGRU(nn.Module):
    def __init__(self, text_field, hidden_size, num_layers=1, dropout=0.3, bidirectional=False):
        super(TextGRU, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        
        vocab_size = text_field.vocab.vectors.size()[0]
        embed_dim = text_field.vocab.vectors.size()[1]
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # self.embedding.weight.data.copy_(text_field.vocab.vectors)
        
        self.gru = nn.GRU(embed_dim, hidden_size, num_layers, bidirectional=bidirectional, dropout=dropout)
        
        self.h2o = nn.Linear(self.num_directions * hidden_size, 3)
        
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, text):
        
        hidden = self.init_hidden(text.shape[0])
        # text: (batch_size, seq_len)
        text = self.embedding(text).transpose(0, 1)
        # text: (seq_len, batch_size, embed_dim)
        outputs, hidden = self.gru(text, hidden)

        outputs = self.h2o(outputs[text.size(0)-1])
        outputs = self.softmax(outputs)
        
        return outputs
        
        
    def init_hidden(self, batch_size, is_cuda=True):
        if is_cuda:
            hidden = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        else:
            hidden = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
        
        return hidden
        

In [363]:
lr = 2e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 3
batch_size = 16
hidden_size = 200
num_layers = 2
dropout = 0.5
bidirectional = True

device = torch.device("cuda:1")
log_interval = 100

In [364]:
train_iter, test_iter = getDataIter(train_dataset, test_dataset, True)

In [365]:
clf_GRU = TextGRU(text_field, hidden_size, num_layers, dropout, bidirectional)
clf_GRU.to(device)
weight_bias_reset(clf_GRU, False)
loss_fn = nn.NLLLoss()

optimizer = torch.optim.Adam(clf_GRU.parameters(), lr=lr, weight_decay=weight_decay)

In [366]:
train_accs, train_losses, val_accs, val_losses, test_accs, test_losses = fit(clf_GRU, loss_fn, optimizer, num_epochs, cross_val_fold, True)


Fold:0
Epoch: 0
Train: Batch[100] - loss: 1.014788 acc: 56.2500%(9/16)
Train: Batch[200] - loss: 0.604122 acc: 75.0000%(12/16)
Train: Batch[300] - loss: 0.811116 acc: 50.0000%(8/16)
Train: Batch[400] - loss: 0.550559 acc: 75.0000%(12/16)
Evaluation: loss: 0.548621 acc: 78.7141%
Test: loss: 0.573604 acc: 78.1762%
Epoch: 1
Train: Batch[100] - loss: 0.402460 acc: 81.2500%(13/16)
Train: Batch[200] - loss: 0.213041 acc: 87.5000%(14/16)
Train: Batch[300] - loss: 0.536091 acc: 81.2500%(13/16)
Train: Batch[400] - loss: 0.569178 acc: 87.5000%(14/16)
Evaluation: loss: 0.546827 acc: 79.7643%
Test: loss: 0.599196 acc: 77.6639%

Fold:1
Epoch: 0
Train: Batch[100] - loss: 0.666935 acc: 75.0000%(12/16)
Train: Batch[200] - loss: 0.399074 acc: 87.5000%(14/16)
Train: Batch[300] - loss: 0.659347 acc: 62.5000%(10/16)
Train: Batch[400] - loss: 0.122752 acc: 93.7500%(15/16)
Evaluation: loss: 0.278966 acc: 90.2408%
Test: loss: 0.542294 acc: 79.2691%
Epoch: 1
Train: Batch[100] - loss: 0.207673 acc: 93.7500%(1

KeyboardInterrupt: 

with GloVe 0.82
```python
lr = 1e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 3
batch_size = 32
hidden_size = 200
num_layers = 2
dropout = 0.5
bidirectional = True

device = torch.device("cuda:1")
log_interval = 100
```
without Glove 0.79 ~ 0.80
```python
lr = 2e-3
weight_decay = 0

num_epochs = 2
cross_val_fold = 3
batch_size = 16
hidden_size = 200
num_layers = 2
dropout = 0.5
bidirectional = True

device = torch.device("cuda:1")
log_interval = 100
```