In [1]:
# importing packages
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torchtext.data import Field, TabularDataset, Iterator
from torchtext import data
from torchtext import datasets
from eunjeon import Mecab
from model_train_test import main

In [2]:
from konlpy.tag import Okt

In [3]:
# using KoNLPy's Okt as the tokenizer
tokenizer = Okt() 

In [4]:
# Field definitions, one per column of the TSV files
# text column: split with the tokenizer's `morphs`, lower-cased, vocabulary-backed
TEXT = Field(
    tokenize=tokenizer.morphs,
    sequential=True,
    use_vocab=True,
    batch_first=True,
    lower=True,
)
# label column: a single value cast to int at load time; flagged as the target
LABEL = Field(
    is_target=True,
    sequential=False,
    use_vocab=False,
    batch_first=True,
    preprocessing=lambda raw: int(raw),
)
# id column: a single value, no vocabulary, not a target
ID = Field(
    is_target=False,
    sequential=False,
    use_vocab=False,
)

In [5]:
# load the train and test splits from the tab-separated files under ./data;
# the header row is skipped and columns are bound to fields in file order
train_data, test_data = TabularDataset.splits(
    fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
    path='./data',
    train="ratings_train.txt",
    test="ratings_test.txt",
    format='tsv',
    skip_header=True,
)

In [6]:
# build the vocabulary from the training split only (tokens seen fewer than
# twice are left out via min_freq=2)
TEXT.build_vocab(train_data, min_freq=2)
# report dataset sizes and the resulting vocabulary size
print("Train Data: {} / Test Data: {}".format(*map(len, (train_data, test_data))))
print("Vocab Size: {}".format(len(TEXT.vocab)))

Train Data: 150000 / Test Data: 50000
Vocab Size: 47493


In [7]:
# run configuration
BATCH = 64  # mini-batch size
STEP = 10  # number of training steps handed to main() as n_step
# pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [8]:
# defining the data loaders: one torchtext Iterator per split, both yielding
# mini-batches of BATCH examples placed on DEVICE
train_loader = Iterator(train_data, device=DEVICE, batch_size=BATCH)
test_loader = Iterator(test_data, device=DEVICE, batch_size=BATCH)

In [9]:
# building the classifier on top of an LSTM
class SentimentCls(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over token-id sequences.

    The final hidden state of the top LSTM layer (forward and, if enabled,
    backward direction concatenated) is fed to a linear layer. The result is
    flattened to 1-D, so with ``output_size=1`` the model returns one raw
    score (logit) per example.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension (LSTM input size).
        hidden_size: LSTM hidden state size per direction.
        output_size: number of outputs of the final linear layer.
        num_layers: number of stacked LSTM layers.
        batch_first: whether inputs are laid out as (batch, seq) rather than
            (seq, batch).
        bidirec: whether the LSTM is bidirectional.
        dropout: dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size,
                 num_layers=3, batch_first=True, bidirec=True, dropout=0.5):
        super(SentimentCls, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = num_layers
        self.n_direct = 2 if bidirec else 1
        self.embedding_layer = nn.Embedding(vocab_size, embed_size)
        # Honour the `dropout` argument (it used to be hard-coded to 0.5 here).
        # Inter-layer dropout is meaningless for a single layer and makes
        # nn.LSTM emit a warning, so it is disabled in that case.
        self.rnn_layer = nn.LSTM(input_size=embed_size,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 batch_first=batch_first,
                                 bidirectional=bidirec,
                                 dropout=dropout if num_layers > 1 else 0.0)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.n_direct*hidden_size, output_size)

    def forward(self, x):
        """Return the flattened linear scores for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, (batch, seq) when the model was
                built with ``batch_first=True``, otherwise (seq, batch).

        Returns:
            1-D tensor holding ``batch * output_size`` scores.
        """
        embeded = self.dropout(self.embedding_layer(x))
        # The batch dimension depends on the layout the LSTM was configured with.
        batch_size = x.size(0 if self.rnn_layer.batch_first else 1)
        hidden, cell = self.init_hiddens(batch_size, self.hidden_size, device=x.device)
        _, (hidden, cell) = self.rnn_layer(embeded, (hidden, cell))
        # The last `n_direct` entries of the final hidden state belong to the
        # top layer; concatenate them along the feature dimension.
        last_hidden = torch.cat(hidden[-self.n_direct:].unbind(0), dim=1)
        scores = self.linear(last_hidden)
        return scores.view(-1)

    def init_hiddens(self, batch_size, hidden_size, device):
        """Create zero-filled initial hidden and cell states on ``device``.

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            ``(n_direct * n_layers, batch_size, hidden_size)``.
        """
        shape = (self.n_direct*self.n_layers, batch_size, hidden_size)
        # Allocate directly on the target device instead of CPU followed by .to().
        hidden = torch.zeros(shape, device=device)
        cell = torch.zeros(shape, device=device)
        return hidden, cell

In [10]:
# hyperparameters for the model
vocab_size = len(TEXT.vocab)  # number of entries in the vocabulary
embed_size = 128  # embedding dimension
hidden_size = 256  # LSTM hidden state size
output_size = 1  # a single output score, paired with BCEWithLogitsLoss below
num_layers = 3  # number of stacked LSTM layers
batch_first = True  # inputs have the mini-batch as their first dimension
bidirec = True  # run the LSTM in both directions
dropdout = 0.5  # dropout probability
# build the model and move it to the selected device
model = SentimentCls(vocab_size, embed_size, hidden_size, output_size,
                     num_layers, batch_first, bidirec, dropdout)
model = model.to(DEVICE)
# total number of parameters (element count summed over every tensor)
num_params = sum(weights.numel() for weights in model.parameters())
print("Total number of parameters: {}".format(num_params))

# loss function (binary cross-entropy on raw logits) and optimizer
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=5e-5)

Total number of parameters: 10024065


In [11]:
# train
# `main` is imported from the local `model_train_test` module; its implementation
# is not part of this notebook. Judging from the log printed below, each step
# appears to iterate over train_loader, then evaluate on test_loader, and the
# best-scoring state is presumably written to save_path — TODO confirm against
# model_train_test.main.
# print_step presumably sets how many batches pass between "Train Step" log
# lines — verify in model_train_test.
main(model=model,
     train_loader=train_loader,
     test_loader=test_loader,
     loss_func=loss_function, 
     optimizer=optimizer, 
     n_step=STEP,
     save_path="./movie-review-model.pt",
     print_step=256)

Train Step: 1 (00.00%)  	Loss: 0.6896
Train Step: 1 (10.92%)  	Loss: 0.6487
Train Step: 1 (21.85%)  	Loss: 0.5478
Train Step: 1 (32.77%)  	Loss: 0.5578
Train Step: 1 (43.69%)  	Loss: 0.5641
Train Step: 1 (54.61%)  	Loss: 0.3667
Train Step: 1 (65.54%)  	Loss: 0.3815
Train Step: 1 (76.46%)  	Loss: 0.4570
Train Step: 1 (87.38%)  	Loss: 0.3242
Train Step: 1 (98.30%)  	Loss: 0.3207
Test set: Average loss: 0.3915, Accuracy: 41074/50000 (82.15%)

Train Step: 2 (00.00%)  	Loss: 0.3760
Train Step: 2 (10.92%)  	Loss: 0.2927
Train Step: 2 (21.85%)  	Loss: 0.3731
Train Step: 2 (32.77%)  	Loss: 0.5457
Train Step: 2 (43.69%)  	Loss: 0.3988
Train Step: 2 (54.61%)  	Loss: 0.3633
Train Step: 2 (65.54%)  	Loss: 0.2545
Train Step: 2 (76.46%)  	Loss: 0.4523
Train Step: 2 (87.38%)  	Loss: 0.3722
Train Step: 2 (98.30%)  	Loss: 0.3128
Test set: Average loss: 0.3490, Accuracy: 42478/50000 (84.96%)
discard previous state, best model state saved!

Train Step: 3 (00.00%)  	Loss: 0.3427
Train Step: 3 (10.92%)  	L