# RNN을 통해 감성분류기 만들어보기

IMDB 데이터셋을 이용한다.

# IMDB 데이터셋 다운로드 및 Tonken화하기

In [1]:
from torchtext import data

TEXT = data.Field(lower=True, batch_first=True, fix_length=40) 
# 모든 텍스트를 소문자로 만들고 배치 형태로 처리하고 총 텍스트의 길이는 40이다.
LABEL = data.Field(sequential=False)
# 데이터 타입이 Sequential한 형태를 갖도록 한다.

In [2]:
from torchtext import datasets

train, test = datasets.IMDB.splits(TEXT, LABEL) # 현재 프로젝트 폴더에 .data 폴더를 만들고 데이터셋을 그 곳에 다운로드한다.
# 이미 다운로드를 받았다면 다음번에 코드를 실행할 때 다시 다운 받지 않는다.

downloading aclImdb_v1.tar.gz


.data\imdb\aclImdb_v1.tar.gz: 100%|███████████████████████████████████████████████| 84.1M/84.1M [00:45<00:00, 1.86MB/s]


# Vocab 만들기

In [4]:
from torchtext.vocab import GloVe

TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train)

.vector_cache\glove.6B.zip: 862MB [07:31, 1.91MB/s]                                                                    
100%|███████████████████████████████████████████████████████████████████████▊| 399272/400000 [00:46<00:00, 9233.73it/s]

# 배치 처리 만들기

In [5]:
train_iter, test_iter = data.BucketIterator.splits( (train, test), batch_size=32, device=-1, shuffle=True)



In [6]:
batch = next(iter(train_iter))

# RNN 모델 만들기

In [160]:
import torch.nn as nn
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden): # 여기서 hidden은 상태 벡터를 의미함. -> RNN은 각 층에서 input데이터와 상태 벡터. 총 두 개가 입력된다.
        combined = torch.cat((input, hidden.type(torch.cuda.LongTensor)), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden
    
    def initHidden(self):
        return Variable(torch.zeros([1, self.hidden_size]))

In [161]:
n_vocab = len(TEXT.vocab)
n_hidden = 100

In [162]:
len(train_iter.dataset)

25000

In [163]:
from torch import optim

model = RNN(n_vocab, n_hidden, 3)
model = model.cuda()

optimizer = optim.Adam(model.parameters(),lr=1e-3)

# 모델 학습시키기

In [164]:
import torch

def fit(epoch,model,data_loader,phase='training',volatile=False):
    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile=True
        
    running_loss = 0.0
    running_correct = 0
    
    hidden = model.initHidden().cuda()
    
    for batch_idx , batch in enumerate(data_loader):
        text , target = batch.text , batch.label
        if torch.cuda.is_available():
            text,target = text.cuda(), target.cuda()
            
        if phase == 'training':
            optimizer.zero_grad()
        output, hidden = model(text, hidden)
        loss = F.nll_loss(output,target)
        
        running_loss += F.nll_loss(output,target,size_average=False).data
        preds = output.data.max(dim=1,keepdim=True)[1]
        running_correct += preds.eq(target.data.view_as(preds)).cpu().sum()
        if phase == 'training':
            loss.backward()
            optimizer.step()
    
    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * running_correct.item()/len(data_loader.dataset)
    
    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
    return loss,accuracy

In [165]:
train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]

In [166]:
%%time

for epoch in range(1,5):

    epoch_loss, epoch_accuracy = fit(epoch,model,train_iter,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model,test_iter,phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 1. Got 1 and 32 in dimension 0 at C:/w/1/s/tmp_conda_3.8_075429/conda/conda-bld/pytorch_1579852542185/work/aten/src\THC/generic/THCTensorMath.cu:71

In [None]:
# RNN이 학습 안 됐는데.... 이거 추후에 다시 해결해보자...