# 데이터 전처리 부분

In [1]:
import pandas as pd

In [2]:
columns = ['id', 'text', 'label']

train_data = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', sep='\t', names=columns, skiprows=1).dropna()
test_data = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt', sep='\t', names=columns, skiprows=1).dropna()

In [3]:
train_data.to_csv('C:\\Users\\admin\\jupyter\\pytorch\\Sentiment_analysis\\train_data.csv', index=False)
test_data.to_csv('C:\\Users\\admin\\jupyter\\pytorch\\Sentiment_analysis\\test_data.csv', index=False)

In [4]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np
from eunjeon import Mecab

mecab = Mecab()

SEED = 1234

torch.manual_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device == 'cuda':
    torch.cuda.manual_seed_all(SEED)

### 토큰길이가 필터보다 작으면 오류가 발생하여 방지하기 위해

In [5]:
FILTER_SIZES = [3,4,5]
def tokenizer(text):
    token = [t for t in mecab.morphs(text)]
    if len(token) < max(FILTER_SIZES):
        for i in range(0, max(FILTER_SIZES) - len(token)):
            token.append('<PAD>')
    return token

- RNN은 [sent_len, batch_size, embedding_dim] 크기의 텐서를 입력으로 받음
- CNN은 [batch_size, sent_len, embedding_dim] 크기의 텐서를 입력으로 받음

In [6]:
# batch_size를 처음인자로 설정하기 위해 batch_first = True설정
TEXT = data.Field(tokenize=tokenizer, batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

In [7]:
fields = {'text':('text',TEXT), 'label':('label',LABEL)}

In [8]:
train_data, test_data = data.TabularDataset.splits(
    path='C:\\Users\\admin\\jupyter\\pytorch\\Sentiment_analysis',
    train='train_data.csv',
    test='test_data.csv',
    format='csv',
    fields=fields)
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [9]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors='fasttext.simple.300d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [10]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device = device,
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True)

# 모델 구현

In [11]:
import torch.nn as nn
import torch.nn.functional as F

In [12]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        # nn.Conv2d(입력 이미지 채널 수, 생성된 채널 수, 커널의 크기, 스트라이드(기본1), 패딩 값(기본0))   
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1,
                                             out_channels=n_filters,
                                             kernel_size=(fs, embedding_dim))
                                   for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # max_pool1d : 여러 입력 평면으로 구성된 입력에 대해 1D최대 풀링 적용
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # cat : 텐서 연결
        cat = self.dropout(torch.cat(pooled, dim=1))
        res = self.fc(cat)
        return res

In [13]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [14]:
model = model.to(device)

In [15]:
inp = next(iter(train_iterator))
model(inp.text)

tensor([[-3.3703e-01],
        [ 7.5480e-01],
        [ 7.4635e-01],
        [ 6.9956e-01],
        [ 1.2982e-01],
        [ 5.2134e-01],
        [-7.9345e-02],
        [ 6.8630e-01],
        [ 3.9320e-03],
        [ 6.3400e-01],
        [ 8.3107e-02],
        [ 2.6099e-01],
        [ 5.0337e-01],
        [ 7.5231e-01],
        [ 9.2446e-01],
        [ 1.0580e+00],
        [ 9.2467e-02],
        [-4.0556e-01],
        [ 5.6146e-01],
        [ 2.3354e-01],
        [ 1.1528e+00],
        [ 9.4831e-02],
        [ 1.3081e+00],
        [ 1.1055e+00],
        [ 2.6806e-01],
        [ 6.4091e-01],
        [ 7.4302e-02],
        [ 1.0585e+00],
        [ 5.9186e-01],
        [-1.3543e-03],
        [ 1.7510e-01],
        [ 1.3231e+00],
        [ 5.9599e-01],
        [-7.2571e-01],
        [-8.1062e-01],
        [-7.1240e-01],
        [ 1.6391e+00],
        [ 7.5234e-01],
        [ 2.1603e-01],
        [ 3.0548e-01],
        [-1.7243e-01],
        [ 8.2147e-01],
        [-7.3995e-01],
        [ 1

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'모델의 파라미터 수는 {count_parameters(model):,} 개 입니다.')

모델의 파라미터 수는 7,861,201 개 입니다.


In [17]:
pretrain_weight = TEXT.vocab.vectors
print(pretrain_weight.shape, model.embedding.weight.data.shape)

torch.Size([25002, 300]) torch.Size([25002, 300])


In [18]:
model.embedding.weight.data.copy_(pretrain_weight)

tensor([[-0.1117, -0.4966,  0.1631,  ..., -1.4447,  0.8402, -0.8668],
        [ 0.1032, -1.6268,  0.5729,  ...,  0.3180, -0.1626, -0.0417],
        [ 0.0569, -0.0520,  0.2733,  ..., -0.0695, -0.1606, -0.0989],
        ...,
        [-1.2905,  0.2440, -0.3436,  ..., -0.2952, -0.0081,  1.0907],
        [-1.8321, -0.9096, -0.9873,  ..., -2.3504, -0.1641, -1.5356],
        [ 0.1700, -0.8185, -0.7213,  ..., -1.5704, -0.5734,  0.7970]],
       device='cuda:0')

In [19]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# 모델 훈련, 검증 함수 부분

In [20]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [21]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [22]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [23]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_loss / len(iterator)

In [24]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_loss / len(iterator)

In [25]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%') 

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.450 | Train Acc: 45.03%
	 Val. Loss: 0.374 |  Val. Acc: 37.44%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.337 | Train Acc: 33.69%
	 Val. Loss: 0.341 |  Val. Acc: 34.14%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.285 | Train Acc: 28.49%
	 Val. Loss: 0.340 |  Val. Acc: 34.02%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.244 | Train Acc: 24.40%
	 Val. Loss: 0.358 |  Val. Acc: 35.85%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 0.207 | Train Acc: 20.65%
	 Val. Loss: 0.380 |  Val. Acc: 37.98%


In [27]:
model.load_state_dict(torch.load('tut4-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.346 | Test Acc: 34.60%


In [28]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%') 

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.243 | Train Acc: 24.32%
	 Val. Loss: 0.366 |  Val. Acc: 36.58%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.207 | Train Acc: 20.73%
	 Val. Loss: 0.387 |  Val. Acc: 38.73%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.175 | Train Acc: 17.46%
	 Val. Loss: 0.427 |  Val. Acc: 42.66%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.152 | Train Acc: 15.15%
	 Val. Loss: 0.490 |  Val. Acc: 49.05%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 0.131 | Train Acc: 13.06%
	 Val. Loss: 0.509 |  Val. Acc: 50.95%


In [29]:
model.load_state_dict(torch.load('tut4-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.372 | Test Acc: 37.17%


In [32]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%') 

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.072 | Train Acc: 7.24%
	 Val. Loss: 0.876 |  Val. Acc: 87.57%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.069 | Train Acc: 6.88%
	 Val. Loss: 0.924 |  Val. Acc: 92.43%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.068 | Train Acc: 6.83%
	 Val. Loss: 0.942 |  Val. Acc: 94.17%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.064 | Train Acc: 6.44%
	 Val. Loss: 0.961 |  Val. Acc: 96.14%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 0.064 | Train Acc: 6.36%
	 Val. Loss: 1.009 |  Val. Acc: 100.87%


In [33]:
model.load_state_dict(torch.load('tut4-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.899 | Test Acc: 89.86%
