In [1]:
import pandas as pd

In [2]:
# Column names applied to both NSMC splits; skiprows=1 drops the first line of
# each file (presumably its own header row) so these names are used instead.
columns = ['id', 'text', 'label']

# Rows containing any missing value are discarded right after loading.
train_data = pd.read_csv(
    'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt',
    sep='\t',
    names=columns,
    skiprows=1,
).dropna()
test_data = pd.read_csv(
    'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt',
    sep='\t',
    names=columns,
    skiprows=1,
).dropna()

In [3]:
# Persist both cleaned frames as CSV (without the pandas index) so they can be
# re-read from disk by a later cell.
# NOTE(review): the destination is a hard-coded absolute Windows path; the
# directory must already exist on the machine running this notebook.
train_data.to_csv(
    'C:\\Users\\abc\\jupyter\\pytorch\\Sentiment_analysis\\train_data.csv',
    index=False,
)
test_data.to_csv(
    'C:\\Users\\abc\\jupyter\\pytorch\\Sentiment_analysis\\test_data.csv',
    index=False,
)

In [4]:
import torch
from torchtext import data

In [6]:
SEED = 1234

# Seed PyTorch's default random number generator for repeatable runs.
torch.manual_seed(SEED)
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# A torch.device does not compare equal to a plain string, so test its
# `type` attribute; comparing against 'cuda' directly is always False.
if device.type == 'cuda':
    torch.cuda.manual_seed(SEED)

In [9]:
# Morphological analyzer instance; its `morphs` method is passed to the text
# Field as the tokenizer in a later cell.
from eunjeon import Mecab
mecab = Mecab()

In [10]:
# include_lengths=True makes each batch's text a tuple of
# (padded mini-batch, length of every example) instead of only the padded
# mini-batch.
TEXT = data.Field(
    tokenize=mecab.morphs,
    include_lengths=True,
)
# Labels are stored as floats.
LABEL = data.LabelField(dtype=torch.float)

In [11]:
# Maps each CSV column name to (attribute name on the example, Field used to
# process that column). Only 'text' and 'label' are listed; 'id' is left out.
fields = {'text':('text',TEXT), 'label':('label',LABEL)}

In [14]:
# Re-load the CSV files written earlier as torchtext datasets, applying the
# `fields` mapping. From here on `train_data` / `test_data` are torchtext
# datasets and no longer the pandas DataFrames of the same name.
# NOTE(review): hard-coded absolute Windows path; it must match the directory
# the CSV files were written to.
train_data, test_data = data.TabularDataset.splits(
    path ='C:\\Users\\abc\\jupyter\\pytorch\\Sentiment_analysis',
    train = 'train_data.csv',
    test = 'test_data.csv',
    format = 'csv',
    fields = fields)

In [16]:
import random

# Seed Python's RNG and pass its state to split() explicitly. random.seed()
# returns None, so passing its return value as `random_state` hands split()
# None and reproduces only through the seeding side effect.
# NOTE(review): this rebinds `train_data`, so re-running the cell splits the
# already reduced training set again.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [17]:
MAX_VOCAB_SIZE = 25000

# vectors: pre-trained fastText embedding vectors.
# unk_init: tokens without a pre-trained vector are initialised from a normal
# distribution instead of being left at zero.
# NOTE(review): 'fasttext.simple.300d' downloads wiki.simple.vec, which looks
# like the Simple English vector set; presumably most Korean morphemes are
# missing from it and fall back to unk_init. Confirm this is intended.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors='fasttext.simple.300d',
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

.vector_cache\wiki.simple.vec: 293MB [00:30, 9.72MB/s]                                                                 
  0%|                                                                                       | 0/111051 [00:00<?, ?it/s]Skipping token b'111051' with 1-dimensional vector [b'300']; likely a header
100%|███████████████████████████████████████████████████████████████████████| 111051/111051 [00:10<00:00, 10238.91it/s]


In [18]:
# Vocabulary size: MAX_VOCAB_SIZE tokens plus the special <unk> and <pad> entries.
len(TEXT.vocab)

25002

In [19]:
# Label string -> class index mapping built from the training labels.
LABEL.vocab.stoi

defaultdict(None, {'0': 0, '1': 1})

In [20]:
# Token -> index mapping of the text vocabulary.
# NOTE(review): this displays the whole mapping; consider showing only a
# slice to keep the notebook output short.
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x0000023B8E7F2460>>,
            {'<unk>': 0,
             '<pad>': 1,
             '.': 2,
             '이': 3,
             '는': 4,
             '영화': 5,
             '다': 6,
             '고': 7,
             '하': 8,
             '도': 9,
             '의': 10,
             '가': 11,
             '은': 12,
             '에': 13,
             '을': 14,
             '한': 15,
             '보': 16,
             '..': 17,
             '게': 18,
             ',': 19,
             '들': 20,
             '!': 21,
             '지': 22,
             '를': 23,
             '있': 24,
             '없': 25,
             '?': 26,
             '좋': 27,
             '나': 28,
             '었': 29,
             '만': 30,
             '는데': 31,
             '너무': 32,
             '적': 33,
             '봤': 34,
             '안': 35,
             '로': 36,
             '정말': 37,
             '음': 38,
             '으로': 39,
       

In [22]:
# Show the first training example as a dict of its attributes (token list and label).
vars(train_data.examples[0])

{'text': ['야한', '장면', '기다리', '는', '것', '도', '곤욕', '이', '군'], 'label': '0'}

In [23]:
# Print the index of every training example whose token list is empty.
for idx, example in enumerate(train_data.examples):
    if not example.text:
        print(idx)

In [24]:
BATCH_SIZE = 64

# Build one iterator per split; batches are placed on `device`.
# Sorting inside each batch by text length matters because the model packs
# the padded batch with pack_padded_sequence.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
    sort_key = lambda x : len(x.text), # explicit sort key, added to prevent an error
    sort_within_batch = True # sort each batch by length
)

In [27]:
# Peek at one training batch. Because the text Field uses include_lengths=True,
# `.text` is a tuple whose first element is the padded token-index tensor.
next(iter(train_iterator)).text

(tensor([[ 1559,   742,    47,   141,    72,    77,   179,   135,    62,   227,
            136,   766,  6931,  3413,   350,    37,   147,   631,  1390,    62,
             41,   837,  3662,    58,    92,    37,   336,  4693,   385,    81,
             92,   104,    37,   394,  3871,   551,    41,     0,  3395,   636,
            634,    63,  2183, 19671,    98,   163,    37,    43,    27,  1666,
            536,  1649,   613,  1223,  3097,    32,  1074,   164,    79,    68,
            309,   446,  1017,    27],
         [    9,   621,    65,    95,     0,    45,  2567,    13,    78,   104,
              4,    21,   164, 18416,    45,   107,   215,  2011,     3,     3,
              2,    61,    36,    47,    33,   107,    10,   359,  6236,   103,
             33,     3,   215,    12,     8,   797,  1111,    36,    44,   319,
            568,   492,   134, 12453,  2238,    47,    65,    42,    12,    10,
              5,  4503,    43,   437, 16104,   272,     5,    80,     9,    11,
 

In [28]:
import torch.nn as nn

- num_embeddings : 임베딩을 할 단어들의 개수. 다시 말해 단어 집합의 크기입니다.
- embedding_dim : 임베딩 할 벡터의 차원입니다. 사용자가 정해주는 하이퍼파라미터입니다.

In [32]:
# Toy embedding table with 3 entries of 5 dimensions each; index 1 is
# declared as the padding index.
emb = nn.Embedding(num_embeddings=3, embedding_dim=5, padding_idx=1)
# One lookup index per table row.
test = torch.tensor([0, 1, 2])

In [33]:
emb(test) # the vector at padding_idx is all zeros

tensor([[-0.3990,  0.9585, -0.1512,  1.2120, -1.1036],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.1892, -0.3563,  1.2588,  0.5190,  0.8902]],
       grad_fn=<EmbeddingBackward>)

In [34]:
def print_shape(name, data):
    """Print a one-line description of ``data.shape`` labelled with ``name``.

    Args:
        name: Label shown at the start of the message.
        data: Any object exposing a ``shape`` attribute.
    """
    message = f'{name} has shape {data.shape}'
    print(message)

In [40]:
class RNN(nn.Module):
    """LSTM classifier over padded, length-annotated batches of token indices.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output values (logits) per example.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout: Dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_idx: Index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single layer merely triggers a warning.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        # A bidirectional LSTM yields one final state per direction, which
        # are concatenated before the linear layer.
        fc_in_features = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_features, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of ``output_dim`` logits per example in the batch.

        Args:
            text: Token indices laid out as [seq_len, batch] (neither the
                LSTM nor the packing call uses ``batch_first``).
            text_lengths: Unpadded length of every sequence in the batch,
                sorted in decreasing order as ``pack_padded_sequence``
                requires by default.

        Returns:
            Tensor of shape [batch, output_dim].
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence expects tensor lengths to live on the CPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # The last two slices are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]

        return self.fc(self.dropout(final_hidden))

In [38]:
# Model hyperparameters.
# INPUT_DIM is the vocabulary size, i.e. the number of embedding rows.
INPUT_DIM = len(TEXT.vocab)
# Must equal the width of the pre-trained vectors copied into the embedding layer.
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
# A single output value per example (binary sentiment).
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the padding token in the text vocabulary.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [41]:
# Instantiate the classifier from the hyperparameters defined above.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [44]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the number of trainable parameters with thousands separators.
print(f'이 모델은 {count_parameters(model):,}개의 파라미터를 가지고 있다.')

이 모델은 10,220,857개의 파라미터를 가지고 있다.


In [45]:
# Vector matrix attached to the text vocabulary by build_vocab: one row per
# vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 300])


In [46]:
# Sanity check: the embedding layer must have the same shape as the pre-trained matrix.
model.embedding.weight.data.shape

torch.Size([25002, 300])

In [47]:
# Overwrite the embedding weights in place with the pre-trained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ..., -1.4447,  0.8402, -0.8668],
        [ 0.1032, -1.6268,  0.5729,  ...,  0.3180, -0.1626, -0.0417],
        [ 0.0569, -0.0520,  0.2733,  ..., -0.0695, -0.1606, -0.0989],
        ...,
        [-1.2905,  0.2440, -0.3436,  ..., -0.2952, -0.0081,  1.0907],
        [-1.8321, -0.9096, -0.9873,  ..., -2.3504, -0.1641, -1.5356],
        [ 0.1700, -0.8185, -0.7213,  ..., -1.5704, -0.5734,  0.7970]])

In [48]:
# Look up the vocabulary index of the unknown token and show it next to the padding index.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
UNK_IDX, PAD_IDX

(0, 1)

In [49]:
# Reset the embedding rows of the <unk> and <pad> tokens to all zeros, then
# show the resulting weight matrix.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0569, -0.0520,  0.2733,  ..., -0.0695, -0.1606, -0.0989],
        ...,
        [-1.2905,  0.2440, -0.3436,  ..., -0.2952, -0.0081,  1.0907],
        [-1.8321, -0.9096, -0.9873,  ..., -2.3504, -0.1641, -1.5356],
        [ 0.1700, -0.8185, -0.7213,  ..., -1.5704, -0.5734,  0.7970]])


In [50]:
import torch.optim as optim

# Adam over all model parameters, using the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [51]:
# Binary cross-entropy computed on raw logits; the loss module and the model
# are both moved to the selected device.
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [None]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))