# VHEX.Tech-NLP
본 노트북은 VHEX팀의 자연어처리 및 관련 자료들을 개발하기 위한 공간입니다. 마크다운 형식이라면 어떤 스타일이든지 가능하며, 필수적으로-
1. 작성일시
2. 작성자 정보(팀)
3. 작성(추가/변경/삭제)내용
의 세 가지는 꼭 기입하셔서 버전 관리를 용이하게 할 수 있도록 해주시면 좋겠습니다

## 1. 작성일시
2021.05.31
## 2. 작성자 정보(팀)
한승현(VHEX.Tech)
## 3. 작성내용
Binary sentiment anaysis for Korean Documents using Transformers

### 3.1. dependancy
1. OS : windows 10
2. Languages : Python 3.7.**
3. External modules : see dependency.txt
4. External dataset : https://github.com/e9t/nsmc

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

In [2]:
pwd

'D:\\RAPA\\VHEX-Tech-NLP\\notebooks'

In [3]:
#load dataset
train_df = pd.read_csv('../dataset/ratings_train.txt', sep='\t')
test_df = pd.read_csv('../dataset/ratings_test.txt', sep='\t')    

train_df = train_df.sample(frac=0.4, random_state=999)
test_df = test_df.sample(frac=0.4, random_state=999)

In [4]:
class Document(Dataset):
    ''' Naver Sentiment Movie Corpus Dataset 
        initializer : data frame
        length : length of each sentence(어절단위)
        get itemized tokens :
        text : 1st frame
        label : 1: positive expression 
                2: negative expression
    '''
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 1]
        label = self.df.iloc[idx, 2]
        return text, label

In [10]:
train_dataset = Document(train_df)
#on windows environment, num_workers should be 0, other env, try 2
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=0)

In [11]:
device = torch.device("cuda")
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
epochs = 1
total_loss = 0
total_len = 0
total_correct = 0


model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1



[Epoch 1/1] Iteration 500 -> Train Loss: 0.7081, Accuracy: 0.490
[Epoch 1/1] Iteration 1000 -> Train Loss: 0.6975, Accuracy: 0.504
[Epoch 1/1] Iteration 1500 -> Train Loss: 0.6950, Accuracy: 0.503
[Epoch 1/1] Iteration 2000 -> Train Loss: 0.6978, Accuracy: 0.482


In [None]:
# evaluation
model.eval()

eval_dataset = Document(test_df)
eval_loader = DataLoader(eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in eval_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)