In [None]:
!pip install transformers==4.7 torchinfo

In [None]:
!git clone https://github.com/Taeksu-Kim/Transformer.git

In [None]:
cd Transformer/PyTorch

In [None]:
# 데이터 다운로드
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

In [5]:
# common
import math
import random
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from torchinfo import summary

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from transformers import AutoTokenizer

# custom
from transformer import Transformer, TransformerEncoder, get_attn_pad_mask

In [6]:
def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (including the CUDA
    seeding calls) with ``seed``, and sets the cuDNN flags to
    ``deterministic=True`` / ``benchmark=False``.
    """
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # type: ignore
    cudnn.benchmark = False  # type: ignore

    # The seeding calls do not depend on each other, so they are applied in one pass.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

seed = 42

seed_everything(seed)

In [7]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Optimisation hyper-parameters.
epochs = 300  # upper bound on the number of epochs; early stopping can end training sooner
learning_rate = 1e-4
weight_decay = 1e-2
batch_size = 64

# Number of consecutive epochs without a validation-loss improvement after which training stops.
early_stopping_patience = 10

# Base name (without extension) of the checkpoint file written during training.
save_name = 'NSMC_clssifier_model'

In [8]:
model_path = "monologg/kobigbird-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

Downloading:   0%|          | 0.00/870 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/241k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/169 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/373 [00:00<?, ?B/s]

In [9]:
train_df = pd.read_csv('ratings_train.txt', sep="\t", engine="python")

In [10]:
train_df.isna().sum()

id          0
document    5
label       0
dtype: int64

In [11]:
train_df = train_df.dropna()

In [12]:
test_df = pd.read_csv('ratings_test.txt', sep="\t", engine="python")

In [13]:
test_df.isna().sum()

id          0
document    3
label       0
dtype: int64

In [14]:
test_df = test_df.dropna()

In [15]:
train_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [16]:
tokenizer.tokenize('너무재밓었다그래서보는것을추천한다')

['[UNK]']

In [None]:
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git

In [18]:
from pykospacing import Spacing

spacing = Spacing()

text = '너무재밓었다그래서보는것을추천한다'
fixed_text = spacing(text) 

print(fixed_text)

너무 재밓었다 그래서 보는 것을 추천한다


In [19]:
# Store a spacing-corrected version of every training review in 'fixed_document'.
# This is slow: the recorded run of this cell took about 1h45m.
tar_df = train_df

tar_df['fixed_document'] = [spacing(review) for review in tqdm(tar_df['document'])]

100%|██████████| 149995/149995 [1:45:51<00:00, 23.62it/s]


In [20]:
tar_df.to_csv('train_df.csv', index=False)

In [21]:
# Store a spacing-corrected version of every test review in 'fixed_document'.
# This is slow: the recorded run of this cell took about 34 minutes.
tar_df = test_df

tar_df['fixed_document'] = [spacing(review) for review in tqdm(tar_df['document'])]

100%|██████████| 49997/49997 [34:24<00:00, 24.22it/s]


In [22]:
tar_df.to_csv('test_df.csv', index=False)

In [24]:
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

In [25]:
def cal_token_len(text, tokenizer):
  """Return the number of ids that ``tokenizer.encode`` produces for ``text``."""
  token_ids = tokenizer.encode(text)
  return len(token_ids)

In [26]:
tokenizer.encode(train_df.iloc[2595]['fixed_document'])

[2, 2760, 4630, 8721, 3]

In [27]:
# Record the encoded length of every spacing-corrected training review.
tar_df = train_df

# str() also covers rows whose 'fixed_document' is not a string (e.g. NaN after the CSV round-trip).
tar_df['token_len'] = [cal_token_len(str(review), tokenizer) for review in tqdm(tar_df['fixed_document'])]

100%|██████████| 149995/149995 [00:38<00:00, 3917.21it/s]


In [28]:
# Record the encoded length of every spacing-corrected test review.
tar_df = test_df

# str() also covers rows whose 'fixed_document' is not a string (e.g. NaN after the CSV round-trip).
tar_df['token_len'] = [cal_token_len(str(review), tokenizer) for review in tqdm(tar_df['fixed_document'])]

100%|██████████| 49997/49997 [00:14<00:00, 3522.26it/s]


In [29]:
# Print selected percentiles of the training-set token lengths.
tar_df = train_df

tar_per_list = [95,98,99,100]
tar_col = tar_df['token_len']

for i in tar_per_list:
    length_at_percentile = np.percentile(tar_col, i)
    print(f'{i}% length : {length_at_percentile}')

95% length : 62.0
98% length : 75.0
99% length : 80.0
100% length : 142.0


In [30]:
# Print selected percentiles of the test-set token lengths.
tar_df = test_df

tar_per_list = [95,98,99,100]
tar_col = tar_df['token_len']

for i in tar_per_list:
    length_at_percentile = np.percentile(tar_col, i)
    print(f'{i}% length : {length_at_percentile}')

95% length : 62.0
98% length : 76.0
99% length : 80.0
100% length : 112.0


In [31]:
max_input_len = 100

In [32]:
train_df.keys()

Index(['id', 'document', 'label', 'fixed_document', 'token_len'], dtype='object')

In [33]:
train, valid =  train_test_split(train_df, test_size=0.2, stratify=train_df['label'],random_state=seed, shuffle=True)

In [34]:
class movie_review_dataset(Dataset):
  """Dataset serving tokenized reviews and their labels from a DataFrame.

  Reads the 'fixed_document' and 'label' columns of ``df`` and encodes text
  with the notebook-level ``tokenizer``.
  """

  def __init__(self, df, max_input_len):
    self.max_input_len = max_input_len
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    """Return a dict with 'enc_inputs' and 'labels' tensors for row ``index``."""
    review = str(self.df['fixed_document'].iloc[index])
    # Pad up to / truncate at max_input_len.
    token_ids = tokenizer.encode(review,
                                 max_length=self.max_input_len,
                                 padding='max_length',
                                 truncation='only_first')
    label = self.df['label'].iloc[index]

    sample = {}
    sample['enc_inputs'] = torch.tensor(token_ids, dtype=int)
    sample['labels'] = torch.tensor(label, dtype=int)
    return sample

In [35]:
train_dataset = movie_review_dataset(train, max_input_len)
valid_dataset = movie_review_dataset(valid, max_input_len)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=0, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, num_workers=0, shuffle=True)

In [36]:
for i, batch in enumerate(train_dataloader):
    break

In [37]:
# Config class
# A dict subclass whose entries can also be read and written as attributes
# (e.g. config.d_model), so a plain dict or a JSON file can act as the model config.
class Config(dict):
    """Dictionary with attribute-style access to its keys."""

    # Attribute access is delegated to the dict item protocol, so reading a
    # missing attribute raises KeyError (not AttributeError).
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

    @classmethod
    def load(cls, file):
        """Build a config from the JSON object stored in ``file``.

        :param file: path of a JSON file
        :return: a new instance of ``cls`` holding the parsed values
        """
        # Imported locally: the notebook's import cell does not import json,
        # so the previous implementation failed with NameError.
        import json

        with open(file, 'r') as f:
            # cls(...) instead of Config(...) so subclasses load as themselves.
            return cls(json.load(f))

In [38]:
class TransformerClassificationHead(nn.Module):
    """Classification head applied to the first sequence position of the encoder output.

    Pipeline: dropout -> linear (d_model -> d_model) -> GELU -> linear (d_model -> num_labels).
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.d_model
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.drop_out_raito)
        self.gelu = nn.GELU()
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Return logits computed from ``features[:, 0, :]``; extra keyword arguments are ignored."""
        # Only the vector at sequence position 0 (the CLS token) is used.
        cls_vector = features[:, 0, :]
        hidden = self.gelu(self.dense(self.dropout(cls_vector)))
        return self.out_proj(hidden)

In [60]:
class Transformer_Classifier(Transformer):
    """Encoder-only classifier: a TransformerEncoder followed by a classification head."""

    def __init__(self, config):
        # super(Transformer, self) starts the lookup after Transformer in the
        # MRO, so Transformer.__init__ itself is not run here.
        super(Transformer, self).__init__()
        self.config = config
        self.encoder = TransformerEncoder(config)
        self.classifier = TransformerClassificationHead(config)

        # init_weights is not defined in this class; presumably inherited from
        # Transformer — TODO confirm.
        self.init_weights()

    def forward(self, 
                enc_inputs,  
                enc_self_attn_mask=None):
        """Return ``(logits, enc_self_attn_probs)`` for ``enc_inputs``.

        ``enc_self_attn_mask`` is forwarded to the encoder unchanged.
        NOTE(review): how the encoder treats a ``None`` mask is not visible
        here — confirm padding positions are masked.
        """
        encoded, attn_probs = self.encoder(enc_inputs, enc_self_attn_mask)
        return self.classifier(encoded), attn_probs

In [61]:
config_dict = {
    'vocab_size' : tokenizer.vocab_size,
    'd_model' : 256,
    'max_enc_len' : max_input_len,
    'max_dec_len' : None,
    'num_labels' : 2,
    'pad_id' : tokenizer.pad_token_id,
    'bos_id' : tokenizer.bos_token_id,
    'eos_id' : tokenizer.eos_token_id,
    'use_decoder' : False,
    'init_std' : 2e-2,
    'norm_eps' : 1e-12, 
    'drop_out_raito' : 0.1,
    'num_enc_layers' : 3,
    'num_dec_layers' : 3,
    'num_att_heads' : 4,
    'feed_forward_dim' : 1024,
}

config = Config(config_dict)

In [62]:
model = Transformer_Classifier(config)

In [64]:
enc_inputs = batch['enc_inputs']

summary(model.to('cpu'), input_data=[enc_inputs.to('cpu')])

Layer (type:depth-idx)                                            Output Shape              Param #
Transformer_Classifier                                            [64, 2]                   --
├─TransformerEncoder: 1-1                                         [64, 100, 256]            --
│    └─Embedding: 2-1                                             [64, 100, 256]            8,320,000
│    └─ModuleList: 2-2                                            --                        --
│    │    └─TransformerEncoderLayer: 3-1                          [64, 100, 256]            789,760
│    │    └─TransformerEncoderLayer: 3-2                          [64, 100, 256]            789,760
│    │    └─TransformerEncoderLayer: 3-3                          [64, 100, 256]            789,760
├─TransformerClassificationHead: 1-2                              [64, 2]                   --
│    └─Dropout: 2-3                                               [64, 256]                 --
│    └─Linear: 2-4     

In [65]:
config_dict = {
    'vocab_size' : tokenizer.vocab_size,
    'd_model' : 512,
    'max_enc_len' : max_input_len,
    'max_dec_len' : None,
    'num_labels' : 2,
    'pad_id' : tokenizer.pad_token_id,
    'bos_id' : tokenizer.bos_token_id,
    'eos_id' : tokenizer.eos_token_id,
    'use_decoder' : False,
    'init_std' : 2e-2,
    'norm_eps' : 1e-12, 
    'drop_out_raito' : 0.1,
    'num_enc_layers' : 6,
    'num_dec_layers' : 6,
    'num_att_heads' : 4,
    'feed_forward_dim' : 1024,
}

config = Config(config_dict)

In [66]:
model = Transformer_Classifier(config)

In [67]:
enc_inputs = batch['enc_inputs']

summary(model, input_data=[enc_inputs])

Layer (type:depth-idx)                                            Output Shape              Param #
Transformer_Classifier                                            [64, 2]                   --
├─TransformerEncoder: 1-1                                         [64, 100, 512]            --
│    └─Embedding: 2-1                                             [64, 100, 512]            16,640,000
│    └─ModuleList: 2-2                                            --                        --
│    │    └─TransformerEncoderLayer: 3-1                          [64, 100, 512]            2,102,784
│    │    └─TransformerEncoderLayer: 3-2                          [64, 100, 512]            2,102,784
│    │    └─TransformerEncoderLayer: 3-3                          [64, 100, 512]            2,102,784
│    │    └─TransformerEncoderLayer: 3-4                          [64, 100, 512]            2,102,784
│    │    └─TransformerEncoderLayer: 3-5                          [64, 100, 512]            2,102,784
│ 

In [68]:
model.to(device)

Transformer_Classifier(
  (encoder): TransformerEncoder(
    (word_embedding): Embedding(32500, 512, padding_idx=0)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attention): AddNorm(
          (layer): MultiHeadAttention(
            (query_proj): Linear(in_features=512, out_features=512, bias=True)
            (key_proj): Linear(in_features=512, out_features=512, bias=True)
            (value_proj): Linear(in_features=512, out_features=512, bias=True)
            (scaled_dot_attn): ScaledDotProductAttention()
            (linear): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
        )
        (feed_forward): AddNorm(
          (layer): PoswiseFeedForward(
            (feed_forward): Sequential(
              (0): Linear(in_features=512, out_features=1024, bias=True)
              (1): Dropout(p=0.1, inplace=False)
              (2): ReLU()
              (3): 

In [69]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [70]:
def cal_acc(y_pred, y_true):
    """
    Compute the accuracy of argmax predictions.

    :param y_pred: prediction scores; the last dimension holds the per-class scores
    :param y_true: ground-truth class indices, shaped like ``y_pred`` without its last dimension
    :return: 0-dim float tensor with the fraction of positions where the
             argmax of ``y_pred`` equals ``y_true``
    """
    # Which positions were predicted correctly
    predicted = torch.argmax(y_pred, dim=-1)
    matches = torch.eq(y_true, predicted)

    # Average over every compared element. Dividing by the batch size alone is
    # only correct for 1-D labels and can exceed 1 for (bs, n_seq) labels.
    return matches.float().mean()

In [71]:
def train_step(batch, epoch, training):
    """Run one optimisation step or one evaluation step on ``batch``.

    Relies on the notebook-level ``model``, ``optimizer``, ``device`` and ``config``.

    :param batch: dict of tensors with the keys 'enc_inputs' and 'labels'
    :param epoch: current epoch index (not used inside this function)
    :param training: when ``True`` the model is updated; otherwise the batch is
                     only evaluated under ``torch.no_grad()``
    :return: ``(loss, acc, lr)`` when ``training is True``, else ``(loss, acc)``;
             ``loss`` is detached from the autograd graph in both cases
    """
    batch = {key: value.to(device) for key, value in batch.items()}
    criterion = nn.CrossEntropyLoss()

    def forward_pass():
        # Shared by both branches: logits -> cross-entropy loss and accuracy.
        logits = model(enc_inputs=batch['enc_inputs'])[0]
        loss = criterion(logits.view(-1, config.num_labels), batch['labels'].view(-1))
        return loss, cal_acc(logits, batch['labels'])

    if training is True:
        model.train()
        optimizer.zero_grad()

        # NOTE(review): autocast is used without a GradScaler; PyTorch's AMP
        # guide pairs the two to avoid fp16 gradient underflow — consider adding one.
        with torch.cuda.amp.autocast():
            loss, acc = forward_pass()

        loss.backward()
        optimizer.step()

        lr = optimizer.param_groups[0]["lr"]

        # Return a detached loss: the caller sums it across batches, and a loss
        # that still carries grad_fn would keep autograd history growing.
        return loss.detach(), acc, round(lr, 10)

    model.eval()
    with torch.no_grad():
        loss, acc = forward_pass()

    return loss, acc

In [72]:
# class color:
GREEN = '\033[92m'
YELLOW = '\033[93m'
END = '\033[0m'

In [73]:
%%time
# train

# Per-epoch histories: training loss, validation loss and learning rate.
loss_plot, val_loss_plot = [], []
lrs = []

best_val_acc = 0
best_val_loss = np.inf

best_epoch = 0
patience = 0  # epochs since the validation loss last improved

for epoch in range(epochs):
    gc.collect()
    # Running sums are plain Python floats, so no autograd history is
    # accumulated across batches.
    total_loss, total_val_loss = 0.0, 0.0
    total_acc, total_val_acc = 0.0, 0.0

    # ---- training pass ----
    tqdm_dataset = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    training = True
    for batch_idx, batch in tqdm_dataset:
        batch_loss, batch_acc, lr = train_step(batch, epoch, training)
        total_loss += float(batch_loss)
        total_acc += float(batch_acc)

        tqdm_dataset.set_postfix({
            '%+10s' % 'Epoch': epoch + 1,
            '%10s' % GREEN + 'Loss' : '{:.4f}'.format(total_loss/(batch_idx+1)) + END,
            '%10s' % YELLOW + 'acc' : '{:.4f}'.format(total_acc/(batch_idx+1)) + END,
            '%5s' % 'LR' : lr,
        })

    # Mean of the per-batch losses over the epoch.
    train_epoch_loss = round(total_loss/(batch_idx+1), 4)
    loss_plot.append(train_epoch_loss)

    # ---- validation pass ----
    tqdm_dataset = tqdm(enumerate(valid_dataloader), total=len(valid_dataloader))
    training = False
    for batch_idx, batch in tqdm_dataset:
        batch_loss, batch_acc = train_step(batch, epoch, training)
        total_val_loss += float(batch_loss)
        total_val_acc += float(batch_acc)

        tqdm_dataset.set_postfix({
            '%+12s' % 'Epoch': epoch + 1,
            '%6s' % GREEN + 'Val Loss' : '{:.4f}'.format(total_val_loss/(batch_idx+1)) + END,
            '%6s' % YELLOW + 'Val acc' : '{:.4f}'.format(total_val_acc/(batch_idx+1)) + END,
        })

    valid_epoch_loss = round(total_val_loss/(batch_idx+1), 4)
    val_loss_plot.append(valid_epoch_loss)

    cur_val_loss = round(total_val_loss/(batch_idx+1), 3)
    cur_val_acc = round(total_val_acc/(batch_idx+1), 3)

    # Checkpoint on every validation-loss improvement; otherwise count towards early stopping.
    if valid_epoch_loss < best_val_loss:
        print(YELLOW + 'Best_Val_Loss is updated from {:>5} to {:>5} on epoch {}'.format(best_val_loss, valid_epoch_loss, epoch+1) + END)
        best_val_loss = valid_epoch_loss
        best_epoch = epoch+1
        torch.save(model.state_dict(), './'+save_name+'.ckpt')
        patience = 0
    else:
        patience += 1

    lrs.append(lr)

    if patience == early_stopping_patience:
        break

100%|██████████| 1875/1875 [02:35<00:00, 12.06it/s,      Epoch=1,      [92mLoss=0.3924[0m,      [93macc=0.8215[0m,    LR=0.0001]
100%|██████████| 469/469 [00:31<00:00, 14.77it/s,        Epoch=1,  [92mVal Loss=0.3521[0m,  [93mVal acc=0.8505[0m]


[93mBest_Val_Loss is updated from   inf to 0.3521 on epoch 1[0m


100%|██████████| 1875/1875 [02:34<00:00, 12.11it/s,      Epoch=2,      [92mLoss=0.3032[0m,      [93macc=0.8739[0m,    LR=0.0001]
100%|██████████| 469/469 [00:31<00:00, 14.84it/s,        Epoch=2,  [92mVal Loss=0.3612[0m,  [93mVal acc=0.8532[0m]
100%|██████████| 1875/1875 [02:33<00:00, 12.20it/s,      Epoch=3,      [92mLoss=0.2505[0m,      [93macc=0.8985[0m,    LR=0.0001]
100%|██████████| 469/469 [00:31<00:00, 14.94it/s,        Epoch=3,  [92mVal Loss=0.3563[0m,  [93mVal acc=0.8490[0m]
100%|██████████| 1875/1875 [02:33<00:00, 12.21it/s,      Epoch=4,      [92mLoss=0.2012[0m,      [93macc=0.9209[0m,    LR=0.0001]
100%|██████████| 469/469 [00:31<00:00, 14.95it/s,        Epoch=4,  [92mVal Loss=0.4165[0m,  [93mVal acc=0.8405[0m]
100%|██████████| 1875/1875 [02:34<00:00, 12.13it/s,      Epoch=5,      [92mLoss=0.1573[0m,      [93macc=0.9405[0m,    LR=0.0001]
100%|██████████| 469/469 [00:31<00:00, 14.94it/s,        Epoch=5,  [92mVal Loss=0.4823[0m,  [93mVal acc=0.84

CPU times: user 34min 10s, sys: 16.8 s, total: 34min 27s
Wall time: 34min 7s





In [74]:
model.load_state_dict(torch.load('./'+save_name+'.ckpt'))
model.eval()

Transformer_Classifier(
  (encoder): TransformerEncoder(
    (word_embedding): Embedding(32500, 512, padding_idx=0)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attention): AddNorm(
          (layer): MultiHeadAttention(
            (query_proj): Linear(in_features=512, out_features=512, bias=True)
            (key_proj): Linear(in_features=512, out_features=512, bias=True)
            (value_proj): Linear(in_features=512, out_features=512, bias=True)
            (scaled_dot_attn): ScaledDotProductAttention()
            (linear): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
        )
        (feed_forward): AddNorm(
          (layer): PoswiseFeedForward(
            (feed_forward): Sequential(
              (0): Linear(in_features=512, out_features=1024, bias=True)
              (1): Dropout(p=0.1, inplace=False)
              (2): ReLU()
              (3): 

In [75]:
def inference(text, max_input_len):
    """Classify a single review with the notebook-level ``model`` and ``tokenizer``.

    :param text: review text (converted with ``str`` before encoding)
    :param max_input_len: length the encoded input is padded to / truncated at
    :return: ``(label, score)`` where label is '부정' (negative, class 0) or
             '긍정' (positive, class 1) and score is the highest softmax probability
    """
    labels = ['부정', '긍정']

    token_ids = tokenizer.encode(str(text),
                                 max_length=max_input_len,
                                 padding='max_length',
                                 truncation='only_first')
    inputs = torch.tensor(token_ids, dtype=int).unsqueeze(0).to(device)

    # Predict in eval mode and without building an autograd graph, whatever
    # state the model was left in; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(inputs)[0].to('cpu')
    finally:
        model.train(was_training)

    outputs = int(torch.argmax(logits, dim=-1)[0])
    score = float(torch.max(nn.Softmax(dim=-1)(logits)))

    return labels[outputs], score

In [76]:
text = '뭐 이따구로 영화를 만들어놨어 ㅋㅋ'
inference(text, config.max_enc_len)

('부정', 0.9134500026702881)

In [77]:
text = '명작이라는 평을 이해할 수가 없다. 이딴 게?'
inference(text, config.max_enc_len)

('부정', 0.7407488226890564)

In [78]:
text = '두 번 봐도 질리지 않는 영화'
inference(text, config.max_enc_len)

('긍정', 0.8983094692230225)

In [79]:
text = '어이가 없네 ㅋㅋㅋㅋ'
inference(text, config.max_enc_len)

('부정', 0.9635804891586304)