In [None]:
!pip install transformers==4.7 torchinfo

In [None]:
!pip install git+https://github.com/microsoft/LoRA

In [None]:
!git clone https://github.com/Taeksu-Kim/LoRA.git

In [None]:
cd LoRA

In [None]:
!gdown https://drive.google.com/uc?id=1-9-CJgquS0z9ZihSboSla_uwJ3ouvpxP
!gdown https://drive.google.com/uc?id=1-0kFcLCBtFvI4n8agpykJpzdUahkvbcF

In [7]:
# common
import math
import random
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from torchinfo import summary

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from transformers import AutoTokenizer
from transformers import ElectraForSequenceClassification

# custom
from convert_lora import LoRA_converter

In [8]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = False  # type: ignore
    torch.cuda.manual_seed_all(seed)

seed = 42

seed_everything(seed)

In [9]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model_path = "monologg/koelectra-base-v3-discriminator"

epochs = 300
learning_rate = 1e-4
weight_decay = 1e-2
batch_size = 64

early_stopping_patience = 5

save_name = 'NSMC_model'

In [10]:
train_df = pd.read_csv('train.csv')

In [11]:
train_df.isna().sum()

id                 0
document           0
label              0
fixed_document    10
dtype: int64

In [12]:
train_df = train_df.dropna()

In [13]:
test_df = pd.read_csv('test.csv')

In [14]:
test_df.isna().sum()

id                0
document          0
label             0
fixed_document    3
dtype: int64

In [15]:
test_df = test_df.dropna()

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

In [17]:
def cal_token_len(text, tokenizer):
    return len(tokenizer.encode(text))

In [18]:
tar_df = train_df

tar_df['token_len'] = [ cal_token_len(str(tar_df.iloc[i]['fixed_document']), tokenizer) for i in tqdm(range(tar_df.shape[0])) ]

100%|██████████| 149985/149985 [00:41<00:00, 3656.45it/s]


In [19]:
tar_df = test_df

tar_df['token_len'] = [ cal_token_len(str(tar_df.iloc[i]['fixed_document']), tokenizer) for i in tqdm(range(tar_df.shape[0])) ]

100%|██████████| 49994/49994 [00:13<00:00, 3702.67it/s]


In [20]:
tar_df = train_df

tar_per_list = [95,98,99,100]
tar_col = tar_df['token_len']

for i in tar_per_list:
    print('{}% length : {}'.format(i, np.percentile(tar_col,i)))

95% length : 62.0
98% length : 76.0
99% length : 80.0
100% length : 142.0


In [21]:
tar_df = test_df

tar_per_list = [95,98,99,100]
tar_col = tar_df['token_len']

for i in tar_per_list:
    print('{}% length : {}'.format(i, np.percentile(tar_col,i)))

95% length : 62.0
98% length : 76.0
99% length : 80.0
100% length : 112.0


In [22]:
max_input_len = 100

In [23]:
train_df.keys()

Index(['id', 'document', 'label', 'fixed_document', 'token_len'], dtype='object')

In [24]:
train, valid =  train_test_split(train_df, test_size=0.2, stratify=train_df['label'],random_state=seed, shuffle=True)

In [25]:
class movie_review_dataset(Dataset):

  def __init__(self, df, max_input_len):
    self.df = df
    self.max_input_len = max_input_len

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):

    batch = tokenizer(self.df['fixed_document'].iloc[index],
                      max_length=self.max_input_len, 
                      padding='max_length', 
                      truncation='only_first',
                      return_tensors='pt',
                      )
    batch = {key: value[0] for key, value in batch.items()}

    batch['labels'] = torch.tensor(self.df['label'].iloc[index], dtype=int)

    return batch

In [26]:
train_dataset = movie_review_dataset(train, max_input_len)
valid_dataset = movie_review_dataset(valid, max_input_len)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=0, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, num_workers=0, shuffle=True)

In [27]:
for i, batch in enumerate(train_dataloader):
    break

In [28]:
base_model = ElectraForSequenceClassification.from_pretrained(model_path, num_labels=2)

Downloading:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [29]:
summary(base_model)

Layer (type:depth-idx)                                       Param #
ElectraForSequenceClassification                             --
├─ElectraModel: 1-1                                          --
│    └─ElectraEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   26,880,000
│    │    └─Embedding: 3-2                                   393,216
│    │    └─Embedding: 3-3                                   1,536
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─ElectraEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,054,464
├─ElectraClassificationHead: 1-2                             --
│    └─Linear: 2-3                                           590,592
│    └─Dropout: 2-4                                          --
│    └─Linear: 2-5                                           1,538


In [30]:
print(base_model)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [31]:
tar_linear_layers = {'electra': ['query', 'key', 'value']}
# tar_embedding_layers = {'electra': ['shared', 'embed_tokens', 'relative_attention_bias']}
tar_embedding_layers = {}
keep_layers = {}

model = LoRA_converter(base_model=base_model, 
                       tar_linear_layers=tar_linear_layers,
                       tar_embedding_layers=tar_embedding_layers,
                       keep_layers=keep_layers,
                       lora_r=16, 
                       lora_alpha=1,
                       lora_dropout=0.1).base_model

Replace electra.encoder.layer.0.attention.self.query with lora linear
Replace electra.encoder.layer.0.attention.self.query with lora linear
Replace electra.encoder.layer.0.attention.self.key with lora linear
Replace electra.encoder.layer.0.attention.self.key with lora linear
Replace electra.encoder.layer.0.attention.self.value with lora linear
Replace electra.encoder.layer.0.attention.self.value with lora linear
Replace electra.encoder.layer.1.attention.self.query with lora linear
Replace electra.encoder.layer.1.attention.self.query with lora linear
Replace electra.encoder.layer.1.attention.self.key with lora linear
Replace electra.encoder.layer.1.attention.self.key with lora linear
Replace electra.encoder.layer.1.attention.self.value with lora linear
Replace electra.encoder.layer.1.attention.self.value with lora linear
Replace electra.encoder.layer.2.attention.self.query with lora linear
Replace electra.encoder.layer.2.attention.self.query with lora linear
Replace electra.encoder.laye

In [32]:
summary(model)

Layer (type:depth-idx)                                       Param #
ElectraForSequenceClassification                             --
├─ElectraModel: 1-1                                          --
│    └─ElectraEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   (26,880,000)
│    │    └─Embedding: 3-2                                   (393,216)
│    │    └─Embedding: 3-3                                   (1,536)
│    │    └─LayerNorm: 3-4                                   (1,536)
│    │    └─Dropout: 3-5                                     --
│    └─ElectraEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,939,200
├─ElectraClassificationHead: 1-2                             --
│    └─Linear: 2-3                                           (590,592)
│    └─Dropout: 2-4                                          --
│    └─Linear: 2-5                                       

In [33]:
tar_linear_layers = {'electra': ['query', 'key', 'value']}
# tar_embedding_layers = {'electra': ['shared', 'embed_tokens', 'relative_attention_bias']}
tar_embedding_layers = {}
keep_layers = {'classifier':['dense']}

model = LoRA_converter(base_model=base_model, 
                       tar_linear_layers=tar_linear_layers,
                       tar_embedding_layers=tar_embedding_layers,
                       keep_layers=keep_layers,
                       lora_r=16, 
                       lora_alpha=1,
                       lora_dropout=0.1).base_model

Replace electra.encoder.layer.0.attention.self.query with lora linear
Replace electra.encoder.layer.0.attention.self.query with lora linear
Replace electra.encoder.layer.0.attention.self.query with lora linear
Replace electra.encoder.layer.0.attention.self.query with lora linear
Replace electra.encoder.layer.0.attention.self.key with lora linear
Replace electra.encoder.layer.0.attention.self.key with lora linear
Replace electra.encoder.layer.0.attention.self.key with lora linear
Replace electra.encoder.layer.0.attention.self.key with lora linear
Replace electra.encoder.layer.0.attention.self.value with lora linear
Replace electra.encoder.layer.0.attention.self.value with lora linear
Replace electra.encoder.layer.0.attention.self.value with lora linear
Replace electra.encoder.layer.0.attention.self.value with lora linear
Replace electra.encoder.layer.1.attention.self.query with lora linear
Replace electra.encoder.layer.1.attention.self.query with lora linear
Replace electra.encoder.laye

In [34]:
summary(model)

Layer (type:depth-idx)                                       Param #
ElectraForSequenceClassification                             --
├─ElectraModel: 1-1                                          --
│    └─ElectraEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   (26,880,000)
│    │    └─Embedding: 3-2                                   (393,216)
│    │    └─Embedding: 3-3                                   (1,536)
│    │    └─LayerNorm: 3-4                                   (1,536)
│    │    └─Dropout: 3-5                                     --
│    └─ElectraEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,939,200
├─ElectraClassificationHead: 1-2                             --
│    └─Linear: 2-3                                           590,592
│    └─Dropout: 2-4                                          --
│    └─Linear: 2-5                                         

In [35]:
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
              )
              (key): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
              )
              (value): Linear(
                in_features=768, out_features=768, bias=True
                (l

In [36]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [37]:
def cal_acc(y_pred, y_true):
    """
    acc 계산 함수
    :param y_true: 정답 (bs, n_seq)
    :param y_pred: 예측 값 (bs, n_seq, n_vocab)
    """
    # 정답 여부 확인
    y_pred = torch.argmax(y_pred, dim=-1).int()
    matches = torch.eq(y_true, y_pred).int()
    
    # 정확도 계산
    accuracy = torch.sum(matches) / matches.shape[0]
    return accuracy

In [38]:
def train_step(batch, epoch, training):

    batch = {key: value.to(device) for key, value in batch.items()}

    if training is True:
        model.train()
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():

            outputs = model(**batch)
            logits = outputs['logits']
            loss = outputs['loss']

            acc = cal_acc(logits, batch['labels'])

        loss.backward()
        optimizer.step()
            
        lr = optimizer.param_groups[0]["lr"]

        return loss, acc, round(lr, 10)

    else:
        model.eval()
        with torch.no_grad():

            outputs = model(**batch)
            logits = outputs['logits']
            loss = outputs['loss']

            acc = cal_acc(logits, batch['labels'])

        return loss, acc

In [39]:
# class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'

In [None]:
%%time
# train

loss_plot, val_loss_plot = [], []
lrs = []

check_list = []

best_val_acc = 0
best_val_loss = 100

best_epoch = 0
patience = 0

for epoch in range(epochs):
    gc.collect()
    total_loss, total_val_loss = 0, 0
    total_acc, total_val_acc = 0, 0
    
    tqdm_dataset = tqdm(enumerate(train_dataloader), total=train_dataloader.__len__())
    training = True
    for batch_idx, batch in tqdm_dataset:
        batch_loss, batch_acc, lr = train_step(batch, epoch, training)
        total_loss += batch_loss
        total_acc += batch_acc
        
        tqdm_dataset.set_postfix({
            '%+10s' % 'Epoch': epoch + 1,
            '%10s' % GREEN + 'Loss' : '{:.4f}'.format(total_loss/(batch_idx+1)) + END,
            '%10s' % YELLOW + 'acc' : '{:.4f}'.format(total_acc/(batch_idx+1)) + END,
            '%5s' % 'LR' : lr,
        })
            
    loss_plot.append(total_loss/(batch_idx+1))
    
    tqdm_dataset = tqdm(enumerate(valid_dataloader), total=valid_dataloader.__len__())
    training = False
    for batch_idx, batch in tqdm_dataset:
        batch_loss, batch_acc = train_step(batch, epoch, training)
        total_val_loss += batch_loss
        total_val_acc += batch_acc

        tqdm_dataset.set_postfix({
            '%+12s' % 'Epoch': epoch + 1,
            '%6s' % GREEN + 'Val Loss' : '{:.4f}'.format(total_val_loss/(batch_idx+1)) + END,
            '%6s' % YELLOW + 'Val acc' : '{:.4f}'.format(total_val_acc/(batch_idx+1)) + END,
        })
    val_loss_plot.append(total_val_loss/(batch_idx+1)) 

    cur_val_loss = round(float((total_val_loss/(batch_idx+1)).detach().cpu()), 3)
    cur_val_acc = round(float((total_val_acc/(batch_idx+1))), 3)

    if cur_val_acc > best_val_acc:
        print(YELLOW + 'Best_Val_acc is updated from {:>5} to {:>5} on epoch {}'.format(best_val_acc, cur_val_acc, epoch+1) + END)
        best_val_acc = cur_val_acc
        best_epoch = epoch+1
        torch.save(model.state_dict(), './'+save_name+'.ckpt')

    else:
        patience += 1
    
    lrs.append(lr)
    
    if patience == early_stopping_patience:
        break

In [None]:
best_val_acc

In [40]:
model.load_state_dict(torch.load('./'+save_name+'.ckpt'))
model.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
              )
              (key): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
              )
              (value): Linear(
                in_features=768, out_features=768, bias=True
                (l

In [41]:
def inference(text, max_input_len):
    labels = ['부정', '긍정']

    inputs = torch.tensor(tokenizer.encode(str(text),
                                           max_length=max_input_len, 
                                           padding='max_length', 
                                           truncation='only_first'), 
                              dtype=int).unsqueeze(0).to(device)

    logits = model(inputs)[0].detach().to('cpu')
    
    outputs = int(torch.argmax(logits, dim=-1)[0])
    score = float(torch.max(nn.Softmax(dim=-1)(logits)))

    
    return labels[outputs], score

In [42]:
text = '뭐 이따구로 영화를 만들어놨어 ㅋㅋ'
inference(text, max_input_len)

('부정', 0.9271074533462524)

In [43]:
text = '명작이라는 평을 이해할 수가 없다. 이딴 게?'
inference(text, max_input_len)

('부정', 0.6981436610221863)

In [44]:
text = '두 번 봐도 질리지 않는 영화'
inference(text, max_input_len)

('긍정', 0.962390124797821)

In [45]:
text = '어이가 없네 ㅋㅋㅋㅋ'
inference(text, max_input_len)

('부정', 0.862804114818573)