# 요약
## 파라미터 조정 후 학습 진행
다음과 같이 파라미터를 조정함.
- seed = 0
- max_len = 64
- batch_size = 100
- warmup_ratio = 0.1
- num_epochs = 1000
- max_grad_norm = 1
- log_interval = 200
- learning_rate =  1e-5

In [1]:
# Mount Google Drive inside the Colab runtime and change into the competition
# folder, so relative paths used later (e.g. the ".cache" model directory)
# resolve inside that folder.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회

Mounted at /content/drive
/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회


In [2]:
# Install KoBERT directly from the master branch of the SKTBrain GitHub repository.
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-q6ynt5tl
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-q6ynt5tl
Collecting boto3<=1.15.18
  Downloading boto3-1.15.18-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 7.1 MB/s 
[?25hCollecting gluonnlp<=0.10.0,>=0.6.0
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 59.8 MB/s 
[?25hCollecting mxnet<=1.7.0.post2,>=1.4.0
  Downloading mxnet-1.7.0.post2-py2.py3-none-manylinux2014_x86_64.whl (54.7 MB)
[K     |████████████████████████████████| 54.7 MB 28 kB/s 
[?25hCollecting onnxruntime<=1.8.0,==1.8.0
  Downloading onnxruntime-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[K  

In [39]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd

from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Hyper-parameters for this run.
seed = 0                 # random seed for the data split and the RNGs seeded below
max_len = 64             # forwarded as max_seq_length to the BERT sentence transform
batch_size = 100
warmup_ratio = 0.1       # fraction of the total optimizer steps used for LR warm-up
num_epochs = 1000
max_grad_norm = 1        # gradient-norm clipping threshold used in the training loop
log_interval = 200       # NOTE(review): not referenced by any visible cell
learning_rate =  1e-5
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Apply the seed to NumPy and PyTorch as well; previously it only reached
# train_test_split, so DataLoader shuffling, dropout and the initialisation of
# the classification heads differed from run to run.
np.random.seed(seed)
torch.manual_seed(seed)

# Pretrained KoBERT encoder and its vocabulary (cached under ./.cache).
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

# String label -> integer class id, per task column of train.csv.
label_dict = {
    "유형": {"사실형": 0, "추론형": 1, "대화형": 2, "예측형": 3},
    "극성": {"긍정": 0, "부정": 1, "미정": 2},
    "시제": {"과거": 0, "현재": 1, "미래": 2},
    "확실성": {"확실": 0, "불확실": 1},
}

using cached model. /content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/.cache/kobert_v1.zip
using cached model. /content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [45]:
# Load the training data and encode the four task columns as integer class ids.
df = pd.read_csv('/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/data/train.csv')
for label, mapping in label_dict.items():
    # Vectorised lookup. Writing through `df[label].iloc[i] = ...` is chained
    # assignment: it is slow, triggers SettingWithCopyWarning and silently does
    # nothing when pandas Copy-on-Write is enabled.
    encoded = df[label].map(mapping)
    if encoded.isna().any():
        # Keep the fail-fast behaviour of a direct dict lookup for unknown labels.
        unknown = sorted(set(df.loc[encoded.isna(), label].astype(str)))
        raise ValueError(f"Unmapped values in column {label!r}: {unknown}")
    df[label] = encoded.astype(int)

# 80/20 split, stratified on the polarity column; the two label outputs of
# train_test_split are not needed because the labels stay inside the frames.
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=seed, stratify = df["극성"])

In [71]:
# Load the test set from the competition data folder.
test = pd.read_csv("/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/data/test.csv")

In [23]:
# train = pd.read_csv("/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/data/trian_split_by_polarity.csv").drop(columns="Unnamed: 0")
# val = pd.read_csv("/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/data/validation_split_by_polarity.csv").drop(columns="Unnamed: 0")
# test = pd.read_csv("/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/data/test.csv")

In [24]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_14506,각각 1억1850만원과 3120만원 양육비를 지급하지 않은 채무자 남 모씨와 정모씨...,0,0,2,0,사실형-긍정-미래-확실
1,TRAIN_12438,전문가들은 코로나19에 감염됐을 때 이를 방어하는 면역체계에 반응하는 ＇사이토카인 ...,0,0,1,0,사실형-긍정-현재-확실
2,TRAIN_09657,아직도 많은 팬들이 이들의 재결합을 기대하고 있다.,1,0,1,0,추론형-긍정-현재-확실
3,TRAIN_05727,도멘 드 마르꾸가 연간 생산하는 와인은 6만~7만병이다.,0,0,1,0,사실형-긍정-현재-확실
4,TRAIN_09512,지난 6월 개정된 도로교통법을 기준으로 혈중알코올농도 0.08% 이상은 면허가 취소된다.,0,0,1,0,사실형-긍정-현재-확실


In [25]:
# Preview the first rows of the validation split.
val.head()

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_12470,올해 들어서도 4만7345달러에서 반토막 넘게 가격이 하락했다.,0,0,0,0,사실형-긍정-과거-확실
1,TRAIN_03119,"한편, ＇CNK 주가조작 사건＇은 CNK인터내셔널이 2010년 카메룬 다이아몬드 광...",0,0,1,0,사실형-긍정-현재-확실
2,TRAIN_10361,"주민들의 민원을 듣고, 지역구 행사에 가느라 국회의원의 기본 업무인 상임위원회에 참...",0,1,1,0,사실형-부정-현재-확실
3,TRAIN_15694,서울우유협동조합은 우유 크림의 퓨전 디저트 ＇호랑이도 반한 크림떡＇ 3종을 내놨다.,0,0,0,0,사실형-긍정-과거-확실
4,TRAIN_14237,거래량 감소 등 직접적인 타격도 있지만 그보다는 거시경제 전반 상황이 악화됨에 따른...,0,0,1,0,사실형-긍정-현재-확실


In [26]:
# test.head()

In [46]:
# Obtain the KoBERT tokenizer and wrap it, together with the model vocabulary,
# in a gluonnlp BERTSPTokenizer (lower-casing disabled).
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [47]:
class BERTDataset(Dataset):
    """Sentences pre-transformed for BERT, optionally paired with task labels.

    Every entry of the ``"문장"`` column is converted once, at construction
    time, by ``nlp.data.BERTSentenceTransform``.  When ``train`` is truthy the
    columns ``"유형"``, ``"극성"``, ``"시제"`` and ``"확실성"`` are also read and
    stored as ``np.int32`` values.

    Args:
        dataset: Object indexable by column name (e.g. a DataFrame).
        bert_tokenizer: Tokenizer forwarded to the sentence transform.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
        train: Whether labels are loaded and returned by ``__getitem__``.
    """

    def __init__(self, dataset, bert_tokenizer, max_len,
                 pad, pair, train):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.train = train
        self.sentences = [to_bert_input([text]) for text in dataset["문장"]]
        if not train:
            return
        # (attribute name, source column) for each of the four tasks.
        label_columns = (
            ("type_labels", "유형"),
            ("polarity_labels", "극성"),
            ("tense_labels", "시제"),
            ("certainty_labels", "확실성"),
        )
        for attr_name, column in label_columns:
            setattr(self, attr_name, [np.int32(value) for value in dataset[column]])

    def __getitem__(self, i):
        """Return item ``i``.

        With labels: the transformed sentence tuple extended by the four labels.
        Without labels: a 1-tuple wrapping the transformed sentence.
        """
        features = self.sentences[i]
        if not self.train:
            return (features,)
        labels = (
            self.type_labels[i],
            self.polarity_labels[i],
            self.tense_labels[i],
            self.certainty_labels[i],
        )
        return features + labels

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

In [48]:
# Wrap the train/validation frames as labelled BERT datasets (padding on,
# single-sentence input). NOTE: the name `train` is rebound further down by
# `def train(...)`, so this cell has to run before that definition.
data_train = BERTDataset(train, tok, max_len, pad=True, pair=False, train=True)
data_val = BERTDataset(val, tok, max_len, pad=True, pair=False, train=True)

In [49]:
# Training batches are shuffled and the final incomplete batch is dropped;
# validation batches keep their order and include the remainder.
train_dataloader = DataLoader(
    data_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)
val_dataloader = DataLoader(
    data_val,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

In [50]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by four linear heads that share one pooled output.

    The heads produce logits for sentence type (4 classes), polarity (3),
    tense (3) and certainty (2).

    Args:
        bert: Encoder invoked as ``bert(input_ids=..., token_type_ids=...,
            attention_mask=...)``; the second element of its return value is
            used as the pooled sentence representation.
        hidden_size: Input width of every classification head.
        dr_rate: Dropout probability applied to the pooled output; a falsy
            value disables dropout.
        params: Unused; kept so existing call sites stay valid.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.type_classifier = nn.Linear(hidden_size, 4)
        self.polarity_classifier = nn.Linear(hidden_size, 3)
        self.tense_classifier = nn.Linear(hidden_size, 3)
        self.certainty_classifier = nn.Linear(hidden_size, 2)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        The mask has the shape of ``token_ids`` (expected to be 2-D) and lives
        on the same device. It is built with one broadcast comparison instead
        of a Python loop that writes a slice per sample.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.shape[1], device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the ``(type, polarity, tense, certainty)`` logits for a batch."""
        # The mask is already float and on the device of ``token_ids``.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return (self.type_classifier(out),
                self.polarity_classifier(out),
                self.tense_classifier(out),
                self.certainty_classifier(out))

In [51]:
# Build the multi-head classifier on top of the pretrained encoder (dropout
# probability 0.5 on the pooled output) and move it to the selected device.
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [52]:
# Split the model parameters into two optimizer groups: parameters whose name
# contains one of the `no_decay` markers get no weight decay, all others 0.01.
no_decay = ['bias', 'LayerNorm.weight']


def _skips_weight_decay(param_name):
    # True when the parameter name contains any of the no-decay markers.
    return any(marker in param_name for marker in no_decay)


optimizer_grouped_parameters = [
    {
        'params': [
            param for name, param in model.named_parameters()
            if not _skips_weight_decay(name)
        ],
        'weight_decay': 0.01,
    },
    {
        'params': [
            param for name, param in model.named_parameters()
            if _skips_weight_decay(name)
        ],
        'weight_decay': 0.0,
    },
]

In [53]:
# AdamW over the two parameter groups defined above.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# NOTE(review): loss_fn is not used by the visible training/validation code,
# which builds its own per-task criterion dict.
loss_fn = nn.CrossEntropyLoss()

In [54]:
# Total number of optimizer steps (batches per epoch * epochs) and the number
# of those steps spent in learning-rate warm-up.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [55]:
# Cosine learning-rate schedule with warm-up over the step counts computed above.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [56]:
def validation(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without gradient tracking.

    Args:
        model: Callable as ``model(token_ids, valid_length, segment_ids)`` and
            returning the four task logits in the order type, polarity, tense,
            certainty.
        val_loader: Iterable of 7-element batches
            ``(token_ids, valid_length, segment_ids, type, polarity, tense, certainty)``.
        criterion: Dict with a loss callable under each of the keys
            ``'type'``, ``'polarity'``, ``'tense'`` and ``'certainty'``.
        device: Device the inputs and labels are moved to.

    Returns:
        ``(mean_loss, type_f1, polarity_f1, tense_f1, certainty_f1)`` where the
        loss is the mean over batches of the equally weighted (0.25 each) task
        losses and every F1 is the weighted-average F1 over the whole loader.
    """
    model.eval()
    tasks = ('type', 'polarity', 'tense', 'certainty')
    batch_losses = []
    predicted = {task: [] for task in tasks}
    expected = {task: [] for task in tasks}

    with torch.no_grad():
        for batch_id, batch in tqdm(enumerate(val_loader), total=len(val_loader)):
            (token_ids, valid_length, segment_ids,
             type_label, polarity_label, tense_label, certainty_label) = batch

            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            targets = {
                'type': type_label.long().to(device),
                'polarity': polarity_label.long().to(device),
                'tense': tense_label.long().to(device),
                'certainty': certainty_label.long().to(device),
            }

            type_logit, polarity_logit, tense_logit, certainty_logit = model(
                token_ids, valid_length, segment_ids)
            logits = {
                'type': type_logit,
                'polarity': polarity_logit,
                'tense': tense_logit,
                'certainty': certainty_logit,
            }

            # Equal-weight sum of the four task losses, added in task order.
            weighted = [0.25 * criterion[task](logits[task], targets[task]) for task in tasks]
            loss = weighted[0] + weighted[1] + weighted[2] + weighted[3]
            batch_losses.append(loss.item())

            for task in tasks:
                predicted[task] += logits[task].argmax(1).detach().cpu().numpy().tolist()
                expected[task] += targets[task].detach().cpu().numpy().tolist()

    scores = [f1_score(expected[task], predicted[task], average='weighted') for task in tasks]
    return (np.mean(batch_losses), *scores)
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` for ``num_epochs`` epochs, validating and checkpointing each epoch.

    The epoch count and the gradient-clipping threshold are read from the
    module-level ``num_epochs`` and ``max_grad_norm``.

    Args:
        model: Multi-head classifier returning the four task logits.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of 7-element training batches
            ``(token_ids, valid_length, segment_ids, type, polarity, tense, certainty)``.
        val_loader: Loader handed to ``validation`` after every epoch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the inputs and labels are moved to.

    Returns:
        The trained ``model`` (the same object that was passed in).

    Side effects:
        After every epoch the metric log is rewritten to ``log.csv`` and the
        model ``state_dict`` is saved as ``model_state_dict<epoch>.pth`` inside
        the ``saved_models`` directory.
    """
    import os  # local import keeps this cell self-contained

    save_dir = '/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/saved_models'
    # Create the output directory up front so the first save cannot fail after
    # a full epoch of training.
    os.makedirs(save_dir, exist_ok=True)

    log_df = {"train_losses":[], "val_losses":[], "type_f1s":[], "polarity_f1s":[], "tense_f1s":[], "certainty_f1s":[]}

    # The loss modules hold no state, so they are built once instead of per epoch.
    criterion = {
        'type' : nn.CrossEntropyLoss().to(device),
        'polarity' : nn.CrossEntropyLoss().to(device),
        'tense' : nn.CrossEntropyLoss().to(device),
        'certainty' : nn.CrossEntropyLoss().to(device)
    }

    for e in range(num_epochs):
        model.train()
        train_loss = []
        # Iterate the loader that was passed in, not the global `train_dataloader`.
        for (token_ids, valid_length, segment_ids,
             type_labels, polarity_labels, tense_labels, certainty_labels) in tqdm(train_loader, total=len(train_loader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            type_labels = type_labels.long().to(device)
            polarity_labels = polarity_labels.long().to(device)
            tense_labels = tense_labels.long().to(device)
            certainty_labels = certainty_labels.long().to(device)
            out = model(token_ids, valid_length, segment_ids)

            # Equal-weight sum of the four task losses.
            loss = 0.25 * criterion['type'](out[0], type_labels) + \
                        0.25 * criterion['polarity'](out[1], polarity_labels) + \
                        0.25 * criterion['tense'](out[2], tense_labels) + \
                        0.25 * criterion['certainty'](out[3], certainty_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_loss.append(loss.item())

        model.eval()
        val_loss, type_f1, polarity_f1, tense_f1, certainty_f1 = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{e}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{type_f1:.5f}] 극성 F1 : [{polarity_f1:.5f}] 시제 F1 : [{tense_f1:.5f}] 확실성 F1 : [{certainty_f1:.5f}]')

        log_df["val_losses"].append(val_loss)
        log_df["train_losses"].append(np.mean(train_loss))
        log_df["type_f1s"].append(type_f1)
        log_df["polarity_f1s"].append(polarity_f1)
        log_df["tense_f1s"].append(tense_f1)
        log_df["certainty_f1s"].append(certainty_f1)

        # Rewrite the full log and save a checkpoint numbered from 0001.
        pd.DataFrame(log_df).to_csv(f'{save_dir}/log.csv')
        torch.save(model.state_dict(), f"{save_dir}/model_state_dict{str(e+1).zfill(4)}.pth")

    return model

In [57]:
# Run the training loop; the returned model is kept for inference.
infer_model = train(model, optimizer, train_dataloader, val_dataloader, scheduler, device)

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch : [0] Train Loss : [1.09096] Val Loss : [1.05346] 유형 F1 : [0.14893] 극성 F1 : [0.91925] 시제 F1 : [0.05287] 확실성 F1 : [0.40494]


  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [1.06282] Val Loss : [1.01207] 유형 F1 : [0.39740] 극성 F1 : [0.93075] 시제 F1 : [0.09969] 확실성 F1 : [0.63223]


  0%|          | 0/132 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [58]:
# Restore the checkpoint saved after epoch 43 and use it for inference.
# map_location=device remaps the stored tensors, so a checkpoint written on a
# GPU runtime also loads when only the CPU is available.
pretrained_dict = torch.load(
    r'/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/saved_models/model_state_dict0043.pth',
    map_location=device,
)
model.load_state_dict(pretrained_dict)
infer_model = model

In [72]:
# Unlabelled test dataset (train=False) served in its original order.
data_test = BERTDataset(test, tok, max_len, pad=True, pair=False, train=False)
test_dataloader = DataLoader(
    data_test,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

In [73]:
def inference(infer_model, test_dataloader, device):
    """Predict class ids for every sample in ``test_dataloader``.

    Args:
        infer_model: Model returning the four task logits (type, polarity,
            tense, certainty); it is moved to ``device`` and put in eval mode.
        test_dataloader: Loader over an unlabelled ``BERTDataset``.
        device: Device used for the forward passes.

    Returns:
        Four lists of predicted class ids:
        ``(type_preds, polarity_preds, tense_preds, certainty_preds)``.
    """
    infer_model.to(device)
    infer_model.eval()

    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, total=len(test_dataloader)):
            # Without labels the dataset yields a 1-tuple wrapping the
            # transformed sentence, so the three inputs sit one level down.
            token_ids, valid_length, segment_ids = batch[0][0], batch[0][1], batch[0][2]
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            # Call the model that was passed in, not the global `model`.
            type_logit, polarity_logit, tense_logit, certainty_logit = infer_model(token_ids, valid_length, segment_ids)

            type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
            polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
            tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
            certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()

    return type_preds, polarity_preds, tense_preds, certainty_preds

In [74]:
# Predict the four class-id lists for the test set.
type_preds, polarity_preds, tense_preds, certainty_preds = inference(infer_model, test_dataloader, device)

  0%|          | 0/71 [00:00<?, ?it/s]

In [75]:
# Class id -> label name, per task (the list position is the class id).
# NOTE: this rebinds `label_dict`, which earlier held the name -> id mapping
# used to encode train.csv; re-running the encoding cell after this one will
# not work until the original dict is defined again.
label_dict = {"유형":["사실형", "추론형", "대화형", "예측형"], "극성":["긍정", "부정", "미정"], "시제":["과거", "현재", "미래"], "확실성":["확실", "불확실"]}

In [76]:
# Replace every predicted class id by its label name (in place, per task).
type_preds = [label_dict["유형"][x] for x in type_preds]
polarity_preds = [label_dict["극성"][x] for x in polarity_preds]
tense_preds = [label_dict["시제"][x] for x in tense_preds]
certainty_preds = [label_dict["확실성"][x] for x in certainty_preds]

In [77]:
# Join the four per-task label names of each sample into the
# "type-polarity-tense-certainty" string used in the `label` column.
predictions = [
    '-'.join(parts)
    for parts in zip(type_preds, polarity_preds, tense_preds, certainty_preds)
]

In [78]:
# Load the sample submission file as the template and display it.
submit = pd.read_csv('/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/data/sample_submission.csv')
submit

Unnamed: 0,ID,label
0,TEST_0000,추론형-긍정-현재-확실
1,TEST_0001,추론형-긍정-현재-확실
2,TEST_0002,추론형-긍정-현재-확실
3,TEST_0003,추론형-긍정-현재-확실
4,TEST_0004,추론형-긍정-현재-확실
...,...,...
7085,TEST_7085,추론형-긍정-현재-확실
7086,TEST_7086,추론형-긍정-현재-확실
7087,TEST_7087,추론형-긍정-현재-확실
7088,TEST_7088,추론형-긍정-현재-확실


In [79]:
# Sanity check: number of predictions produced for the test set.
len(predictions)

7090

In [80]:
# Overwrite the template labels with the predicted label strings.
submit['label'] = predictions

In [None]:
# Preview the first rows of the filled-in submission.
submit.head()

In [81]:
# Write the submission file without the index column.
submit.to_csv('/content/drive/MyDrive/[데이콘]문장 유형 분류 AI 경진대회/submission/submission.csv', index=False)