
# Pytorch + HuggingFace 
## KoElectra Model
박장원님의 KoElectra-small 사용<br>
https://monologg.kr/2020/05/02/koelectra-part1/<br>
https://github.com/monologg/KoELECTRA

## References
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249

### setting

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 35.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 62.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AdamW, ElectraForPreTraining
from tqdm.notebook import tqdm
import numpy as np
from torch import nn
from sklearn.metrics import f1_score
import random
import os

In [3]:
epochs = 1000
start_epoch = 0
batch_size = 50 # max_batch_size base 50, small 160
learning_rate = 5e-6
seed = 0

In [4]:
# GPU 사용
device = 'cuda' if torch.cuda.is_available() else 'cpu'
max_grad_norm = 1

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed) # Seed 고정

### Dataset 만들어서 불러오기 

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
label_dict = {"유형":{"사실형":0, "추론형":1, "대화형":2, "예측형":3}, "극성":{"긍정":0, "부정":1, "미정":2}, "시제":{"과거":0, "현재":1, "미래":2}, "확실성":{"확실":0, "불확실":1}}

In [8]:
df = pd.read_csv("/content/drive/MyDrive/NLP/trainset/train_drop_duplicates.csv")
for label in label_dict:
  for i in range(len(df)):
    df[label].iloc[i] = label_dict[label][df[label].iloc[i]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [9]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(df, test_size=0.2, random_state=seed, stratify = df["유형"])

In [10]:
train_type_counts = list(train["유형"].value_counts())
train_polarity_counts = list(train["극성"].value_counts())
train_tense_counts = list(train["시제"].value_counts())
train_certainty_counts = list(train["확실성"].value_counts())
val_type_counts = list(val["유형"].value_counts())
val_polarity_counts = list(val["극성"].value_counts())
val_tense_counts = list(val["시제"].value_counts())
val_certainty_counts = list(val["확실성"].value_counts())

In [11]:
print(train_type_counts, val_type_counts)
print(train_polarity_counts, val_polarity_counts)
print(train_tense_counts, val_tense_counts)
print(train_certainty_counts, val_certainty_counts)

[10821, 1717, 458, 205] [2706, 429, 115, 51]
[12609, 443, 149] [3145, 122, 34]
[6433, 5437, 1331] [1580, 1412, 309]
[12127, 1074] [3033, 268]


In [12]:
# train = pd.read_csv("/content/drive/MyDrive/NLP/trainset/train_aug_type_fin_drop.csv")
# val = pd.read_csv("/content/drive/MyDrive/NLP/trainset/val_type.csv")
# for label in label_dict:
#   for i in range(len(val)):
#     val[label].iloc[i] = label_dict[label][val[label].iloc[i]]

# for label in label_dict:
#   for i in range(len(train)):
#     train[label].iloc[i] = label_dict[label][train[label].iloc[i]]

In [13]:
class ElectraDataset(Dataset):
  
  def __init__(self, dataset, train):
    self.dataset = dataset
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v2-discriminator")
    self.sentences = self.dataset["문장"].tolist()
    self.train = train
    if self.train:
      self.type_labels = [np.int32(i) for i in self.dataset["유형"]]
      self.polarity_labels = [np.int32(i) for i in self.dataset["극성"]]
      self.tense_labels = [np.int32(i) for i in self.dataset["시제"]]
      self.certainty_labels = [np.int32(i) for i in self.dataset["확실성"]]

  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    
      inputs = self.tokenizer(
          self.sentences[idx], 
          return_tensors='pt',
          truncation=True,
          max_length=256,
          pad_to_max_length=True,
          add_special_tokens=True
          )
      input_ids = inputs['input_ids'][0]
      attention_mask = inputs['attention_mask'][0]
      if self.train:
        return input_ids, attention_mask, self.type_labels[idx], self.polarity_labels[idx], self.tense_labels[idx], self.certainty_labels[idx]
      return input_ids, attention_mask

In [14]:
train_dataset = ElectraDataset(train, True)
val_dataset = ElectraDataset(val, True)
# train_dataset = ElectraDataset("/content/drive/MyDrive/NLP/trainset/trian_split_by_type.csv", True)
# val_dataset = ElectraDataset("/content/drive/MyDrive/NLP/trainset/validation_split_by_type.csv", True)

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

### Create Model

In [15]:
electra = ElectraForPreTraining.from_pretrained("monologg/koelectra-base-v2-discriminator")

# 한번 실행해보기
# text, attention_mask, y1, y2, y3, y4 = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [16]:
# model.load_state_dict(torch.load("model.pt"))

In [17]:
# 모델 레이어 보기
# electra

In [18]:
electra.to(device)

ElectraForPreTraining(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

In [19]:
# text, attention_mask, y1, y2, y3, y4 = train_dataset[0]
# electra(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

In [20]:
class ElectraClassifier(nn.Module):
    def __init__(self, electra):
        super(ElectraClassifier, self).__init__()
        self.electra = electra
        self.type_classifier = nn.Linear(256 , 4)
        self.polarity_classifier = nn.Linear(256 , 3)
        self.tense_classifier = nn.Linear(256 , 3)
        self.certainty_classifier = nn.Linear(256 , 2)
  
    def forward(self, input_ids, attention_mask):
        out = self.electra(input_ids, attention_mask = attention_mask).logits
        return (self.type_classifier(out), self.polarity_classifier(out), self.tense_classifier(out), self.certainty_classifier(out))

In [21]:
model = ElectraClassifier(electra)

In [22]:
model.to(device)

ElectraClassifier(
  (electra): ElectraForPreTraining(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(32200, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_features=768, out_

In [23]:
# class FocalLoss(nn.Module):
#     def __init__(self, gamma=2, alpha=0.25):
#         super(FocalLoss, self).__init__()
#         self.loss_fn = nn.BCEWithLogitsLoss()
#         self.gamma = gamma
#         self.alpha = alpha

#     def forward(self, pred, true):
#         bceloss = self.loss_fn(pred, true.float())
#         pred_prob = torch.sigmoid(pred)  # p  pt는 p가 true 이면 pt = p / false 이면 pt = 1 - p
#         alpha_factor = true * self.alpha + (1-true) * (1 - self.alpha)  # add balance
#         modulating_factor = torch.abs(true - pred_prob) ** self.gamma  # focal term
#         loss = alpha_factor * modulating_factor * bceloss  # bceloss에 이미 음수가 들어가 있음

#         return loss.mean()

### load_pretrain model

In [None]:
### runtime 오류로 기존모델에서 불러올때
start_epoch = 1
pretrained_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/model_state_dict0001.pth', map_location=device)
model.load_state_dict(pretrained_dict)

<All keys matched successfully>

### Learn

In [24]:
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)



In [25]:
# def validation(model, val_loader, criterion, device):
#     model.eval()
#     val_loss = []
    
#     type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
#     type_labels, polarity_labels, tense_labels, certainty_labels = [], [], [], []
    
    
#     with torch.no_grad():
#       for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(val_loader):
#         type_label = type_label.long().to(device)
#         polarity_label = polarity_label.long().to(device)
#         tense_label = tense_label.long().to(device)
#         certainty_label = certainty_label.long().to(device)
#         type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))
#         loss = 0.25 * criterion['type'](type_logit, type_label) + \
#                     0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
#                     0.25 * criterion['tense'](tense_logit, tense_label) + \
#                     0.25 * criterion['certainty'](certainty_logit, certainty_label)
          
#         val_loss.append(loss.item())
        
#         type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
#         type_labels += type_label.detach().cpu().numpy().tolist()
        
#         polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
#         polarity_labels += polarity_label.detach().cpu().numpy().tolist()
        
#         tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
#         tense_labels += tense_label.detach().cpu().numpy().tolist()
        
#         certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
#         certainty_labels += certainty_label.detach().cpu().numpy().tolist()
#     type_f1 = f1_score(type_labels, type_preds, average='weighted')
#     polarity_f1 = f1_score(polarity_labels, polarity_preds, average='weighted')
#     tense_f1 = f1_score(tense_labels, tense_preds, average='weighted')
#     certainty_f1 = f1_score(certainty_labels, certainty_preds, average='weighted')
    
#     return np.mean(val_loss), type_f1, polarity_f1, tense_f1, certainty_f1

In [26]:
# def train(model, optimizer, train_loader, val_loader, device):
#   log_df = {"train_losses":[], "val_losses":[], "type_f1s":[], "polarity_f1s":[], "tense_f1s":[], "certainty_f1s":[]}

#   criterion = {
#       'type' : nn.CrossEntropyLoss().to(device),
#       'polarity' : nn.CrossEntropyLoss().to(device),
#       'tense' : nn.CrossEntropyLoss().to(device),
#       'certainty' : nn.CrossEntropyLoss().to(device)
#   }
  
#   for e in range(start_epoch, epochs):
#     train_loss = []
#     for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(train_loader):
#       model.train()
#       optimizer.zero_grad()
#       type_label = type_label.long().to(device)
#       polarity_label = polarity_label.long().to(device)
#       tense_label = tense_label.long().to(device)
#       certainty_label = certainty_label.long().to(device)
#       type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))

#       loss = 0.25 * criterion['type'](type_logit, type_label) + \
#                   0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
#                   0.25 * criterion['tense'](tense_logit, tense_label) + \
#                   0.25 * criterion['certainty'](certainty_logit, certainty_label)

#       loss.backward()
#       torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#       optimizer.step()
#       train_loss.append(loss.item())

#     model.eval()
#     val_loss, type_f1, polarity_f1, tense_f1, certainty_f1 = validation(model, val_loader, criterion, device)
#     print(f'Epoch : [{e+1}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{type_f1:.5f}] 극성 F1 : [{polarity_f1:.5f}] 시제 F1 : [{tense_f1:.5f}] 확실성 F1 : [{certainty_f1:.5f}]')
    
#     log_df["val_losses"].append(val_loss)
#     log_df["train_losses"].append(np.mean(train_loss))
#     log_df["type_f1s"].append(type_f1)
#     log_df["polarity_f1s"].append(polarity_f1)
#     log_df["tense_f1s"].append(tense_f1)
#     log_df["certainty_f1s"].append(certainty_f1)

#     pd.DataFrame(log_df).to_csv('/content/drive/MyDrive/NLP/saved_models/log.csv')
#     torch.save(model.state_dict(), f"/content/drive/MyDrive/NLP/saved_models/model_state_dict{str(e+1).zfill(4)}.pth")

In [27]:
# def validation(model, val_loader, criterion, device):
#     model.eval()
#     val_loss = []
    
#     type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
#     type_labels, polarity_labels, tense_labels, certainty_labels = [], [], [], []
    
    
#     with torch.no_grad():
#       for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(val_loader):
#         type_label_hot = F.one_hot(type_label.to(torch.int64), num_classes=4).float().to(device)
#         polarity_label_hot = F.one_hot(polarity_label.to(torch.int64), num_classes=3).float().to(device)
#         tense_label_hot = F.one_hot(tense_label.to(torch.int64), num_classes=3).float().to(device)
#         certainty_label_hot = F.one_hot(certainty_label.to(torch.int64), num_classes=2).float().to(device) 
#         type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))
#         loss = 0.25 * criterion['type'](type_logit, type_label_hot) + \
#                     0.25 * criterion['polarity'](polarity_logit, polarity_label_hot) + \
#                     0.25 * criterion['tense'](tense_logit, tense_label_hot) + \
#                     0.25 * criterion['certainty'](certainty_logit, certainty_label_hot)
          
#         val_loss.append(loss.item())

#         type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
#         type_labels += type_label.detach().cpu().numpy().tolist()

#         polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
#         polarity_labels += polarity_label.detach().cpu().numpy().tolist()
        
#         tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
#         tense_labels += tense_label.detach().cpu().numpy().tolist()
        
#         certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
#         certainty_labels += certainty_label.detach().cpu().numpy().tolist()

#     type_f1 = f1_score(type_labels, type_preds, average='weighted')
#     polarity_f1 = f1_score(polarity_labels, polarity_preds, average='weighted')
#     tense_f1 = f1_score(tense_labels, tense_preds, average='weighted')
#     certainty_f1 = f1_score(certainty_labels, certainty_preds, average='weighted')
    
#     return np.mean(val_loss), type_f1, polarity_f1, tense_f1, certainty_f1

In [28]:
# def train(model, optimizer, train_loader, val_loader, device):
#   log_df = {"train_losses":[], "val_losses":[], "type_f1s":[], "polarity_f1s":[], "tense_f1s":[], "certainty_f1s":[]}

#   criterion = {
#       'type' : FocalLoss().to(device),
#       'polarity' : FocalLoss().to(device),
#       'tense' : FocalLoss().to(device),
#       'certainty' : FocalLoss().to(device)
#   }

#   for e in range(start_epoch, epochs):
#     train_loss = []
#     for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(train_loader):
#       model.train()
#       optimizer.zero_grad()
      
#       type_label = F.one_hot(type_label.to(torch.int64), num_classes=4).float().to(device)
#       polarity_label = F.one_hot(polarity_label.to(torch.int64), num_classes=3).float().to(device)
#       tense_label = F.one_hot(tense_label.to(torch.int64), num_classes=3).float().to(device)
#       certainty_label = F.one_hot(certainty_label.to(torch.int64), num_classes=2).float().to(device) 
#       type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))

#       loss = 0.25 * criterion['type'](type_logit, type_label) + \
#                   0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
#                   0.25 * criterion['tense'](tense_logit, tense_label) + \
#                   0.25 * criterion['certainty'](certainty_logit, certainty_label)

#       loss.backward()
#       torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#       optimizer.step()
#       train_loss.append(loss.item())
      
#     model.eval()
#     val_loss, type_f1, polarity_f1, tense_f1, certainty_f1 = validation(model, val_loader, criterion, device)
#     print(f'Epoch : [{e+1}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{type_f1:.5f}] 극성 F1 : [{polarity_f1:.5f}] 시제 F1 : [{tense_f1:.5f}] 확실성 F1 : [{certainty_f1:.5f}]')
    
#     log_df["val_losses"].append(val_loss)
#     log_df["train_losses"].append(np.mean(train_loss))
#     log_df["type_f1s"].append(type_f1)
#     log_df["polarity_f1s"].append(polarity_f1)
#     log_df["tense_f1s"].append(tense_f1)
#     log_df["certainty_f1s"].append(certainty_f1)

#     pd.DataFrame(log_df).to_csv('/content/drive/MyDrive/NLP/saved_models/log.csv')
#     torch.save(model.state_dict(), f"/content/drive/MyDrive/NLP/saved_models/model_state_dict{str(e+1).zfill(4)}.pth")

In [29]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
    type_labels, polarity_labels, tense_labels, certainty_labels = [], [], [], []
    
    
    with torch.no_grad():
      for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(val_loader):
        type_label = type_label.long().to(device)
        polarity_label = polarity_label.long().to(device)
        tense_label = tense_label.long().to(device)
        certainty_label = certainty_label.long().to(device)
        type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))
        loss = 0.25 * criterion['type'](type_label, type_logit, val_type_counts, 4, "softmax", 0.999) + \
              0.25 * criterion['polarity'](polarity_label, polarity_logit, val_polarity_counts, 3, "softmax", 0.999) + \
              0.25 * criterion['tense'](tense_label, tense_logit, val_tense_counts, 3, "softmax", 0.999) + \
              0.25 * criterion['certainty'](certainty_label, certainty_logit, val_certainty_counts, 2, "softmax", 0.999)

        val_loss.append(loss.item())
        
        type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
        type_labels += type_label.detach().cpu().numpy().tolist()
        
        polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
        polarity_labels += polarity_label.detach().cpu().numpy().tolist()
        
        tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
        tense_labels += tense_label.detach().cpu().numpy().tolist()
        
        certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
        certainty_labels += certainty_label.detach().cpu().numpy().tolist()
    type_f1 = f1_score(type_labels, type_preds, average='weighted')
    polarity_f1 = f1_score(polarity_labels, polarity_preds, average='weighted')
    tense_f1 = f1_score(tense_labels, tense_preds, average='weighted')
    certainty_f1 = f1_score(certainty_labels, certainty_preds, average='weighted')
    
    return np.mean(val_loss), type_f1, polarity_f1, tense_f1, certainty_f1

In [30]:
def train(model, optimizer, train_loader, val_loader, device):
  log_df = {"train_losses":[], "val_losses":[], "type_f1s":[], "polarity_f1s":[], "tense_f1s":[], "certainty_f1s":[]}

  criterion = {
      'type' : CB_loss,
      'polarity' : CB_loss,
      'tense' : CB_loss,
      'certainty' : CB_loss
  }
  
  for e in range(start_epoch, epochs):
    train_loss = []
    for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(train_loader):
      model.train()
      optimizer.zero_grad()
      type_label = type_label.long().to(device)
      polarity_label = polarity_label.long().to(device)
      tense_label = tense_label.long().to(device)
      certainty_label = certainty_label.long().to(device)
      type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))

      loss = 0.25 * criterion['type'](type_label, type_logit, train_type_counts, 4, "softmax", 0.999) + \
                  0.25 * criterion['polarity'](polarity_label, polarity_logit, train_polarity_counts, 3, "softmax", 0.999) + \
                  0.25 * criterion['tense'](tense_label, tense_logit, train_tense_counts, 3, "softmax", 0.999) + \
                  0.25 * criterion['certainty'](certainty_label, certainty_logit, train_certainty_counts, 2, "softmax", 0.999)

      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
      optimizer.step()
      train_loss.append(loss.item())

    model.eval()
    val_loss, type_f1, polarity_f1, tense_f1, certainty_f1 = validation(model, val_loader, criterion, device)
    print(f'Epoch : [{e+1}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{type_f1:.5f}] 극성 F1 : [{polarity_f1:.5f}] 시제 F1 : [{tense_f1:.5f}] 확실성 F1 : [{certainty_f1:.5f}]')
    
    log_df["val_losses"].append(val_loss)
    log_df["train_losses"].append(np.mean(train_loss))
    log_df["type_f1s"].append(type_f1)
    log_df["polarity_f1s"].append(polarity_f1)
    log_df["tense_f1s"].append(tense_f1)
    log_df["certainty_f1s"].append(certainty_f1)

    pd.DataFrame(log_df).to_csv('/content/drive/MyDrive/NLP/saved_models/log.csv')
    torch.save(model.state_dict(), f"/content/drive/MyDrive/NLP/saved_models/model_state_dict{str(e+1).zfill(4)}.pth")

In [31]:
def CB_loss(labels, logits, samples_per_cls, no_of_classes, loss_type, beta):
    """Compute the Class Balanced Loss between `logits` and the ground truth `labels`.
    Class Balanced Loss: ((1-beta)/(1-beta^n))*Loss(labels, logits)
    where Loss is one of the standard losses used for Neural Networks.
    Args:
      labels: A int tensor of size [batch].
      logits: A float tensor of size [batch, no_of_classes].
      samples_per_cls: A python list of size [no_of_classes].
      no_of_classes: total number of classes. int
      loss_type: string. One of "sigmoid", "focal", "softmax".
      beta: float. Hyperparameter for Class balanced loss.
      gamma: float. Hyperparameter for Focal loss.
    Returns:
      cb_loss: A float tensor representing class balanced loss
    """
    effective_num = 1.0 - np.power(beta, samples_per_cls)
    weights = (1.0 - beta) / np.array(effective_num)
    weights = weights / np.sum(weights) * no_of_classes

    labels_one_hot = F.one_hot(labels, no_of_classes).float().to(device)

    weights = torch.tensor(weights).float()
    weights = weights.unsqueeze(0)
    weights = weights.repeat(labels_one_hot.shape[0],1) * labels_one_hot
    weights = weights.sum(1)
    weights = weights.unsqueeze(1)
    weights = weights.repeat(1,no_of_classes)
    loss = F.binary_cross_entropy.to(device)
    if loss_type == "sigmoid":
        cb_loss = F.binary_cross_entropy_with_logits(input = logits,target = labels_one_hot, weights = weights)
    elif loss_type == "softmax":
        pred = logits.softmax(dim = 1)
        cb_loss = loss(input = pred, target = labels_one_hot, weight = weights)
    return cb_loss

In [32]:
model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/265 [00:00<?, ?it/s]



RuntimeError: ignored

### Inference

In [None]:
pretrained_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/model_state_dict0004.pth', map_location=device)
model.load_state_dict(pretrained_dict)
infer_model = model

In [None]:
data_test = ElectraDataset("/content/drive/MyDrive/NLP/trainset/test.csv", False)
test_dataloader = DataLoader(data_test, batch_size=batch_size)

In [None]:
len(data_test)

7090

In [None]:
def inference(infer_model, test_dataloader, device):
    infer_model.to(device)
    infer_model.eval()
    
    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
    
    with torch.no_grad():
      for i in tqdm(test_dataloader):

        type_logit, polarity_logit, tense_logit, certainty_logit = infer_model(i[0].to(device), i[1].to(device))
        
        type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
        polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
        tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
        certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
        
    return type_preds, polarity_preds, tense_preds, certainty_preds

In [None]:
type_preds, polarity_preds, tense_preds, certainty_preds = inference(infer_model, test_dataloader, device)

  0%|          | 0/142 [00:00<?, ?it/s]



In [None]:
label_dict = {"유형":["사실형", "추론형", "대화형", "예측형"], "극성":["긍정", "부정", "미정"], "시제":["과거", "현재", "미래"], "확실성":["확실", "불확실"]}

In [None]:
len(type_preds)

7090

In [None]:
type_preds = [label_dict["유형"][x] for x in type_preds]
polarity_preds = [label_dict["극성"][x] for x in polarity_preds]
tense_preds = [label_dict["시제"][x] for x in tense_preds]
certainty_preds = [label_dict["확실성"][x] for x in certainty_preds]

In [None]:
predictions = []
for type_pred, polarity_pred, tense_pred, certainty_pred in zip(type_preds, polarity_preds, tense_preds, certainty_preds):
    predictions.append(type_pred+'-'+polarity_pred+'-'+tense_pred+'-'+certainty_pred)

In [None]:
# print(len(val["문장"]), len(polarity_preds))
# pd.DataFrame({"sentence":val["문장"], "유형":[label_dict["유형"][x] for x in val["유형"]], "극성":[label_dict["극성"][x] for x in val["극성"]], "시제":[label_dict["시제"][x] for x in val["시제"]], "확실성":[label_dict["확실성"][x] for x in val["확실성"]], "type_preds":type_preds, "polarity_preds":polarity_preds, "tense_preds":tense_preds, "certainty_preds":certainty_preds, "predictions":predictions}).to_csv("./prediction.csv")

In [None]:
len(predictions)

7090

### Submission

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/NLP/sample_submission.csv')
submit['label'] = predictions

In [None]:
submit.head()

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-과거-확실
4,TEST_0004,사실형-긍정-과거-확실


In [None]:
submit.to_csv('/content/drive/MyDrive/NLP/submissions/submission.csv', index=False)

### validation to csv

In [None]:
pretrained_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/model_state_dict0018.pth', map_location=device)
model.load_state_dict(pretrained_dict)
infer_model = model

In [None]:
data_test = ElectraDataset("/content/drive/MyDrive/NLP/trainset/validation_split_by_polarity.csv", True)
test_dataloader = DataLoader(data_test, batch_size=batch_size)

In [None]:
def inference(infer_model, test_dataloader, device):
    infer_model.to(device)
    infer_model.eval()
    
    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
    
    with torch.no_grad():
      for i in tqdm(test_dataloader):

        type_logit, polarity_logit, tense_logit, certainty_logit = infer_model(i[0].to(device), i[1].to(device))
        
        type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
        polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
        tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
        certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
        
    return type_preds, polarity_preds, tense_preds, certainty_preds

In [None]:
type_preds, polarity_preds, tense_preds, certainty_preds = inference(infer_model, test_dataloader, device)

In [None]:
label_dict = {"유형":["사실형", "추론형", "대화형", "예측형"], "극성":["긍정", "부정", "미정"], "시제":["과거", "현재", "미래"], "확실성":["확실", "불확실"]}

In [None]:
len(type_preds)

In [None]:
type_preds = [label_dict["유형"][x] for x in type_preds]
polarity_preds = [label_dict["극성"][x] for x in polarity_preds]
tense_preds = [label_dict["시제"][x] for x in tense_preds]
certainty_preds = [label_dict["확실성"][x] for x in certainty_preds]

In [None]:
predictions = []
for type_pred, polarity_pred, tense_pred, certainty_pred in zip(type_preds, polarity_preds, tense_preds, certainty_preds):
    predictions.append(type_pred+'-'+polarity_pred+'-'+tense_pred+'-'+certainty_pred)

In [None]:
val = pd.read_csv("/content/drive/MyDrive/NLP/trainset/validation_split_by_polarity.csv")
print(len(val["문장"]), len(polarity_preds))
pd.DataFrame({"sentence":val["문장"], "유형":[label_dict["유형"][x] for x in val["유형"]], "극성":[label_dict["극성"][x] for x in val["극성"]], "시제":[label_dict["시제"][x] for x in val["시제"]], "확실성":[label_dict["확실성"][x] for x in val["확실성"]], "type_preds":type_preds, "polarity_preds":polarity_preds, "tense_preds":tense_preds, "certainty_preds":certainty_preds, "predictions":predictions}).to_csv("/content/drive/MyDrive/NLP/submissions/prediction.csv")

### preprocessing

In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/NLP/trainset/train_aug_type_fin.csv")

In [None]:
print(len(df), len(set(df["문장"]))) ### 중복 개수 35개
df = df.drop_duplicates(["문장", "label"], keep='first').drop_duplicates(["문장"], keep=False) ## 문장, label이 모두 같음: keep first, 중복 제거후 문장만 같음(라벨이 다름), keep: False
print(len(df["문장"]), len(set(df["문장"])))

18997 18586
18585 18585


In [None]:
df.to_csv("/content/drive/MyDrive/NLP/trainset/train_aug_type_fin.csv")

In [None]:
label_dict = {"유형":{"사실형":0, "추론형":1, "대화형":2, "예측형":3}, "극성":{"긍정":0, "부정":1, "미정":2}, "시제":{"과거":0, "현재":1, "미래":2}, "확실성":{"확실":0, "불확실":1}}

In [None]:
for label in label_dict:
  for i in range(len(df)):
    df[label].iloc[i] = label_dict[label][df[label].iloc[i]]

In [None]:
from sklearn.model_selection import train_test_split
seed = 0

In [None]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=seed, stratify = df["극성"])

In [None]:
train.to_csv('/content/drive/MyDrive/NLP/trainset/train_split_by_polarity.csv')

In [None]:
val.to_csv('/content/drive/MyDrive/NLP/trainset/validation_split_by_polarity.csv')