
# PyTorch + Hugging Face
## KoElectra Model
박장원님의 KoElectra-small 사용<br>
https://monologg.kr/2020/05/02/koelectra-part1/<br>
https://github.com/monologg/KoELECTRA

## References
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249

### setting

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 12.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 74.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AdamW, ElectraForSequenceClassification
from tqdm.notebook import tqdm
import numpy as np
from torch import nn
from sklearn.metrics import f1_score
import random
import os

In [3]:
# Training hyperparameters (read as globals by the training cells below).
# Upper bound on epochs; the training loop runs range(start_epoch, epochs).
epochs = 1000
# 0-based epoch index to start from (overridden when resuming from a checkpoint).
start_epoch = 0
batch_size = 160 # max batch size: 50 for the base model, 160 for the small model
learning_rate = 5e-6
# Seed passed to seed_everything().
seed = 0

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Maximum gradient norm used for gradient clipping in the training loop.
max_grad_norm = 1

In [5]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), sets
    PYTHONHASHSEED, and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # also covers multi-GPU runtimes
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which breaks
    # reproducibility, so it must be off when deterministic results are wanted.
    torch.backends.cudnn.benchmark = False

seed_everything(seed) # fix the random seeds

In [6]:
# Label column (Korean column name in the CSV files) that this run trains a classifier for.
now = "확실성"
# Per label column: [English name used in file names, number of classes].
class_label = {"유형":["type", 4], "극성":["polarity", 3], "시제":["tense", 3], "확실성":["certainty", 2]}

### Dataset 만들어서 불러오기 

In [7]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used below resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
class ElectraDataset(Dataset):
  """Sentence dataset read from a CSV file and tokenized for KoELECTRA-small.

  Each item is `(input_ids, attention_mask)` for unlabeled data, or
  `(input_ids, attention_mask, label)` when `train` is truthy. Sentences are read
  from the "문장" column and tokenized on access, truncated/padded to 256 tokens.

  Args:
    csv_file: Path of the CSV file to load with pandas.
    train: When truthy, labels are loaded and returned with every item.
    label_column: Name of the CSV column holding integer labels. Defaults to the
      notebook-level `now` setting, which keeps existing two-argument calls working.
  """

  def __init__(self, csv_file, train, label_column=None):
    self.dataset = pd.read_csv(csv_file)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
    self.sentences = self.dataset["문장"].tolist()
    self.train = train
    if self.train:
      # Fall back to the globally selected label when no column is given explicitly.
      column = now if label_column is None else label_column
      self.labels = [np.int32(i) for i in self.dataset[column]]

  def __len__(self):
    """Return the number of rows in the CSV file."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Tokenize sentence `idx` and return its tensors (plus the label when training)."""
    inputs = self.tokenizer(
        self.sentences[idx],
        return_tensors='pt',
        truncation=True,
        max_length=256,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',
        add_special_tokens=True
        )
    # The tokenizer returns a batch of one; drop the batch dimension.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    if self.train:
      return input_ids, attention_mask, self.labels[idx]
    return input_ids, attention_mask

In [9]:
# Load the train/validation splits stratified on the currently selected label (`now`).
train_dataset = ElectraDataset(f"/content/drive/MyDrive/NLP/trainset/train_split_by_{class_label[now][0]}.csv", True)
val_dataset = ElectraDataset(f"/content/drive/MyDrive/NLP/trainset/validation_split_by_{class_label[now][0]}.csv", True)
# Earlier hard-coded variant for the "type" label, kept for reference:
# train_dataset = ElectraDataset("/content/drive/MyDrive/NLP/trainset/train_split_by_type.csv", True)
# val_dataset = ElectraDataset("/content/drive/MyDrive/NLP/trainset/validation_split_by_type.csv", True)

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/486 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

### Create Model

In [10]:
# Load the KoELECTRA-small discriminator with a classification head sized for the selected label.
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=class_label[now][1])

# Smoke test: run a single example through the model.
# NOTE(review): the unpacking below expects six values per sample, but ElectraDataset
# returns three (input_ids, attention_mask, label) — update before re-enabling.
# text, attention_mask, y1, y2, y3, y4 = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Downloading:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [11]:
# Move the model parameters to the selected device (the cell output is the module repr).
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [12]:
# class FocalLoss(nn.Module):
#     def __init__(self, gamma=2, alpha=0.25):
#         super(FocalLoss, self).__init__()
#         self.loss_fn = nn.BCEWithLogitsLoss()
#         self.gamma = gamma
#         self.alpha = alpha

#     def forward(self, pred, true):
#         bceloss = self.loss_fn(pred, true.float())
#         pred_prob = torch.sigmoid(pred)  # p  pt는 p가 true 이면 pt = p / false 이면 pt = 1 - p
#         alpha_factor = true * self.alpha + (1-true) * (1 - self.alpha)  # add balance
#         modulating_factor = torch.abs(true - pred_prob) ** self.gamma  # focal term
#         loss = alpha_factor * modulating_factor * bceloss  # bceloss에 이미 음수가 들어가 있음

#         return loss.mean()

### Load pretrained model (resume from checkpoint)

In [None]:
### Use this cell to resume from a saved model after a runtime error
checkpoint_path = r'/content/drive/MyDrive/NLP/saved_models/221217/certainty/certaintymodel_state_dict0012.pth'
# Checkpoints are written as "...state_dict{e+1:04d}.pth" after each finished epoch, so the
# four digits before ".pth" are the number of completed epochs, i.e. the 0-based epoch
# to continue from. Deriving it from the path keeps epoch numbering and checkpoint
# names continuous instead of restarting at 1 and overwriting earlier files.
start_epoch = int(os.path.splitext(checkpoint_path)[0][-4:])
pretrained_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(pretrained_dict)

### Learn

In [13]:
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of `torch.optim.AdamW`;
# the two have different defaults (e.g. weight decay), so switching needs a deliberate decision.
optimizer = AdamW(model.parameters(), lr=learning_rate)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation does not need a random order. A fixed order keeps the reported loss
# (a mean over per-batch means, including the last partial batch) reproducible.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



In [14]:
def validation(model, val_loader, criterion, device):
    """Evaluate `model` on `val_loader` without tracking gradients.

    Args:
        model: Classifier whose forward output exposes `.logits`.
        val_loader: Iterable yielding (input_ids, attention_mask, label) batches.
        criterion: Loss callable applied to (logits, labels).
        device: Device the batches are moved to.

    Returns:
        Tuple of (mean of the per-batch losses, weighted F1 score over all samples).
    """
    model.eval()
    batch_losses, predicted, expected = [], [], []

    with torch.no_grad():
        for ids, masks, targets in tqdm(val_loader):
            targets = targets.long().to(device)
            logits = model(ids.to(device), masks.to(device)).logits

            batch_losses.append(criterion(logits, targets).item())

            # Collect class predictions and ground truth as plain Python lists.
            predicted.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            expected.extend(targets.detach().cpu().numpy().tolist())

    weighted_f1 = f1_score(expected, predicted, average='weighted')
    return np.mean(batch_losses), weighted_f1

In [15]:
def train(model, optimizer, train_loader, val_loader, device):
  """Fine-tune `model` with cross-entropy loss, validating and checkpointing every epoch.

  Reads the notebook globals `start_epoch`, `epochs`, `max_grad_norm`, `class_label`
  and `now`. After each epoch it prints the metrics, rewrites the log CSV and saves
  the model's state dict under the saved_models directory on Drive.

  Args:
    model: Classifier whose forward output exposes `.logits`.
    optimizer: Optimizer already bound to `model`'s parameters.
    train_loader: Iterable yielding (input_ids, attention_mask, label) batches.
    val_loader: Loader passed to `validation` after every epoch.
    device: Device the batches are moved to.

  Returns:
    The trained model (the same object that was passed in), so that
    `model = train(...)` keeps a usable reference instead of None.
  """
  log_df = {"train_losses":[], "val_losses":[], "f1s":[]}

  criterion = nn.CrossEntropyLoss().to(device)

  for e in range(start_epoch, epochs):
    # Validation leaves the model in eval mode, so switch back once per epoch.
    model.train()
    train_loss = []
    for input_ids_batch, attention_masks_batch, label in tqdm(train_loader):
      optimizer.zero_grad()
      label = label.long().to(device)
      logit = model(input_ids_batch.to(device), attention_masks_batch.to(device)).logits

      loss = criterion(logit, label)

      loss.backward()
      # Clip the global gradient norm before the update.
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
      optimizer.step()
      train_loss.append(loss.item())

    model.eval()
    val_loss, f1 = validation(model, val_loader, criterion, device)
    mean_train_loss = np.mean(train_loss)
    print(f'Epoch : [{e+1}] Train Loss : [{mean_train_loss:.5f}] Val Loss : [{val_loss:.5f}] f1 : [{f1:.5f}]')

    log_df["val_losses"].append(val_loss)
    log_df["train_losses"].append(mean_train_loss)
    log_df["f1s"].append(f1)

    # Rewrite the full log and save a checkpoint after every epoch so an interrupted
    # run loses at most the epoch in progress.
    pd.DataFrame(log_df).to_csv('/content/drive/MyDrive/NLP/saved_models/log.csv')
    torch.save(model.state_dict(), f"/content/drive/MyDrive/NLP/saved_models/{class_label[now][0]}model_state_dict{str(e+1).zfill(4)}.pth")

  return model

In [16]:
# def validation(model, val_loader, criterion, device):
#     model.eval()
#     val_loss = []
    
#     type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
#     type_labels, polarity_labels, tense_labels, certainty_labels = [], [], [], []
    
    
#     with torch.no_grad():
#       for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(val_loader):
#         type_label_hot = F.one_hot(type_label.to(torch.int64), num_classes=4).float().to(device)
#         polarity_label_hot = F.one_hot(polarity_label.to(torch.int64), num_classes=3).float().to(device)
#         tense_label_hot = F.one_hot(tense_label.to(torch.int64), num_classes=3).float().to(device)
#         certainty_label_hot = F.one_hot(certainty_label.to(torch.int64), num_classes=2).float().to(device) 
#         type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))
#         loss = 0.25 * criterion['type'](type_logit, type_label_hot) + \
#                     0.25 * criterion['polarity'](polarity_logit, polarity_label_hot) + \
#                     0.25 * criterion['tense'](tense_logit, tense_label_hot) + \
#                     0.25 * criterion['certainty'](certainty_logit, certainty_label_hot)
          
#         val_loss.append(loss.item())

#         type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
#         type_labels += type_label.detach().cpu().numpy().tolist()

#         polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
#         polarity_labels += polarity_label.detach().cpu().numpy().tolist()
        
#         tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
#         tense_labels += tense_label.detach().cpu().numpy().tolist()
        
#         certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
#         certainty_labels += certainty_label.detach().cpu().numpy().tolist()

#     type_f1 = f1_score(type_labels, type_preds, average='weighted')
#     polarity_f1 = f1_score(polarity_labels, polarity_preds, average='weighted')
#     tense_f1 = f1_score(tense_labels, tense_preds, average='weighted')
#     certainty_f1 = f1_score(certainty_labels, certainty_preds, average='weighted')
    
#     return np.mean(val_loss), type_f1, polarity_f1, tense_f1, certainty_f1

In [17]:
# def train(model, optimizer, train_loader, val_loader, device):
#   log_df = {"train_losses":[], "val_losses":[], "type_f1s":[], "polarity_f1s":[], "tense_f1s":[], "certainty_f1s":[]}

#   criterion = {
#       'type' : FocalLoss().to(device),
#       'polarity' : FocalLoss().to(device),
#       'tense' : FocalLoss().to(device),
#       'certainty' : FocalLoss().to(device)
#   }

#   for e in range(start_epoch, epochs):
#     train_loss = []
#     for input_ids_batch, attention_masks_batch, type_label, polarity_label, tense_label, certainty_label in tqdm(train_loader):
#       model.train()
#       optimizer.zero_grad()
      
#       type_label = F.one_hot(type_label.to(torch.int64), num_classes=4).float().to(device)
#       polarity_label = F.one_hot(polarity_label.to(torch.int64), num_classes=3).float().to(device)
#       tense_label = F.one_hot(tense_label.to(torch.int64), num_classes=3).float().to(device)
#       certainty_label = F.one_hot(certainty_label.to(torch.int64), num_classes=2).float().to(device) 
#       type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids_batch.to(device), attention_masks_batch.to(device))

#       loss = 0.25 * criterion['type'](type_logit, type_label) + \
#                   0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
#                   0.25 * criterion['tense'](tense_logit, tense_label) + \
#                   0.25 * criterion['certainty'](certainty_logit, certainty_label)

#       loss.backward()
#       torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#       optimizer.step()
#       train_loss.append(loss.item())
      
#     model.eval()
#     val_loss, type_f1, polarity_f1, tense_f1, certainty_f1 = validation(model, val_loader, criterion, device)
#     print(f'Epoch : [{e+1}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{type_f1:.5f}] 극성 F1 : [{polarity_f1:.5f}] 시제 F1 : [{tense_f1:.5f}] 확실성 F1 : [{certainty_f1:.5f}]')
    
#     log_df["val_losses"].append(val_loss)
#     log_df["train_losses"].append(np.mean(train_loss))
#     log_df["type_f1s"].append(type_f1)
#     log_df["polarity_f1s"].append(polarity_f1)
#     log_df["tense_f1s"].append(tense_f1)
#     log_df["certainty_f1s"].append(certainty_f1)

#     pd.DataFrame(log_df).to_csv('/content/drive/MyDrive/NLP/saved_models/log.csv')
#     torch.save(model.state_dict(), f"/content/drive/MyDrive/NLP/saved_models/model_state_dict{str(e+1).zfill(4)}.pth")

In [18]:
# Launch fine-tuning.
# NOTE(review): `model` is rebound to whatever train() returns — confirm train() returns
# the model, otherwise this assignment replaces `model` with None.
model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.62005] Val Loss : [0.54076] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.49984] Val Loss : [0.43998] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.42848] Val Loss : [0.38353] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.38169] Val Loss : [0.34619] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.34932] Val Loss : [0.32162] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.32600] Val Loss : [0.30574] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.30984] Val Loss : [0.29853] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.29933] Val Loss : [0.28696] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.29207] Val Loss : [0.28521] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.28816] Val Loss : [0.28096] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [11] Train Loss : [0.28513] Val Loss : [0.28096] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [12] Train Loss : [0.28430] Val Loss : [0.28030] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



  0%|          | 0/21 [00:00<?, ?it/s]

Epoch : [13] Train Loss : [0.28288] Val Loss : [0.28114] f1 : [0.87994]


  0%|          | 0/83 [00:00<?, ?it/s]



KeyboardInterrupt: ignored

### Inference

In [None]:
# Build one classifier per label and load the checkpoint selected for each.
model_type = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=4)
model_polarity = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=3)
model_tense = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=3)
model_certainty = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=2)
# map_location lets checkpoints saved on a GPU load on a CPU-only runtime as well.
type_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/type/typemodel_state_dict0042.pth', map_location=device)
polarity_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/polarity/polaritymodel_state_dict0034.pth', map_location=device)
tense_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/tense/tensemodel_state_dict0026.pth', map_location=device)
certainty_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/certainty/certaintymodel_state_dict0012.pth', map_location=device)
# Inference only: make sure dropout is disabled.
model_type.eval()
model_polarity.eval()
model_tense.eval()
model_certainty.eval()
model_type.load_state_dict(type_dict)
model_polarity.load_state_dict(polarity_dict)
model_tense.load_state_dict(tense_dict)
model_certainty.load_state_dict(certainty_dict)

In [17]:
# Inspect the architecture of the loaded "type" classifier.
print(model_type)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [18]:
# Test set: loaded with train=False, so items carry no label. No shuffling, which keeps
# predictions in file order.
data_test = ElectraDataset("/content/drive/MyDrive/NLP/trainset/test.csv", False)
test_dataloader = DataLoader(data_test, batch_size=batch_size)

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

In [19]:
# Sanity check: number of test sentences.
len(data_test)

7090

In [22]:
def inference(model_type, model_polarity, model_tense, model_certainty, test_dataloader, device):
    """Predict all four labels for every batch of `test_dataloader`.

    Each model is moved to `device` and put in eval mode, so dropout is disabled
    regardless of the mode the caller left it in.

    Args:
        model_type, model_polarity, model_tense, model_certainty: Classifiers whose
            forward output exposes `.logits`.
        test_dataloader: Iterable yielding (input_ids, attention_mask) batches.
        device: Device the models and batches are moved to.

    Returns:
        Tuple (type_preds, polarity_preds, tense_preds, certainty_preds) of lists of
        predicted class indices, in dataloader order.
    """
    models = (model_type, model_polarity, model_tense, model_certainty)
    for classifier in models:
        classifier.to(device)
        classifier.eval()

    # One prediction list per model, in the same order as `models`.
    all_preds = [[] for _ in models]

    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            for classifier, preds in zip(models, all_preds):
                logits = classifier(input_ids, attention_mask).logits
                preds += logits.argmax(1).detach().cpu().numpy().tolist()

    type_preds, polarity_preds, tense_preds, certainty_preds = all_preds
    return type_preds, polarity_preds, tense_preds, certainty_preds

In [23]:
# Predict class indices for the whole test set with the four per-label models.
type_preds, polarity_preds, tense_preds, certainty_preds = inference(model_type, model_polarity, model_tense, model_certainty, test_dataloader, device)

  0%|          | 0/45 [00:00<?, ?it/s]



In [24]:
# Class index -> label name per label column; the list position is the integer class id.
label_dict = {"유형":["사실형", "추론형", "대화형", "예측형"], "극성":["긍정", "부정", "미정"], "시제":["과거", "현재", "미래"], "확실성":["확실", "불확실"]}

In [25]:
# Sanity check: one prediction per test sentence.
len(type_preds)

7090

In [26]:
# Replace predicted class indices with their label names, one label column at a time.
# The lists are rebound in place, so this cell can only be run once per inference run.
type_preds = list(map(label_dict["유형"].__getitem__, type_preds))
polarity_preds = list(map(label_dict["극성"].__getitem__, polarity_preds))
tense_preds = list(map(label_dict["시제"].__getitem__, tense_preds))
certainty_preds = list(map(label_dict["확실성"].__getitem__, certainty_preds))

In [27]:
# Join the four per-label names into the "type-polarity-tense-certainty" submission string.
predictions = []
for type_pred, polarity_pred, tense_pred, certainty_pred in zip(type_preds, polarity_preds, tense_preds, certainty_preds):
    predictions.append('-'.join((type_pred, polarity_pred, tense_pred, certainty_pred)))

In [None]:
# print(len(val["문장"]), len(polarity_preds))
# pd.DataFrame({"sentence":val["문장"], "유형":[label_dict["유형"][x] for x in val["유형"]], "극성":[label_dict["극성"][x] for x in val["극성"]], "시제":[label_dict["시제"][x] for x in val["시제"]], "확실성":[label_dict["확실성"][x] for x in val["확실성"]], "type_preds":type_preds, "polarity_preds":polarity_preds, "tense_preds":tense_preds, "certainty_preds":certainty_preds, "predictions":predictions}).to_csv("./prediction.csv")

In [None]:
# Sanity check: number of combined prediction strings.
len(predictions)

### Submission

In [28]:
# Fill the sample submission's `label` column with the combined predictions.
# Assumes the sample file lists IDs in the same order as test.csv — TODO confirm.
submit = pd.read_csv('/content/drive/MyDrive/NLP/sample_submission.csv')
submit['label'] = predictions

In [29]:
# Preview the first rows of the submission.
submit.head()

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-현재-확실
4,TEST_0004,사실형-긍정-과거-확실


In [30]:
# Write the submission file without the DataFrame index.
submit.to_csv('/content/drive/MyDrive/NLP/submissions/submission.csv', index=False)

### validation to csv

In [None]:
# Build one classifier per label and load the checkpoint selected for each.
model_type = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=4)
model_polarity = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=3)
model_tense = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=3)
model_certainty = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator", num_labels=2)
# map_location lets checkpoints saved on a GPU load on a CPU-only runtime as well.
type_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/type/typemodel_state_dict0042.pth', map_location=device)
polarity_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/polarity/polaritymodel_state_dict0034.pth', map_location=device)
tense_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/tense/tensemodel_state_dict0026.pth', map_location=device)
certainty_dict = torch.load(r'/content/drive/MyDrive/NLP/saved_models/221217/certainty/certaintymodel_state_dict0012.pth', map_location=device)
# Inference only: make sure dropout is disabled.
model_type.eval()
model_polarity.eval()
model_tense.eval()
model_certainty.eval()
model_type.load_state_dict(type_dict)
model_polarity.load_state_dict(polarity_dict)
model_tense.load_state_dict(tense_dict)
model_certainty.load_state_dict(certainty_dict)

In [None]:
# Re-use the polarity-stratified validation split as the inference input.
# With train=True each item also carries the label of the column selected by `now`.
data_test = ElectraDataset("/content/drive/MyDrive/NLP/trainset/validation_split_by_polarity.csv", True)
test_dataloader = DataLoader(data_test, batch_size=batch_size)

In [None]:
def inference(infer_model, test_dataloader, device):
    """Predict all four labels with a single model that returns four logit tensors.

    NOTE(review): this redefinition replaces the four-model `inference` defined earlier
    in the notebook, and it unpacks the model output into exactly four values — confirm
    which model class is meant to be passed in.

    Args:
        infer_model: Model called as `infer_model(input_ids, attention_mask)`.
        test_dataloader: Iterable of batches; only the first two entries of each batch are used.
        device: Device the model and batches are moved to.

    Returns:
        Tuple (type_preds, polarity_preds, tense_preds, certainty_preds) of lists of
        predicted class indices.
    """
    infer_model.to(device)
    infer_model.eval()

    # One list per label, in the order: type, polarity, tense, certainty.
    collected = ([], [], [], [])

    with torch.no_grad():
      for batch in tqdm(test_dataloader):
        type_logit, polarity_logit, tense_logit, certainty_logit = infer_model(batch[0].to(device), batch[1].to(device))

        per_task_logits = (type_logit, polarity_logit, tense_logit, certainty_logit)
        for bucket, task_logit in zip(collected, per_task_logits):
          bucket.extend(task_logit.argmax(1).detach().cpu().numpy().tolist())

    return collected

In [None]:
# NOTE(review): `infer_model` is not defined anywhere in the visible notebook — define or
# load it before running this cell.
type_preds, polarity_preds, tense_preds, certainty_preds = inference(infer_model, test_dataloader, device)

In [None]:
# Class index -> label name per label column; the list position is the integer class id.
label_dict = {"유형":["사실형", "추론형", "대화형", "예측형"], "극성":["긍정", "부정", "미정"], "시제":["과거", "현재", "미래"], "확실성":["확실", "불확실"]}

In [None]:
# Sanity check: number of predictions produced.
len(type_preds)

In [None]:
# Replace predicted class indices with their label names, one label column at a time.
# The lists are rebound in place, so this cell can only be run once per inference run.
type_preds = list(map(label_dict["유형"].__getitem__, type_preds))
polarity_preds = list(map(label_dict["극성"].__getitem__, polarity_preds))
tense_preds = list(map(label_dict["시제"].__getitem__, tense_preds))
certainty_preds = list(map(label_dict["확실성"].__getitem__, certainty_preds))

In [None]:
# Join the four per-label names into one "type-polarity-tense-certainty" string per sentence.
predictions = []
for type_pred, polarity_pred, tense_pred, certainty_pred in zip(type_preds, polarity_preds, tense_preds, certainty_preds):
    predictions.append('-'.join((type_pred, polarity_pred, tense_pred, certainty_pred)))

In [None]:
# Write ground-truth label names next to the predicted ones for error analysis.
val = pd.read_csv("/content/drive/MyDrive/NLP/trainset/validation_split_by_polarity.csv")
# The two lengths must match, otherwise building the DataFrame below fails.
print(len(val["문장"]), len(polarity_preds))
pd.DataFrame({"sentence":val["문장"], "유형":[label_dict["유형"][x] for x in val["유형"]], "극성":[label_dict["극성"][x] for x in val["극성"]], "시제":[label_dict["시제"][x] for x in val["시제"]], "확실성":[label_dict["확실성"][x] for x in val["확실성"]], "type_preds":type_preds, "polarity_preds":polarity_preds, "tense_preds":tense_preds, "certainty_preds":certainty_preds, "predictions":predictions}).to_csv("/content/drive/MyDrive/NLP/submissions/prediction.csv")

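### Preprocessing (run this section first: it creates the train/validation split CSVs that the training cells above read)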

In [3]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used below resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
# Load the raw training data and compare the row count with the number of distinct sentences.
df = pd.read_csv("/content/drive/MyDrive/NLP/trainset/train.csv")
print(len(df), len(set(df["문장"]))) ### 35 duplicated rows
df = df.drop_duplicates(["문장", "label"], keep='first').drop_duplicates(["문장"], keep=False) ## same sentence and label: keep the first; sentences still duplicated afterwards (conflicting labels): drop all copies (keep=False)
# After de-duplication both counts should be equal.
print(len(df["문장"]), len(set(df["문장"])))

16541 16506
16502 16502


In [None]:
# Persist the de-duplicated training data (the DataFrame index is written as the first column).
df.to_csv("/content/drive/MyDrive/NLP/trainset/train_drop_duplicates.csv")

In [5]:
# Label name -> integer class id for each label column.
# NOTE(review): the inference cells bind `label_dict` to the inverse mapping (id -> name lists);
# the two definitions must not be mixed within one kernel session.
label_dict = {"유형":{"사실형":0, "추론형":1, "대화형":2, "예측형":3}, "극성":{"긍정":0, "부정":1, "미정":2}, "시제":{"과거":0, "현재":1, "미래":2}, "확실성":{"확실":0, "불확실":1}}

In [6]:
# Encode each label column as integer class ids with one vectorized pass per column.
# This replaces per-row chained assignment (`df[label].iloc[i] = ...`), which raises
# SettingWithCopyWarning and does not write through under pandas copy-on-write.
for label in label_dict:
  encoded = df[label].map(label_dict[label])
  if encoded.isna().any():
    # Same failure mode as the dictionary lookup it replaces: unknown names are an error.
    unknown = df.loc[encoded.isna(), label].unique().tolist()
    raise KeyError(f"unmapped values in column {label!r}: {unknown}")
  df[label] = encoded.astype(int)

In [7]:
from sklearn.model_selection import train_test_split
# Random state used for every train/validation split below.
seed = 0

In [None]:
# 80/20 train/validation split stratified on the polarity label.
# The column is named "극성"; the previous "극성성" was a typo and raised KeyError.
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=seed, stratify = df["극성"])

In [None]:
# Save the polarity-stratified training split.
train.to_csv('/content/drive/MyDrive/NLP/trainset/train_split_by_polarity.csv')

In [None]:
# Save the polarity-stratified validation split.
val.to_csv('/content/drive/MyDrive/NLP/trainset/validation_split_by_polarity.csv')

In [20]:
# 80/20 train/validation split stratified on the sentence-type label ("유형").
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=seed, stratify = df["유형"])

In [21]:
# Save the type-stratified training split.
train.to_csv('/content/drive/MyDrive/NLP/trainset/train_split_by_type.csv')

In [22]:
# Save the type-stratified validation split.
val.to_csv('/content/drive/MyDrive/NLP/trainset/validation_split_by_type.csv')

In [23]:
# 80/20 train/validation split stratified on the tense label ("시제").
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=seed, stratify = df["시제"])

In [24]:
# Save the tense-stratified training split.
train.to_csv('/content/drive/MyDrive/NLP/trainset/train_split_by_tense.csv')

In [25]:
# Save the tense-stratified validation split.
val.to_csv('/content/drive/MyDrive/NLP/trainset/validation_split_by_tense.csv')

In [28]:
# 80/20 train/validation split stratified on the certainty label ("확실성").
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=seed, stratify = df["확실성"])

In [29]:
# Save the certainty-stratified training split.
train.to_csv('/content/drive/MyDrive/NLP/trainset/train_split_by_certainty.csv')

In [30]:
# Save the certainty-stratified validation split.
val.to_csv('/content/drive/MyDrive/NLP/trainset/validation_split_by_certainty.csv')