In [23]:
!pip install transformers



In [24]:
!pip install accelerate>=0.20.1

In [1]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("device:", device)

device: cuda:0


In [3]:
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=5)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
path_train = '/content/drive/MyDrive/미프4_2/train.csv'

In [6]:
data = pd.read_csv(path_train)

In [7]:
label_dict = {
    '코드1': 0,
    '코드2': 0,
    '웹': 1,
    '이론': 2,
    '시스템 운영': 3,
    '원격': 4
}

In [8]:
data['label'] = data['label'].replace(label_dict)

In [9]:
target = 'label'
x = data.drop(target, axis=1, inplace=False)
y = data.loc[:,target]

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=2023, stratify=y)

In [22]:
tokenized_train_sentences = tokenizer(
    list(x_train["text"]),
    return_tensors="pt",                # pytorch의 tensor 형태로 return
    max_length=200,                     # 최대 토큰길이 설정
    padding=True,                       # 제로패딩 설정
    truncation=True,                    # max_length 초과 토큰 truncate
    add_special_tokens=True,            # special token 추가
    )

In [23]:
tokenized_test_sentences = tokenizer(
    list(x_test["text"]),
    return_tensors="pt",
    max_length=200,
    padding=True,
    truncation=True,
    add_special_tokens=True,
    )

In [24]:
class Q_Ko_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [25]:
Y_train = y_train.values

In [26]:
Y_test = y_test.values

In [27]:
train_dataset = Q_Ko_Dataset(tokenized_train_sentences, Y_train)
test_dataset = Q_Ko_Dataset(tokenized_test_sentences, Y_test)

In [28]:
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [42]:
!pip list | grep accelerate


accelerate                       0.23.0


In [43]:
!pip install transformers --upgrade



In [45]:
!python --version

Python 3.10.12


In [29]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/미프4_2/KoELECTRA',                    # 학습결과 저장경로
    num_train_epochs=10,                # 학습 epoch 설정
    per_device_train_batch_size=32,      # train batch_size 설정
    per_device_eval_batch_size=64,      # test batch_size 설정
    logging_dir='./logs',               # 학습log 저장경로
    logging_steps=20,                  # 학습log 기록 단위
    save_total_limit=2,                 # 학습결과 저장 최대갯수
)

In [30]:
from sklearn.metrics import *
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [31]:
trainer = Trainer(
    model=model,                         # 학습하고자하는 🤗 Transformers model
    args=training_args,                  # 위에서 정의한 Training Arguments
    train_dataset=train_dataset,         # 학습 데이터셋
    eval_dataset=test_dataset,           # 평가 데이터셋
    compute_metrics=compute_metrics,     # 평가지표
)

In [32]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
20,0.32
40,0.2698
60,0.1668
80,0.237
100,0.2469
120,0.1537
140,0.1318
160,0.0995
180,0.1111
200,0.1507


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=1050, training_loss=0.05645445126463615, metrics={'train_runtime': 1085.0249, 'train_samples_per_second': 30.737, 'train_steps_per_second': 0.968, 'total_flos': 3427730488740000.0, 'train_loss': 0.05645445126463615, 'epoch': 10.0})

In [33]:
trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 1.0176702737808228,
 'eval_accuracy': 0.8598382749326146,
 'eval_f1': 0.8572989190062879,
 'eval_precision': 0.8384259782825699,
 'eval_recall': 0.8872770741793745,
 'eval_runtime': 4.3033,
 'eval_samples_per_second': 86.213,
 'eval_steps_per_second': 1.394,
 'epoch': 10.0}

---
# TEST

In [36]:
model.load_state_dict(torch.load('KcELECTRA_model_full.pt', map_location=torch.device('cpu')))
model.eval()

RuntimeError: ignored

In [34]:
test_path = '/content/drive/MyDrive/미프4_2/KoELECTRA/test.csv'
test = pd.read_csv(test_path)

FileNotFoundError: ignored

In [None]:
model.to('cpu')
model.eval()

result = []
for i in range(len(test)):
    temp = model(**tokenizer(test["text"][i],
                             return_tensors="pt",
                             max_length=200,
                             padding=True,
                             truncation=True,
                             add_special_tokens=True))
    logits = temp.logits
    preds = logits.argmax(dim=-1)
    if preds == 0:
        result.append(0)
    elif preds == 1:
        result.append(1)
    elif preds == 2:
        result.append(2)
    elif preds == 3:
        result.append(3)
    elif preds == 4:
        result.append(4)

print(result)

In [None]:
temp = pd.DataFrame({'label' : result})

In [None]:
temp.reset_index(inplace=True)

In [None]:
temp = temp.rename(columns={'index' : 'id'})

In [None]:
temp.to_csv('/content/drive/MyDrive/미프4_2/kaggle_result2.csv', index=False)