In [1]:
# Fetch the project repository into the current working directory.
!git clone https://github.com/toy-f-rebellion/toy_ai.git
# Move into the clone so the commands below run relative to it.
%cd /content/toy_ai
# Clone the Modified-KoBERT sources inside the project directory
# (this path is later appended to sys.path for the kobert imports).
!git clone https://github.com/Oneul-hyeon/Modified-KoBERT.git
# Install the dependencies listed in the project's requirements.txt.
!pip install -r requirements.txt

Cloning into 'toy_ai'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 65 (delta 15), reused 14 (delta 3), pack-reused 0[K
Receiving objects: 100% (65/65), 1.57 MiB | 4.77 MiB/s, done.
Resolving deltas: 100% (15/15), done.
/content/toy_ai
Cloning into 'Modified-KoBERT'...
remote: Enumerating objects: 449, done.[K
remote: Counting objects: 100% (174/174), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 449 (delta 136), reused 118 (delta 108), pack-reused 275[K
Receiving objects: 100% (449/449), 226.32 KiB | 5.14 MiB/s, done.
Resolving deltas: 100% (227/227), done.
Collecting kobert_tokenizer (from -r requirements.txt (line 11))
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-1vltc20n/kobert-tokenizer_efbb4502afb94f34b48b254aa812fc3f
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /t

# 1. Environment Setting

## (1) 라이브러리 설정

In [2]:
# Setting Library
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('/content/toy_ai/Modified-KoBERT')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import gluonnlp as nlp
import torch
from torch.utils.data import Dataset, DataLoader
import pickle

# koBERT
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

## (2) 데이터 불러오기

In [3]:
# Read the raw CSV into a DataFrame; the file is decoded as CP949.
data = pd.read_csv(
    '/content/toy_ai/Dataset/5차년도_2차.csv',
    encoding='CP949',
)

## (3) KoBERT 모델 불러오기

In [4]:
# Obtain the KoBERT model object and its vocabulary from the kobert helper.
# cachedir=".cache" is relative to the working directory, so the files end up
# under /content/toy_ai/.cache when run after the setup cell.
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

/content/toy_ai/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/toy_ai/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


# 2. Data Preprocessing

## (1) Labeling

- 라벨값 확인

In [5]:
# Show the distinct values of the '상황' column; these are the class labels
# that get mapped to integer ids in the labeling step.
data['상황'].unique()

array(['happiness', 'neutral', 'sadness', 'angry', 'surprise', 'disgust',
       'fear'], dtype=object)

- 라벨링

In [6]:
# Map each distinct '상황' value to an integer id, assigned in order of first
# appearance. The column is overwritten in place, row group by row group.
labels = list(data['상황'].unique())
for class_id, class_name in enumerate(labels):
    data.loc[data['상황'] == class_name, '상황'] = class_id

## (2) KoBERT 모델의 입력 데이터 생성

In [7]:
# Build the model input records as [utterance, label-as-string] pairs.
#
# The pairs are built in a comprehension so that no loop variable rebinds the
# module-level name `data`. The previous loop assigned `data = []` on every
# iteration, which replaced the DataFrame with a two-element list and broke
# any later (or repeated) use of `data` as a DataFrame.
dataset = [
    [text, str(label)]
    for text, label in zip(data['발화문'], data['상황'])
]

## (3) Train/Test Dataset Split

In [8]:
# Hold out 20% of the records for testing; the fixed seed keeps the split
# reproducible across runs.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

## (4) 토큰화

- 함수 정의

In [9]:
# Dataset wrapper that turns each record into a form the BERT model can take as input
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Every record of ``dataset`` is indexed with ``sent_idx`` to get the
    sentence and with ``label_idx`` to get the label. Sentences are passed
    through ``nlp.data.BERTSentenceTransform`` once, at construction time;
    labels are converted with ``np.int32``.

    Args:
        dataset: Collection of indexable records. It is iterated twice
            (once for sentences, once for labels).
        sent_idx: Index of the sentence within each record.
        label_idx: Index of the label within each record.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence (wrapped in a one-element list).
        encoded = []
        for record in dataset:
            encoded.append(to_bert_input([record[sent_idx]]))
        self.sentences = encoded

        # Second pass: convert every label to a 32-bit integer.
        targets = []
        for record in dataset:
            targets.append(np.int32(record[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the i-th transformed sentence with its label appended.

        The transformed sentence is concatenated with a one-element tuple, so
        it is assumed to be a tuple itself — TODO confirm against the
        transform's return type.
        """
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of samples (one per stored label)."""
        return len(self.labels)

In [10]:
# Get the tokenizer resource from the kobert helper
# (presumably the path of the cached SentencePiece model — TODO confirm).
tokenizer = get_tokenizer()
# Build the GluonNLP SentencePiece BERT tokenizer from that resource and the
# KoBERT vocabulary; lower=False turns lower-casing off.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/toy_ai/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


- Setting parameters

In [11]:
# Hyperparameters. Only max_len and batch_size are consumed in this notebook
# section; the others are presumably for a separate training step — TODO confirm.
max_len = 64  # passed to BERTDataset, which forwards it as max_seq_length
batch_size = 64  # batch size used for both DataLoaders
warmup_ratio = 0.1
num_epochs = 1000
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

- 토큰화

In [12]:
# Tokenize both splits: index 0 of each record is the sentence, index 1 the label.
# NOTE: this rebinds train_data/test_data from raw records to BERTDataset
# objects, so the cell must not be re-run without re-running the split first.
train_data = BERTDataset(
    train_data, sent_idx=0, label_idx=1, bert_tokenizer=tok,
    max_len=max_len, pad=True, pair=False)
test_data = BERTDataset(
    test_data, sent_idx=0, label_idx=1, bert_tokenizer=tok,
    max_len=max_len, pad=True, pair=False)

- torch 형식으로 변환

In [13]:
# Wrap each tokenized split in a DataLoader with the same batch size and
# worker count.
# NOTE(review): no shuffle argument is passed, so the loader's default
# ordering applies — confirm this is intended for the training split.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    num_workers=5,
)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    num_workers=5,
)

# 3. 데이터 저장

In [14]:
# Save the train and test dataloaders to pickle files, in that order.
# NOTE(review): the train file name is spelled 'train_dataloder.pkl' (sic);
# it is kept unchanged so that existing readers of the file still find it.
for pkl_path, loader in (
    ('/content/toy_ai/train_dataloder.pkl', train_dataloader),
    ('/content/toy_ai/test_dataloader.pkl', test_dataloader),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(loader, f)