In [1]:
from IPython.display import clear_output
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from typing import List

## 모델 진행상황 결과 확인하는 라이브러리
from tqdm.notebook import tqdm

## Huggingface 라이브러리 받아오는 코드
!pip install transformers datasets tokenizers sentencepiece x-transformers pytorch-lightning wandb
!wget https://raw.githubusercontent.com/monologg/KoBERT-Transformers/master/kobert_transformers/tokenization_kobert.py
from transformers import DistilBertModel
from tokenization_kobert import KoBertTokenizer

## 기타 사용할 머신러닝 알고리즘
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

clear_output()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_path = '/content/drive/MyDrive/의현/018_감성대화/Training_221115_add/원천데이터/감성대화말뭉치(최종데이터)_Training.zip'
val_path = '/content/drive/MyDrive/의현/018_감성대화/Validation_221115_add/원천데이터/감성대화말뭉치(최종데이터)_Validation.zip'
!cp -r "$train_path" ./
!cp -r "$val_path" ./
drive.flush_and_unmount()

In [4]:
!unzip './감성대화말뭉치(최종데이터)_Training.zip'
!unzip './감성대화말뭉치(최종데이터)_Validation.zip'
clear_output()

In [5]:
import os
import shutil
from sys import platform
from glob import glob

train_data_path = './감성대화말뭉치(최종데이터)_Training.xlsx'
val_data_path = './감성대화말뭉치(최종데이터)_Validation.xlsx'

if platform == "linux" or platform == "linux2":
    pass
elif platform == "darwin":
    train_data_path = os.path.join("dset", train_data_path)
    val_data_path = os.path.join("dset", val_data_path)

In [124]:
import numpy as np
import pandas as pd

train_dataset = pd.read_excel(train_data_path, index_col = 'Unnamed: 0')
val_dataset = pd.read_excel(val_data_path, index_col = 'Unnamed: 0')

In [125]:
train_dataset[['사람문장1', '사람문장2', '사람문장3']] = train_dataset[['사람문장1', '사람문장2', '사람문장3']].fillna('').astype(str)
train_dataset['sentence'] = train_dataset[['사람문장1', '사람문장2', '사람문장3']].apply(lambda x: ' '.join(x), axis=1)
train_dataset = train_dataset.drop(['사람문장1', '사람문장2', '사람문장3', '시스템문장1', '시스템문장2', '시스템문장3'], axis=1)
train_dataset.rename(columns = {'감정_대분류' : 'voice_sentiment', '감정_소분류' : 'sentiment'}, inplace = True)

In [126]:
val_dataset[['사람문장1', '사람문장2', '사람문장3']] = val_dataset[['사람문장1', '사람문장2', '사람문장3']].fillna('').astype(str)
val_dataset['sentence'] = val_dataset[['사람문장1', '사람문장2', '사람문장3']].apply(lambda x: ' '.join(x), axis=1)
val_dataset = val_dataset.drop(['사람문장1', '사람문장2', '사람문장3', '시스템문장1', '시스템문장2', '시스템문장3'], axis=1)
val_dataset.rename(columns = {'감정_대분류' : 'voice_sentiment', '감정_소분류' : 'sentiment'}, inplace = True)

In [129]:
train_size = 0.8
train = train_dataset.sample(frac=train_size, random_state=200)
test = train_dataset.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

X_train = train.drop('sentiment', axis = 1)
y_train = train['sentiment']
X_test = test.drop('sentiment', axis = 1)
y_test = test['sentiment']
X_val = val_dataset.drop('sentiment', axis = 1)
y_val = val_dataset['sentiment']

In [131]:
le = preprocessing.LabelEncoder()

train_labels = le.fit_transform(y_train.values)

In [132]:
num_unique_labels = len(le.classes_)
print("독립적인 레이블의 개수:", num_unique_labels)

독립적인 레이블의 개수: 58


In [133]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 1e-04
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
clear_output()

In [134]:
class Dataset_Generation:
    def __init__(self, df, labels, tokenizer, max_len):
        self.len = len(df)
        self.data = df
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_len = max_len

    def __getitem__(self, idx):
        title = ", ".join(str(_) for _ in self.data.loc[idx, :].values)
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [136]:
train_dataset = X_train
test_dataset = X_test
val_dataset = X_val
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))

test_labels = le.transform(y_test.values)
val_labels = le.transform(y_val.values)
train_set = Dataset_Generation(train_dataset, train_labels, tokenizer, MAX_LEN)
test_set = Dataset_Generation(test_dataset, test_labels, tokenizer, MAX_LEN)
val_set = Dataset_Generation(val_dataset, val_labels, tokenizer, MAX_LEN)

TRAIN Dataset: (41304, 6)
TEST Dataset: (10326, 6)
VAL Dataset: (6641, 6)


In [137]:
first_item = train_set[0]

for key, value in first_item.items():
    print(f"{key}: {value}")

ids: tensor([   2, 4487,   46, 1419,   46,  765, 5475,   46, 5000, 6881, 7089,   46,
        2497,   46, 3257, 6897, 3114, 6983,  834, 3860,  912, 2499, 7799, 1958,
        7852, 4977, 5591,  258, 2423, 7489, 7096,  517, 6744, 7086,  881, 3105,
        5850, 3327, 1394, 5760, 4196, 1970, 7318, 3155, 6553,   54, 3805, 3257,
        5330, 3316, 6079, 7013, 5965, 4179, 6670, 3714, 7848, 6394, 5405, 6855,
          54,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    

In [138]:
", ".join(str(_) for _ in train_dataset.loc[0, :].values)

'청소년, 남성, 가족관계, 해당없음, 불안, 엄마에게 아빠와 같이 있는 것이 불편하다고 말해야 할까? 부친이 싫은 건 아닌데 역시 나와는 좀 맞지 않아서. 일단 엄마가 여유로울때 조심스럽게 이야기해봐야겠어.'

In [139]:
decoded_text = tokenizer.decode(first_item['ids'])
print(decoded_text)

[CLS] 청소년, 남성, 가족관계, 해당없음, 불안, 엄마에게 아빠와 같이 있는 것이 불편하다고 말해야 할까? 부친이 싫은 건 아닌데 역시 나와는 좀 맞지 않아서. 일단 엄마가 여유로울때 조심스럽게 이야기해봐야겠어.[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD

In [141]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8,
                'pin_memory' :True
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8,
                'pin_memory' :True
                }

train_loader = DataLoader(train_set, **train_params)
test_loader = DataLoader(test_set, **test_params)
val_loader = DataLoader(test_set, **test_params)

In [142]:
from typing import Any
import pytorch_lightning as pl
from transformers import get_scheduler
from torch.optim import AdamW, Optimizer

class BERTLightningModule(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.save_hyperparameters()
        self.l1 = DistilBertModel.from_pretrained('monologg/distilkobert')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, num_unique_labels)
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

    def training_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        targets = batch['targets']

        outputs = self(ids, mask)
        loss = self.loss_function(outputs, targets)

        self.log('train_loss', loss)
        self.log('train_acc', self.calculate_accuracy(outputs, targets), prog_bar=True)

        return loss

    def test_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        targets = batch['targets']

        outputs = self(ids, mask)
        loss = self.loss_function(outputs, targets)

        self.log('test_loss', loss)
        self.log('test_acc', self.calculate_accuracy(outputs, targets), prog_bar=True)

    def validation_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        targets = batch['targets']

        outputs = self(ids, mask)
        loss = self.loss_function(outputs, targets)

        self.log('val_loss', loss)
        self.log('val_acc', self.calculate_accuracy(outputs, targets), prog_bar=True)

    def configure_optimizers(self) -> tuple[list[Optimizer], list[dict[str, Any]]]:
        do_decay = [p for p in self.parameters() if p.requires_grad and p.ndim >= 2]
        no_decay = [p for p in self.parameters() if p.requires_grad and p.ndim < 2]
        param_groups = [{"params": do_decay}, {"params": no_decay, "weight_decay": 0.0}]

        optimizer = AdamW(param_groups, **self.config.optim.optimizer)
        scheduler = get_scheduler(optimizer=optimizer, **self.config.optim.scheduler)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

    @staticmethod
    def calculate_accuracy(preds, targets):
        _, predicted = torch.max(preds, dim=1)
        correct = torch.sum(predicted == targets)
        acc = correct.float() / targets.size(0)
        return acc

    def train_dataloader(self):
        return train_loader

    def test_dataloader(self):
        return test_loader

    def val_dataloader(self):
        return val_loader

In [143]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

checkpoint = ModelCheckpoint(
    monitor="val_loss", mode="min", save_weights_only=True
)

trainer = Trainer(
    accelerator="gpu",
    devices="auto",
    precision=16,
    log_every_n_steps=1000,
    max_epochs=2,
    max_steps=5000,
    gradient_clip_val=0,
    accumulate_grad_batches=1,
    val_check_interval=1.0,
    callbacks=[checkpoint, LearningRateMonitor("step")],
)

from easydict import EasyDict

config = EasyDict({})
config.optim = {}
config.optim.optimizer = {}
config.optim.optimizer.lr = 1e-4
config.optim.optimizer.betas = [0.9, 0.999]
config.optim.optimizer.eps = 1e-6
config.optim.optimizer.weight_decay = 0.01
config.optim.scheduler = {}
config.optim.scheduler.name = "linear"
config.optim.scheduler.num_warmup_steps = 1000
config.optim.scheduler.num_training_steps = 5000

trainer.fit(BERTLightningModule(config), train_loader, test_loader)

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
Some weights of the model checkpoint at monologg/distilkobert were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClas

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [144]:
trainer.test(ckpt_path=checkpoint.best_model_path, dataloaders=[val_loader])

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_13/checkpoints/epoch=1-step=2582.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_13/checkpoints/epoch=1-step=2582.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 1.557194471359253, 'test_acc': 0.4913809895515442}]