In [None]:
!pip install transformers -q
!pip install datasets -q

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

import torch
import torch.nn.functional as F
from tqdm import tqdm
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import Adam
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore')

[K     |████████████████████████████████| 5.8 MB 3.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 40.0 MB/s 
[K     |████████████████████████████████| 182 kB 57.8 MB/s 
[K     |████████████████████████████████| 451 kB 33.8 MB/s 
[K     |████████████████████████████████| 212 kB 77.5 MB/s 
[K     |████████████████████████████████| 132 kB 64.0 MB/s 
[K     |████████████████████████████████| 127 kB 76.9 MB/s 
[?25h

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# 하이퍼파라미터 설정 및 시드 고정

In [None]:
# Hyperparameters shared by the cells below.

SEED = 41    # random seed passed to seed_everything
BS = 8       # batch size used when building the DataLoaders
LR = 1e-5    # learning rate handed to the Adam optimizer
EPOCHS = 10  # number of passes over the training set

In [None]:
# Fix every random-number source up front so that runs are repeatable.
# Called right after its definition with the SEED constant set above.

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and put cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every random-number generator.
    """
    random.seed(seed)
    # Exported for child processes; it cannot change hashing in the already-running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (a no-op when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and possibly pick different kernels from run to run,
    # which defeats the determinism requested on the line above, so it must stay off.
    torch.backends.cudnn.benchmark = False
    
seed_everything(SEED)

# 데이터 로드

In [None]:
# Colab only: mount Google Drive and change into the project folder, so the
# relative CSV / checkpoint paths used by the following cells resolve there.
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/Data/발화자의 감정인식 AI

Mounted at /content/drive
/content/drive/MyDrive/Data/발화자의 감정인식 AI


In [None]:
# Read the training split and the test split from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
train

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise
...,...,...,...,...,...
9984,TRAIN_9984,You or me?,Chandler,1038,neutral
9985,TRAIN_9985,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,1038,neutral
9986,TRAIN_9986,"You guys are messing with me, right?",Joey,1038,surprise
9987,TRAIN_9987,Yeah.,All,1038,neutral


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9989 entries, 0 to 9988
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           9989 non-null   object
 1   Utterance    9989 non-null   object
 2   Speaker      9989 non-null   object
 3   Dialogue_ID  9989 non-null   int64 
 4   Target       9989 non-null   object
dtypes: int64(1), object(4)
memory usage: 390.3+ KB


In [None]:
test

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1
...,...,...,...,...
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachell,279
2606,TEST_2606,They don't listen to me?,Rossi,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachell,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Rossi,279


In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2610 entries, 0 to 2609
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2610 non-null   object
 1   Utterance    2610 non-null   object
 2   Speaker      2610 non-null   object
 3   Dialogue_ID  2610 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 81.7+ KB


# Preprocessing

## test의 일부 speaker 수정

In [None]:
test['Speaker'].value_counts().head(10)

Joeyy       411
Chadler     379
Rossi       373
Rachell     356
Mornica     346
Phoebe      291
Janice       31
Emily        16
Director     16
Gunther      13
Name: Speaker, dtype: int64

In [None]:
# Normalise the misspelled main-cast names in the test set (e.g. 'Joeyy' -> 'Joey').
# One vectorised replace with a mapping covers every name; the old code repeated the same
# five replaces once per row. Assigning the result back (instead of calling
# replace(..., inplace=True) on the selected column) also keeps working when pandas
# copy-on-write is enabled.
SPEAKER_FIXES = {
    'Joeyy': 'Joey',
    'Chadler': 'Chandler',
    'Rossi': 'Ross',
    'Rachell': 'Rachel',
    'Mornica': 'Monica',
}
test['Speaker'] = test['Speaker'].replace(SPEAKER_FIXES)

100%|██████████| 2610/2610 [00:03<00:00, 765.89it/s]


In [None]:
test['Speaker'].value_counts().head(10)

Joey        411
Chandler    379
Ross        373
Rachel      356
Monica      346
Phoebe      291
Janice       31
Emily        16
Director     16
Gunther      13
Name: Speaker, dtype: int64

In [None]:
test

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachel,0
2,TEST_0002,You know what?,Rachel,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joey,1
4,TEST_0004,To push!,Joey,1
...,...,...,...,...
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachel,279
2606,TEST_2606,They don't listen to me?,Ross,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachel,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Ross,279


## train, test의 Utterance 변형

 train, test의 'Speaker' column을 'Utterance'에 merge

### train

In [None]:
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


In [None]:
# Prepend the speaker name to every training utterance as "Speaker : text".
train_speaker_prefix = train['Speaker'] + " : "
train['Utterance'] = train_speaker_prefix + train['Utterance']

In [None]:
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,Chandler : also I was the point person on my c...,Chandler,0,neutral
1,TRAIN_0001,The Interviewer : You must’ve had your hands f...,The Interviewer,0,neutral
2,TRAIN_0002,Chandler : That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,The Interviewer : So let’s talk a little bit a...,The Interviewer,0,neutral
4,TRAIN_0004,Chandler : My duties? All right.,Chandler,0,surprise


### test

In [None]:
test.tail()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachel,279
2606,TEST_2606,They don't listen to me?,Ross,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachel,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Ross,279
2609,TEST_2609,I think you look good.,Monica,279


In [None]:
# Prepend the speaker name to every test utterance as "Speaker : text".
test_speaker_prefix = test['Speaker'] + " : "
test['Utterance'] = test_speaker_prefix + test['Utterance']

In [None]:
test.tail()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
2605,TEST_2605,"Rachel : Yeah, I mean, go Ross, no one will ev...",Rachel,279
2606,TEST_2606,Ross : They don't listen to me?,Ross,279
2607,TEST_2607,"Rachel : Of course, they listen to you! Everyo...",Rachel,279
2608,TEST_2608,"Ross : Monica, do you really think I should tr...",Ross,279
2609,TEST_2609,Monica : I think you look good.,Monica,279


## Label encoding  
sklearn.preprocessing의 LabelEncoder() 사용

In [None]:
train['Target']

0        neutral
1        neutral
2        neutral
3        neutral
4       surprise
          ...   
9984     neutral
9985     neutral
9986    surprise
9987     neutral
9988         joy
Name: Target, Length: 9989, dtype: object

In [None]:
# Turn the string emotion labels in train['Target'] into integer class ids.
# The fitted encoder stays in `le` so it can be reused by later cells.

le = LabelEncoder().fit(train['Target'])
encoded_target = le.transform(train['Target'])
train['Target'] = encoded_target

In [None]:
train['Target']

0       4
1       4
2       4
3       4
4       6
       ..
9984    4
9985    4
9986    6
9987    4
9988    3
Name: Target, Length: 9989, dtype: int64

In [None]:
# Show the label names learned by the encoder

le.classes_

array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness',
       'surprise'], dtype=object)

## Train/Validation set 분리  
train의 'Dialogue_ID' column을 98:2(1016/23)로 분리

In [None]:
# Split by dialogue: rows whose Dialogue_ID is in 0..1015 stay in train,
# all remaining rows become the validation set.
in_train_split = train['Dialogue_ID'].isin(range(1016))

valid = train.loc[~in_train_split].reset_index(drop=True)
train = train.loc[in_train_split].reset_index(drop=True)

train_len, valid_len = len(train), len(valid)

print(train_len)
print(valid_len)

9725
264


## CustomDataset 클래스 생성
EDA에서 파악한 정수 인코딩된 문장 길이를 토대로 토큰 최대 길이(max_length)를 128로 지정하였음

In [None]:
# Tokenizer to use: loaded from the 'tae898/emoberta-base' checkpoint

tokenizer = AutoTokenizer.from_pretrained('tae898/emoberta-base')

Downloading:   0%|          | 0.00/407 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
    """Expose a DataFrame of utterances as a dataset of tokenized samples.

    Args:
        data: DataFrame with an 'Utterance' column and, in 'train' mode, a 'Target' column.
        mode: 'train' yields (input_ids, attention_mask, label); any other value
            yields (input_ids, attention_mask) only.
        text_tokenizer: Tokenizer callable to use. When omitted, the notebook-level
            ``tokenizer`` is used, exactly as before.
        max_length: Length every sample is padded / truncated to (default 128).
    """

    def __init__(self, data, mode='train', text_tokenizer=None, max_length=128):
        self.dataset = data
        self.tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Samples are requested by position (0 .. len-1), so look rows up with .iloc;
        # a label-based lookup fails as soon as the frame's index has not been reset.
        text = self.dataset['Utterance'].iloc[idx]
        inputs = self.tokenizer(text, padding='max_length', max_length=self.max_length,
                                truncation=True, return_tensors='pt')
        # Take the first (only) row of each returned tensor.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        if self.mode == 'train':
            y = self.dataset['Target'].iloc[idx]
            return input_ids, attention_mask, y
        return input_ids, attention_mask

In [None]:
# Build the train / validation datasets and their data loaders.
# The wrapped datasets get their own names so the `train` / `valid` DataFrames are not
# overwritten; re-running this cell therefore never wraps an already-wrapped dataset.

train_dataset = CustomDataset(train, mode='train')
valid_dataset = CustomDataset(valid, mode='train')  # 'train' mode so validation samples carry labels

train_dataloader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BS, shuffle=False)

# Modeling

## 주요 함수 정의

### class BaseModel 정의

In [None]:
class BaseModel(nn.Module):
    """Pretrained 'tae898/emoberta-base' encoder followed by dropout and a linear classifier.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
        num_classes: Number of output classes (defaults to the number of fitted labels).
    """

    def __init__(self, dropout=0.5, num_classes=len(le.classes_)):
        super(BaseModel, self).__init__()

        self.bert = AutoModel.from_pretrained('tae898/emoberta-base')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the encoder's config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return unnormalised class scores (logits), one row of num_classes values per input."""
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # No activation on the output: nn.CrossEntropyLoss expects raw logits and applies
        # log-softmax itself. A ReLU here would clamp every negative score to 0, which
        # blocks the gradient for those classes and creates ties for argmax.
        return self.linear(dropout_output)

### train 함수 정의

In [None]:
# Uses EPOCHS from the hyperparameter cell unless `epochs` is passed explicitly.

def train(model, optimizer, train_loader, test_loader, device,
          epochs=None, save_path='emoberta_model_lr5e-5.pth'):
    """Fine-tune `model` and checkpoint the weights with the best validation macro-F1.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns class scores.
        optimizer: Optimizer already bound to the model's parameters.
        train_loader: Loader yielding (input_ids, attention_mask, label) batches.
        test_loader: Validation loader with the same batch layout.
        device: Device the model and every batch are moved to.
        epochs: Number of epochs; defaults to the notebook-level EPOCHS.
        save_path: File the best state_dict is written to.
            NOTE(review): the default name says "lr5e-5" although the notebook sets
            LR = 1e-5; it is left unchanged because the loading cell reads this exact file.
    """
    if epochs is None:
        epochs = EPOCHS

    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    # Start below any reachable F1 so the first epoch always writes a checkpoint,
    # even when its score is exactly 0 (with a start value of 0 no file was saved then).
    best_score = -1.0

    for epoch_num in range(epochs):
        model.train()  # switch to training mode (validation() leaves the model in eval mode)
        train_loss = []
        for input_ids, attention_mask, train_label in tqdm(train_loader):
            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)  # argument order matches the model's forward()

            batch_loss = criterion(output, train_label.long())
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            optimizer.step()

        val_loss, val_score = validation(model, criterion, test_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        # Keep only the weights of the best-scoring epoch on disk.
        if best_score < val_score:
            best_score = val_score
            torch.save(model.state_dict(), save_path,
                       _use_new_zipfile_serialization=False)

### validation 함수 정의

In [None]:
def validation(model, criterion, test_loader, device):
    """Evaluate `model` on `test_loader`.

    Returns:
        A pair (losses, macro_f1): the list of per-batch loss values and the
        macro-averaged F1 score over all samples seen.
    """
    model.eval()  # evaluation mode
    losses, predicted, expected = [], [], []

    # Gradients are not needed while evaluating, so autograd is switched off.
    with torch.no_grad():
        for input_ids, attention_mask, valid_label in tqdm(test_loader):
            labels = valid_label.to(device)
            logits = model(input_ids.to(device), attention_mask.to(device))

            losses.append(criterion(logits, labels.long()).item())

            predicted.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            expected.extend(labels.detach().cpu().numpy().tolist())

    macro_f1 = f1_score(expected, predicted, average="macro")
    return losses, macro_f1

## 모델 학습

In [None]:
# Build the classifier and an Adam optimizer with learning rate LR, then fine-tune.
# No model.eval() here: train() puts the model into training mode at the start of
# every epoch, so switching to eval mode beforehand has no effect and only misleads.

model = BaseModel()
optimizer = Adam(params=model.parameters(), lr=LR)

train(model, optimizer, train_dataloader, valid_dataloader, device)

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at tae898/emoberta-base were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at tae898/emoberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1216/12

Epoch [0], Train Loss : [0.99561] Val Loss : [0.90806] Val F1 Score : [0.47121]


100%|██████████| 1216/1216 [04:05<00:00,  4.94it/s]
100%|██████████| 33/33 [00:01<00:00, 18.13it/s]


Epoch [1], Train Loss : [0.79553] Val Loss : [0.87281] Val F1 Score : [0.56110]


100%|██████████| 1216/1216 [04:04<00:00,  4.97it/s]
100%|██████████| 33/33 [00:01<00:00, 18.14it/s]


Epoch [2], Train Loss : [0.64929] Val Loss : [0.93471] Val F1 Score : [0.58350]


100%|██████████| 1216/1216 [04:04<00:00,  4.97it/s]
100%|██████████| 33/33 [00:01<00:00, 18.22it/s]


Epoch [3], Train Loss : [0.54071] Val Loss : [1.02007] Val F1 Score : [0.53028]


100%|██████████| 1216/1216 [04:04<00:00,  4.98it/s]
100%|██████████| 33/33 [00:01<00:00, 18.16it/s]


Epoch [4], Train Loss : [0.44679] Val Loss : [1.09986] Val F1 Score : [0.57092]


100%|██████████| 1216/1216 [04:04<00:00,  4.97it/s]
100%|██████████| 33/33 [00:01<00:00, 18.24it/s]


Epoch [5], Train Loss : [0.37901] Val Loss : [1.29543] Val F1 Score : [0.48619]


100%|██████████| 1216/1216 [04:04<00:00,  4.97it/s]
100%|██████████| 33/33 [00:01<00:00, 18.14it/s]


Epoch [6], Train Loss : [0.32606] Val Loss : [1.25362] Val F1 Score : [0.49868]


100%|██████████| 1216/1216 [04:04<00:00,  4.98it/s]
100%|██████████| 33/33 [00:01<00:00, 18.13it/s]


Epoch [7], Train Loss : [0.28857] Val Loss : [1.36517] Val F1 Score : [0.44543]


100%|██████████| 1216/1216 [04:04<00:00,  4.98it/s]
100%|██████████| 33/33 [00:01<00:00, 18.19it/s]


Epoch [8], Train Loss : [0.25804] Val Loss : [1.40994] Val F1 Score : [0.51641]


100%|██████████| 1216/1216 [04:04<00:00,  4.98it/s]
100%|██████████| 33/33 [00:01<00:00, 18.15it/s]

Epoch [9], Train Loss : [0.24039] Val Loss : [1.39087] Val F1 Score : [0.48976]





```
Epoch = 5 / BS = 16
Epoch [0], Train Loss : [0.99802] Val Loss : [0.92437] Val F1 Score : [0.44305]
100%|██████████| 1216/1216 [01:31<00:00, 13.34it/s]
100%|██████████| 33/33 [00:00<00:00, 44.52it/s]
Epoch [1], Train Loss : [0.80344] Val Loss : [0.90892] Val F1 Score : [0.54457]
100%|██████████| 1216/1216 [01:30<00:00, 13.42it/s]
100%|██████████| 33/33 [00:00<00:00, 44.45it/s]
Epoch [2], Train Loss : [0.66720] Val Loss : [0.92293] Val F1 Score : [0.55155]
100%|██████████| 1216/1216 [01:31<00:00, 13.35it/s]
100%|██████████| 33/33 [00:00<00:00, 44.70it/s]
Epoch [3], Train Loss : [0.55763] Val Loss : [1.00066] Val F1 Score : [0.55593]
100%|██████████| 1216/1216 [01:31<00:00, 13.32it/s]
100%|██████████| 33/33 [00:00<00:00, 44.30it/s]Epoch [4], Train Loss : [0.45546] Val Loss : [1.16590] Val F1 Score : [0.55022]
```

## 모델 로드

In [None]:
# Rebuild the architecture and restore the checkpoint written by train().
infer_model = BaseModel()
# map_location moves the stored tensors onto the current device, so a checkpoint
# saved during a GPU run can also be loaded on a CPU-only runtime.
infer_model.load_state_dict(torch.load('emoberta_model_lr5e-5.pth', map_location=device))
# Trailing ';' keeps the notebook from printing the full module tree returned by eval().
infer_model.eval();

Some weights of the model checkpoint at tae898/emoberta-base were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at tae898/emoberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BaseModel(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

## 모델 예측

### test 데이터로더 생성

In [None]:
# Wrap the test DataFrame under its own name so `test` is not overwritten;
# re-running this cell therefore never wraps an already-wrapped dataset.
test_dataset = CustomDataset(test, mode="test")
test_dataloader = DataLoader(test_dataset, batch_size=BS, shuffle=False)

### inference 함수 정의

In [None]:
def inference(model, test_loader, device):
    """Predict a class index for every sample in `test_loader`.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns class scores.
        test_loader: Loader yielding (input_ids, attention_mask) batches.
        device: Device the model and every batch are moved to.

    Returns:
        List of predicted class indices, in the order the loader produced the samples.
    """
    model.to(device)
    model.eval()  # evaluation mode
    test_predict = []

    # Prediction needs no gradients; without no_grad() every forward pass would
    # record an autograd graph and hold on to memory for nothing.
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_loader):
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            y_pred = model(input_id, mask)
            test_predict += y_pred.argmax(1).detach().cpu().numpy().tolist()

    return test_predict

In [None]:
# Run the restored model over the test loader; `preds` holds one predicted class index per sample.
preds = inference(infer_model, test_dataloader, device)

100%|██████████| 327/327 [00:17<00:00, 18.29it/s]

Done.





# Result

In [None]:
# Check only the first 10 predicted labels

print(preds[:10])

[6, 4, 4, 4, 3, 3, 3, 3, 5, 3]


In [None]:
# Convert the integer class ids produced via LabelEncoder back to the original string labels.
# NOTE(review): `preds` is overwritten in place, so this cell is presumably not safe to run twice.

preds = le.inverse_transform(preds)
preds[:10]

array(['surprise', 'neutral', 'neutral', 'neutral', 'joy', 'joy', 'joy',
       'joy', 'sadness', 'joy'], dtype=object)

## 예측 결과물 Submit

In [None]:
# Load the submission template (columns ID and Target, per the preview below).
submit = pd.read_csv('sample_submission.csv')
submit.head()

Unnamed: 0,ID,Target
0,TEST_0000,NAN
1,TEST_0001,NAN
2,TEST_0002,NAN
3,TEST_0003,NAN
4,TEST_0004,NAN


In [None]:
# Fill the Target column with the decoded predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
submit['Target'] = preds
submit.head()

Unnamed: 0,ID,Target
0,TEST_0000,surprise
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy


In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('emoberta_epoch_10.csv', index=False)