In [1]:
# Mount Google Drive into the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch to the project folder; a later cell uses os.getcwd() as the project root.
%cd /content/drive/MyDrive/NLP_project

/content/drive/MyDrive/NLP_project


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os

In [4]:
# Project root is the current working directory; the three dataset splits
# live in its `data` sub-folder.
path = os.getcwd()

train_path, valid_path, test_path = (
    f'{path}/data/{split}.txt' for split in ('train', 'valid', 'test')
)

In [5]:
"""Dataset Loading"""
# Label scheme: 'types' makes KoEthcis_loader expose the ethics-type labels;
# any other value makes it expose the immoral labels instead.
dataclass = "types"
# Batch size of the training DataLoader (valid/test loaders use batch_size=1).
batch_size = 16
# Fraction multiplied with len(train_dataset) to obtain train_sample_num.
sample = 1.0

# Passed as num_workers to every DataLoader.
num_workers = 2

# Number of classes; passed to ERC_model as the output width of its linear head.
clsNum = 8

## utils

In [6]:
def encode_right_truncated(text, tokenizer, max_length=300):
    """Encode ``text`` as ``[CLS]`` + token ids, keeping only the right-most tokens.

    Args:
        text: Raw string to tokenize.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids`` and
            ``cls_token_id`` (e.g. a Hugging Face tokenizer).
        max_length: Maximum number of content tokens to keep; the CLS id is
            added on top of this budget.

    Returns:
        A list of ints: the CLS id followed by the ids of the last
        ``max_length`` tokens of ``text``.
    """
    if max_length <= 0:
        # `tokens[-0:]` would return every token rather than none.
        return [tokenizer.cls_token_id]

    # Tokenize without truncation arguments: a fast tokenizer would otherwise
    # cut the *end* of the sequence first, which makes the tail slice below a
    # no-op and keeps the wrong side of the text.
    tokens = tokenizer.tokenize(text)
    kept = tokens[-max_length:]

    return [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(kept)

In [7]:
def padding(ids_list, tokenizer):
    """Right-pad every id list to the length of the longest one.

    Args:
        ids_list: List of lists of token ids.
        tokenizer: Object whose ``pad_token_id`` is used as the fill value.

    Returns:
        A tensor built from the padded lists, one row per input list.
    """
    target_len = max((len(seq) for seq in ids_list), default=0)

    padded_rows = [
        seq + [tokenizer.pad_token_id for _ in range(target_len - len(seq))]
        for seq in ids_list
    ]

    return torch.tensor(padded_rows)

In [8]:
def Make_batch(sessions):
    """Collate dataset items into a padded token tensor and a label tensor.

    Each item is indexed as ``item[0]`` = ``(utterance, first_label,
    second_label)`` and ``item[1]`` = list of label names. When the label list
    has more than three entries the first label is looked up in it, otherwise
    the second one is.

    Returns:
        ``(batch_input_tokens, batch_labels)``: the padded id tensor produced
        by ``padding`` and a 1-D tensor of label indices.
    """
    encoded_utts = []
    label_indices = []

    for item in sessions:
        record, label_names = item[0], item[1]
        utt, fine_label, coarse_label = record

        encoded_utts.append(encode_right_truncated(utt.strip(), KcELECTRA_tokenizer))

        # More than three label names means the fine-grained label set is in use.
        target = fine_label if len(label_names) > 3 else coarse_label
        label_indices.append(label_names.index(target))

    return padding(encoded_utts, KcELECTRA_tokenizer), torch.tensor(label_indices)

In [9]:
def CELoss(pred_outs, labels):
    """Cross-entropy loss with the default (mean) reduction.

        pred_outs: [batch, clsNum] unnormalised logits
        labels: [batch] class indices
    """
    return nn.functional.cross_entropy(pred_outs, labels)

In [10]:
def _CalACC(model, dataloader):
    model.eval()
    correct = 0
    label_list = []
    pred_list = []
    
    # label arragne
    with torch.no_grad():
        for i_batch, data in enumerate(dataloader):            
            """Prediction"""
            batch_input_tokens, batch_labels = data
            batch_input_tokens, batch_labels = batch_input_tokens.cuda(), batch_labels.cuda()
            
            pred_logits = model(batch_input_tokens) # (1, clsNum)
            
            """Calculation"""    
            pred_label = pred_logits.argmax(1).item()
            true_label = batch_labels.item()
            
            pred_list.append(pred_label)
            label_list.append(true_label)
            if pred_label == true_label:
                correct += 1
        acc = correct/len(dataloader)
    return acc, pred_list, label_list

In [11]:
def _SaveModel(model, path):
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(model.state_dict(), os.path.join(path, 'model.pt'))

## Model

In [12]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [13]:
# KcELECTRA
# Load the pretrained encoder and its tokenizer from the same checkpoint name.
# Both are module-level globals: ERC_model wraps KcELECTRA_model, and
# Make_batch / ERC_model use KcELECTRA_tokenizer.
from transformers import AutoModel, AutoTokenizer
KcELECTRA_model = AutoModel.from_pretrained("beomi/KcELECTRA-base-v2022")
KcELECTRA_tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")

Downloading (…)lve/main/config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base-v2022 were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [24]:
class ERC_model(nn.Module):
    """KcELECTRA encoder followed by a linear classifier on the first token.

    The encoder and tokenizer are the module-level ``KcELECTRA_model`` and
    ``KcELECTRA_tokenizer`` objects; they are shared, not copied.

    Args:
        clsNum: Number of output classes of the linear head.
    """

    def __init__(self, clsNum):
        super(ERC_model, self).__init__()

        self.gpu = True

        # Model setting: KcELECTRA encoder and its tokenizer.
        self.model = KcELECTRA_model
        tokenizer = KcELECTRA_tokenizer

        tokenizer.add_special_tokens({'cls_token': '[CLS]', 'pad_token': '[PAD]'})
        self.model.resize_token_embeddings(len(tokenizer))
        self.hiddenDim = self.model.config.hidden_size
        # Remembered so forward() can mask padded positions. Stored as a plain
        # attribute, so the state_dict layout is unchanged.
        self.pad_token_id = tokenizer.pad_token_id

        # Scoring head: hidden state -> class logits.
        self.W = nn.Linear(self.hiddenDim, clsNum)

    def forward(self, batch_input_tokens):
        """
            batch_input_tokens: (batch, len) token ids, padded with the pad id

            returns: (batch, clsNum) logits
        """
        # Without an explicit mask the encoder attends to [PAD] positions, so
        # the output for a sample would depend on how much padding its batch
        # happens to need.
        attention_mask = (batch_input_tokens != self.pad_token_id).long()

        encoder_out = self.model(input_ids=batch_input_tokens, attention_mask=attention_mask)
        batch_context_output = encoder_out.last_hidden_state[:, 0, :]  # (batch, hiddenDim)
        context_logit = self.W(batch_context_output)  # (batch, clsNum)
        return context_logit


In [25]:
class KoEthcis_loader(Dataset):
    """Dataset over a tab-separated ethics corpus file.

    The first two lines of the file are skipped. Every other non-blank line
    must hold five tab-separated fields: ID, speaker, utterance, ethics types
    and immoral flag. Each record is stored as
    ``[utterance, ethics_type, immoral]``.

    Args:
        txt_file: Path of the UTF-8 text file to read.
        dataclass: ``'types'`` to expose the ethics-type label list as
            ``labelList``; any other value exposes the immoral label list.

    Raises:
        ValueError: A data line does not have exactly five fields.
        KeyError: The ethics-types field is not one of the known keys.
    """

    def __init__(self, txt_file, dataclass):
        self.dialogs = []
        self.speakerNum = []

        # Maps the raw ethics-types column to the label name.
        types_dict = {
            "['CENSURE']":'CENSURE', "['HATE']":'HATE',
            "['DISCRIMINATION']":'DISCRIMINATION', "['SEXUAL']":'SEXUAL',
            "['ABUSE']":'ABUSE', "['VIOLENCE']":'VIOLENCE',
            "['CRIME']":'CRIME', "['IMMORAL_NONE']":'IMMORAL_NONE'}
        self.immoral_dict = {
            'True': ['CENSURE', 'HATE', 'DISCRIMINATION',
                     'SEXUAL', 'ABUSE', 'VIOLENCE', 'CRIME'],
            'False': ['IMMORAL_NONE']}

        # Seeded with every known type so label indices do not depend on which
        # types happen to occur in this particular file.
        self.typesSet = set(types_dict.values())
        self.immoralSet = set()

        with open(txt_file, 'r', encoding="utf-8") as f:
            dataset = f.readlines()

        for i, data in enumerate(dataset):
            if i < 2:
                # The first two lines are not data records.
                continue

            line = data.strip()
            if not line:
                # Skip blank lines wherever they occur, including before the
                # first record, instead of failing on tuple unpacking.
                continue

            fields = line.split('\t')
            if len(fields) != 5:
                raise ValueError(
                    "{}: line {} has {} tab-separated fields, expected 5".format(
                        txt_file, i + 1, len(fields)))
            ID, speaker, utt, ethics_types, immoral = fields

            self.dialogs.append([utt, types_dict[ethics_types], immoral])
            self.typesSet.add(types_dict[ethics_types])
            self.immoralSet.add(immoral)

        self.typesList = sorted(self.typesSet)
        self.immoralList = sorted(self.immoralSet)

        if dataclass == 'types':
            self.labelList = self.typesList
        else:
            self.labelList = self.immoralList
        # No speakers are tracked by this loader; the attribute keeps its
        # previous value of [0].
        self.speakerNum.append(0)

    def __len__(self):
        """Number of loaded records."""
        return len(self.dialogs)

    def __getitem__(self, idx):
        """Return ``(record, labelList, immoralList)`` for record ``idx``."""
        return self.dialogs[idx], self.labelList, self.immoralList

In [27]:
# Build the classifier, move it to the GPU and switch it to training mode.
model = ERC_model(clsNum)
model = model.cuda()
model.train()

# Training split: shuffled mini-batches of `batch_size`, collated by Make_batch.
train_dataset = KoEthcis_loader(train_path, dataclass)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=Make_batch)
# Training-set size scaled by the `sample` fraction.
train_sample_num = int(len(train_dataset)*sample)

# Test split: one sample per batch, in file order.
test_dataset = KoEthcis_loader(test_path, dataclass)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=Make_batch)

# Validation split: one sample per batch, in file order.
valid_dataset = KoEthcis_loader(valid_path, dataclass)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=Make_batch)

## Train

In [28]:
"""Training Setting"""     

from transformers import get_linear_schedule_with_warmup

training_epochs = 10
save_term = int(training_epochs/5)
max_grad_norm = 10  # gradient-norm clipping threshold used in the training loop
lr = 1e-5

# The scheduler is stepped once per *batch*, so its horizon must be counted in
# batches. Counting samples (len(train_dataset)) would stretch the warmup
# phase by a factor of batch_size, so the learning rate would never reach
# `lr` within `training_epochs` epochs.
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch*training_epochs
num_warmup_steps = steps_per_epoch  # warm up during the first epoch
optimizer = torch.optim.AdamW(model.parameters(), lr=lr) # , eps=1e-06, weight_decay=0.01
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# Directory the best checkpoint is written to.
save_path = path+'/model'

In [29]:
"""Input & Label Setting"""
# Initial values of the best-score trackers; the training loop compares each
# epoch's validation F-score against best_vaild_fscore.
best_vaild_fscore, best_test_fscore = 0, 0
best_vaild_fscore_macro, best_vaild_fscore_micro, best_test_fscore_macro, best_test_fscore_micro = 0, 0, 0, 0    
# Epoch index at which the best validation score was observed.
best_epoch = 0

In [30]:
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

In [None]:
import math
import time

# Defined before the loop so the final report cannot raise NameError when no
# epoch ever beats the initial best score.
test_acc, test_fbeta = 0, 0

# train_sample_num counts samples while the inner loop counts batches, so
# convert the cap to batches once. With sample == 1.0 this equals the number
# of batches and the cap never triggers.
max_train_batches = math.ceil(train_sample_num / train_dataloader.batch_size)

for epoch in tqdm(range(training_epochs)):
    start_time = time.time()  # Start time of the epoch

    model.train()
    for i_batch, data in enumerate(train_dataloader):
        if i_batch >= max_train_batches:
            print(i_batch, train_sample_num)
            break

        # Prediction
        batch_input_tokens, batch_labels = data
        batch_input_tokens, batch_labels = batch_input_tokens.cuda(), batch_labels.cuda()

        pred_logits = model(batch_input_tokens)

        # Loss calculation & training
        loss_val = CELoss(pred_logits, batch_labels)

        loss_val.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Valid & Test evaluation
    model.eval()
    val_acc, val_pred_list, val_label_list = _CalACC(model, valid_dataloader)

    val_pre, val_rec, val_fbeta, _ = precision_recall_fscore_support(val_label_list, val_pred_list, average='weighted')

    # Best Score & Model Save
    if val_fbeta > best_vaild_fscore:
        # Update the variable that is actually compared above. Assigning to a
        # differently spelled name left the threshold at 0, so every epoch
        # overwrote the "best" checkpoint.
        best_vaild_fscore = val_fbeta

        test_acc, test_pred_list, test_label_list = _CalACC(model, test_dataloader)
        test_pre, test_rec, test_fbeta, _ = precision_recall_fscore_support(test_label_list, test_pred_list, average='weighted')

        best_epoch = epoch
        _SaveModel(model, save_path)

    print('Epoch: {}'.format(epoch))
    print('Devleopment ## accuracy: {}, precision: {}, recall: {}, fscore: {}'.format(val_acc, val_pre, val_rec, val_fbeta))
    print('Elapsed: {:.1f}s'.format(time.time() - start_time))
    print()

print('Final Fscore ## test-accuracy: {}, test-fscore: {}, test_epoch: {}'.format(test_acc, test_fbeta, best_epoch))

  0%|          | 0/10 [00:00<?, ?it/s]