In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/NLP_project

/content/drive/MyDrive/NLP_project


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os

In [None]:
path = os.getcwd()

train_path = path + '/data/' + 'train.txt'
valid_path = path + '/data/' + 'valid.txt'
test_path = path  + '/data/' + 'test.txt'

In [None]:
"""Dataset Loading"""
dataclass = "types"
batch_size = 16
sample = 1.0

num_workers = 2

clsNum = 8

## utils

In [None]:
def encode_right_truncated(text, tokenizer, max_length=300):
    tokenized = tokenizer.tokenize(text, max_length = max_length, truncation=True)
    truncated = tokenized[-max_length:]
    ids = tokenizer.convert_tokens_to_ids(truncated)
    
    return [tokenizer.cls_token_id] + ids

In [None]:
def padding(ids_list, tokenizer):
    max_len = 0
    for ids in ids_list:
        if len(ids) > max_len:
            max_len = len(ids)
    
    pad_ids = []
    for ids in ids_list:
        pad_len = max_len-len(ids)
        add_ids = [tokenizer.pad_token_id for _ in range(pad_len)]
        
        pad_ids.append(ids+add_ids)
    
    return torch.tensor(pad_ids)

In [None]:
def Make_batch(sessions):
    batch_input, batch_labels, batch_speaker_tokens = [], [], []
    for session in sessions:
        data = session[0]
        label_list = session[1]
        
        context_speaker, utt, ethics_types, immoral = data
        now_speaker = context_speaker[-1]
        speaker_utt_list = []

        inputString = ""
        for turn, (speaker, utt) in enumerate(zip(context_speaker, utt)):
            inputString += '<s' + str(speaker+1) + '> ' # s1, s2, s3...
            inputString += utt + " "

            if turn<len(context_speaker)-1 and speaker == now_speaker:
                speaker_utt_list.append(encode_right_truncated(utt, KcELECTRA_tokenizer, max_length=511))
        
        concat_string = inputString.strip()
        batch_input.append(encode_right_truncated(concat_string, KcELECTRA_tokenizer, max_length=511))
        
        if len(label_list) > 3:
            label_ind = label_list.index(ethics_types)
        else:
            label_ind = label_list.index(immoral)
        batch_labels.append(label_ind)        
        
        batch_speaker_tokens.append(padding(speaker_utt_list, KcELECTRA_tokenizer))
    
    batch_input_tokens = padding(batch_input, KcELECTRA_tokenizer)
    batch_labels = torch.tensor(batch_labels)

    if len(batch_input_tokens) > 512:
      batch_input_tokens = batch_input_tokens[:512]
    
    return batch_input_tokens, batch_labels, batch_speaker_tokens

In [None]:
def CELoss(pred_outs, labels):
    """
        pred_outs: [batch, clsNum]
        labels: [batch]
    """
    loss = nn.CrossEntropyLoss()
    loss_val = loss(pred_outs, labels)
    return loss_val

In [None]:
def _CalACC(model, dataloader):
    model.eval()
    correct = 0
    label_list = []
    pred_list = []
    
    # label arragne
    with torch.no_grad():
        for i_batch, data in enumerate(dataloader):            
            """Prediction"""
            batch_input_tokens, batch_labels, batch_speaker_tokens = data
            batch_input_tokens, batch_labels = batch_input_tokens.cuda(), batch_labels.cuda()
            
            pred_logits = model(batch_input_tokens, batch_speaker_tokens) # (1, clsNum)
            
            """Calculation"""    
            pred_label = pred_logits.argmax(1).item()
            true_label = batch_labels.item()
            
            pred_list.append(pred_label)
            label_list.append(true_label)
            if pred_label == true_label:
                correct += 1
        acc = correct/len(dataloader)
    return acc, pred_list, label_list

In [None]:
def _SaveModel(model, path):
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(model.state_dict(), os.path.join(path, 'model.pt'))

## Model

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m119.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
# KcELECTRA
from transformers import AutoModel, AutoTokenizer
KcELECTRA_model = AutoModel.from_pretrained("beomi/KcELECTRA-base-v2022")
KcELECTRA_tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")

Downloading (…)lve/main/config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base-v2022 were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
class ERC_model(nn.Module):
    def __init__(self, clsNum):
        super(ERC_model, self).__init__()
        self.gpu = True
        
        """Model Setting"""
        # KcELECTRA
        self.model = KcELECTRA_model
        tokenizer = KcELECTRA_tokenizer
        
        tokenizer.add_special_tokens({'cls_token': '[CLS]', 'pad_token': '[PAD]'})
        self.model.resize_token_embeddings(len(tokenizer))

        self.hiddenDim = self.model.config.hidden_size

        self.W = nn.Linear(self.hiddenDim, clsNum)

        zero = torch.empty(2, 1, self.hiddenDim).cuda()
        self.h0 = torch.zeros_like(zero) # (num_layers * num_directions, batch, hidden_size)
        self.speakerGRU = nn.GRU(self.hiddenDim, self.hiddenDim, 2, dropout=0.3) # (input, hidden, num_layer) (BERT_emb, BERT_emb, num_layer)

        """score"""
        self.W = nn.Linear(self.hiddenDim, clsNum)

        """parameters"""
        self.train_params = list(self.model.parameters())+list(self.speakerGRU.parameters())+list(self.W.parameters())

    def forward(self, batch_input_tokens, batch_speaker_tokens):
        """
            batch_input_tokens: (batch, len)
            batch_speaker_tokens: [(speaker_utt_num, len), ..., ]
        """
        
        batch_context_output = self.model(batch_input_tokens)[0][:,0,:] # (batch, 768)

        batch_speaker_output = []
        for speaker_tokens in batch_speaker_tokens:
            if speaker_tokens.shape[0] == 0:
                speaker_track_vector = torch.zeros(1, self.hiddenDim).cuda()
            else:
                speaker_output = self.model(speaker_tokens.cuda())[0][:,0,:] # (speaker_utt_num, 768)
                speaker_output = speaker_output.unsqueeze(1) # (speaker_utt_num, 1, 768)
                speaker_GRU_output, _ = self.speakerGRU(speaker_output, self.h0) # (speaker_utt_num, 1, 768) <- (seq_len, batch, output_size)
                speaker_track_vector = speaker_GRU_output[-1,:,:] # (1, 768)
            batch_speaker_output.append(speaker_track_vector)
        batch_speaker_output = torch.cat(batch_speaker_output, 0) # (batch, 768)

        final_output = batch_context_output + batch_speaker_output           
        context_logit = self.W(final_output) # (batch, clsNum)
        
        return context_logit

In [None]:
class KoEthcis_loader(Dataset):
    def __init__(self, txt_file, dataclass):
        self.dialogs = []

        f = open(txt_file, 'r', encoding = "utf-8")
        dataset = f.readlines()
        f.close()

        temp_speakerList = []
        context = []
        context_speaker = []
        self.speakerNum = []
        
        types_dict = {
            "['CENSURE']":'CENSURE', "['HATE']":'HATE',
            "['DISCRIMINATION']":'DISCRIMINATION', "['SEXUAL']":'SEXUAL',
            "['ABUSE']":'ABUSE', "['VIOLENCE']":'VIOLENCE',
            "['CRIME']":'CRIME', "['IMMORAL_NONE']":'IMMORAL_NONE'}
        self.immoral_dict = {
            'True': ['CENSURE', 'HATE', 'DISCRIMINATION',
                     'SEXUAL', 'ABUSE', 'VIOLENCE', 'CRIME'],
            'False': ['IMMORAL_NONE']}

        self.typesSet = set(types_dict.values())
        self.immoralSet = set()

        for i, data in enumerate(dataset):
            if i < 2:
                continue
            if data == '\n' and len(self.dialogs) > 0:
                self.speakerNum.append(len(temp_speakerList))
                temp_speakerList = []
                context = []
                context_speaker = []
                continue
          
            ID, speaker, utt, ethics_types, immoral = data.strip().split('\t')
            context.append(utt)
            if speaker not in temp_speakerList:
                temp_speakerList.append(speaker)
            speakerCLS = temp_speakerList.index(speaker)
            context_speaker.append(speakerCLS)

            self.dialogs.append([context_speaker[:], context[:], types_dict[ethics_types], immoral])
            self.typesSet.add(types_dict[ethics_types])
            self.immoralSet.add(immoral)

        self.typesList = sorted(self.typesSet)
        self.immoralList = sorted(self.immoralSet)

        if dataclass == 'types':
            self.labelList = self.typesList
        else:
            self.labelList = self.immoralList        
        self.speakerNum.append(len(temp_speakerList))
        
    def __len__(self):
        return len(self.dialogs)

    def __getitem__(self, idx):
        return self.dialogs[idx], self.labelList, self.immoralList

In [31]:
model = ERC_model(clsNum)
model = model.cuda()
model.train()

train_dataset = KoEthcis_loader(train_path, dataclass)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=Make_batch)
train_sample_num = int(len(train_dataset)*sample)

test_dataset = KoEthcis_loader(test_path, dataclass)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=Make_batch)

valid_dataset = KoEthcis_loader(valid_path, dataclass)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=Make_batch)

## Train

In [None]:
"""Training Setting"""     

from transformers import get_linear_schedule_with_warmup

training_epochs = 10
save_term = int(training_epochs/5)
max_grad_norm = 10
lr = 1e-5
num_training_steps = len(train_dataset)*training_epochs
num_warmup_steps = len(train_dataset)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr) # , eps=1e-06, weight_decay=0.01
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

save_path = path+'/model'

In [None]:
"""Input & Label Setting"""
best_vaild_fscore, best_test_fscore = 0, 0
best_vaild_fscore_macro, best_vaild_fscore_micro, best_test_fscore_macro, best_test_fscore_micro = 0, 0, 0, 0    
best_epoch = 0

In [None]:
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

In [33]:
for epoch in tqdm(range(training_epochs)):
    model.train() 
    for i_batch, data in enumerate(train_dataloader):
        if i_batch > train_sample_num:
            print(i_batch, train_sample_num)
            break
        
        """Prediction"""
        batch_input_tokens, batch_labels, batch_speaker_tokens = data
        batch_input_tokens, batch_labels = batch_input_tokens.cuda(), batch_labels.cuda()
        
        pred_logits = model(batch_input_tokens, batch_speaker_tokens)

        """Loss calculation & training"""
        loss_val = CELoss(pred_logits, batch_labels)
        
        loss_val.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
    """Valid & Test evaluation"""
    model.eval()
    val_acc, val_pred_list, val_label_list = _CalACC(model, valid_dataloader)
    
    val_pre, val_rec, val_fbeta, _ = precision_recall_fscore_support(val_label_list, val_pred_list, average='weighted')

    """Best Score & Model Save"""
    if val_fbeta > best_vaild_fscore:
        best_valid_fscore = val_fbeta
        
        test_acc, test_pred_list, test_label_list = _CalACC(model, test_dataloader)
        test_pre, test_rec, test_fbeta, _ = precision_recall_fscore_support(test_label_list, test_pred_list, average='weighted')                
        
        best_epoch = epoch
        _SaveModel(model, save_path)

    print('Epoch: {}'.format(epoch))
    print('Devleopment ## accuracy: {}, precision: {}, recall: {}, fscore: {}'.format(val_acc, val_pre, val_rec, val_fbeta))
    print()

print('Final Fscore ## test-accuracy: {}, test-fscore: {}, test_epoch: {}'.format(test_acc, test_fbeta, best_epoch))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 10%|█         | 1/10 [00:14<02:08, 14.33s/it]

Epoch: 0
Devleopment ## accuracy: 0.4594594594594595, precision: 0.3540540540540541, recall: 0.4594594594594595, fscore: 0.3499288762446657



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 20%|██        | 2/10 [00:25<01:40, 12.60s/it]

Epoch: 1
Devleopment ## accuracy: 0.5405405405405406, precision: 0.46015843429636527, recall: 0.5405405405405406, fscore: 0.4602784602784603



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 30%|███       | 3/10 [00:37<01:24, 12.04s/it]

Epoch: 2
Devleopment ## accuracy: 0.4594594594594595, precision: 0.37201907790143085, recall: 0.4594594594594595, fscore: 0.4111207982175724



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 40%|████      | 4/10 [00:47<01:08, 11.36s/it]

Epoch: 3
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 50%|█████     | 5/10 [00:57<00:54, 10.80s/it]

Epoch: 4
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 60%|██████    | 6/10 [01:07<00:42, 10.65s/it]

Epoch: 5
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 70%|███████   | 7/10 [01:21<00:34, 11.57s/it]

Epoch: 6
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 80%|████████  | 8/10 [01:30<00:22, 11.01s/it]

Epoch: 7
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 90%|█████████ | 9/10 [01:41<00:10, 10.99s/it]

Epoch: 8
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 10/10 [01:53<00:00, 11.31s/it]

Epoch: 9
Devleopment ## accuracy: 0.5135135135135135, precision: 0.4126447876447876, recall: 0.5135135135135135, fscore: 0.4570732895057219

Final Fscore ## test-accuracy: 0.5, test-fscore: 0.47413127413127415, test_epoch: 9



