In [1]:
import os
import sys
import json
import getopt

import torch
import random
import numpy as np
import torch.backends.cudnn as cudnn

from tqdm import tqdm
from collections import defaultdict

from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.nn import CosineEmbeddingLoss

from transformers import AutoTokenizer
from datasets.dataset import CustomDataset, get_csv_data
from sklearn.metrics.pairwise import cosine_similarity

from models.roberta_encoder import RobertaModel
from logger.mylogger import set_logger

In [2]:
# Load the experiment configuration (data, model, training and GPU settings).
# The file holds non-ASCII (Korean) keys and values, so the encoding is pinned to
# UTF-8 rather than left to the platform's locale-dependent default.
with open('configs/sample.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
print(config)

{'data_args': {'path': 'generated_questions_1068.csv', 'eval_data_path': '../test_data/mirae_test.csv', 'mask_eval_data_path': '../test_data/unk_mrn_test.csv', 'eval_att_list': {'수학 용어': ['설명', '페이지', '단원 번호', '용어'], '수학 단원': ['대단원', '소단원', '학습 목표', '페이지', '단원 번호'], '수학 문제': ['문제 내용', '정답 페이지', '풀이', '페이지', '단원 번호']}, 'shuffle': True}, 'model_args': {'backbone': 'klue/roberta-base', 'tokenizer': 'klue/roberta-base'}, 'train_args': {'version': 'v20', 'masking_token': '[UNK]', 'batch_size': 32, 'learning_rate': 5e-06, 'margin': 0.5, 'epoch': 20, 'early_stopping': 3}, 'gpu': {'type': 'single', 'number': '1'}, 'Desc': '777, [UNK] 활용, GN: 5개, HN: 최대 20개, Dev 데이터 추가, learning_rate 1e-6로 변경'}


In [3]:

# Create the run logger from the training-data file name and the experiment
# version, then record the full configuration in the log.
log_data_name = config["data_args"]["path"]
log_version = config["train_args"]["version"]
logger = set_logger(log_data_name, log_version)
# logger.info(f"config file: {arg}")
logger.info(f"config content: {config}")

# Disabled GPU pinning, kept for reference:
# if config['gpu']['type'] == 'single':
#     os.environ["CUDA_DEVICE_ORDER"]= "PCI_BUS_ID"
#     os.environ["CUDA_VISIBLE_DEVICES"]= config['gpu']['number']

2024-04-28 21:31:51 - INFO - config content: {'data_args': {'path': 'generated_questions_1068.csv', 'eval_data_path': '../test_data/mirae_test.csv', 'mask_eval_data_path': '../test_data/unk_mrn_test.csv', 'eval_att_list': {'수학 용어': ['설명', '페이지', '단원 번호', '용어'], '수학 단원': ['대단원', '소단원', '학습 목표', '페이지', '단원 번호'], '수학 문제': ['문제 내용', '정답 페이지', '풀이', '페이지', '단원 번호']}, 'shuffle': True}, 'model_args': {'backbone': 'klue/roberta-base', 'tokenizer': 'klue/roberta-base'}, 'train_args': {'version': 'v20', 'masking_token': '[UNK]', 'batch_size': 32, 'learning_rate': 5e-06, 'margin': 0.5, 'epoch': 20, 'early_stopping': 3}, 'gpu': {'type': 'single', 'number': '1'}, 'Desc': '777, [UNK] 활용, GN: 5개, HN: 최대 20개, Dev 데이터 추가, learning_rate 1e-6로 변경'}


In [4]:
# Sanity check: display whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
cuda_available

True

In [5]:
seed = 777
# Seed every random number generator used here (Python, NumPy, PyTorch CPU and
# CUDA) with the same value.
random.seed(seed)
np.random.seed(seed)
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)
# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
cudnn.deterministic = True
cudnn.benchmark = False
# Author's note: results differ by about 3~4% depending on the seed.

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
logger.info(f'device: {device}')
print('device:', device)

2024-04-28 21:31:52 - INFO - device: cuda:0


device: cuda:0


In [6]:
# Read the backbone and tokenizer identifiers from the model section of the
# config and display them as a pair.
model_args = config['model_args']
model_id, tokenizer_id = model_args['backbone'], model_args['tokenizer']
(model_id, tokenizer_id)

('klue/roberta-base', 'klue/roberta-base')

In [7]:
# Load the tokenizer and the project's RobertaModel encoder from the identifiers
# selected in the config (e.g. "klue/roberta-base").
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
model = RobertaModel.from_pretrained(model_id)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Announce the start of the data-loading stage: a dashed rule and the message on
# stdout, the same message in the run log, then a closing rule.
rule = '-'*10
print(rule)
print('Data Loading Start!!')
logger.info(f'Data Loading Start!!')
print(rule)


2024-04-28 21:31:55 - INFO - Data Loading Start!!


----------
Data Loading Start!!
----------


In [9]:
## dataset class
# The training CSV lives under ./data/; its file name comes from the config.
data_file_name = config["data_args"]["path"]
path = "./data/" + data_file_name
logger.info(f'Base Data: {config["data_args"]["path"]}')

2024-04-28 21:31:55 - INFO - Base Data: generated_questions_1068.csv


In [10]:
# Build the training dataset from the CSV at `path`, handing the tokenizer to the
# project-defined CustomDataset (imported from datasets.dataset).
# NOTE(review): the "origin_data: ..." line in this cell's output is presumably
# printed inside CustomDataset — confirm against its source.
train_dataset = CustomDataset(path, tokenizer)

origin_data: (1068, 4)


In [None]:
# NOTE(review): this whole cell is commented out, so eval_data and mask_eval_data
# are never created in the visible notebook and the training-set size is not
# logged. Either restore these lines or delete the cell.
# eval_data = get_csv_data(config["data_args"]["eval_data_path"])
# mask_eval_data = get_csv_data(config["data_args"]["mask_eval_data_path"])
# logger.info(f'Training Data Amount: {len(train_dataset)}')
# print('Training Data Amount:', len(train_dataset))

In [13]:
# Mark the end of the data-loading stage on stdout (between dashed rules) and in
# the run log.
print('-'*10, 'Data Loading Complete!!', sep='\n')
logger.info(f'Data Loading Complete!!')
print('-'*10)

2024-04-28 20:10:45 - INFO - Data Loading Complete!!


----------
Data Loading Complete!!
----------


In [11]:
## data loader
# Take the batch size from the config so that `batch_size` is the value the
# DataLoader really uses, instead of an unrelated hard-coded number.
batch_size = config["train_args"]["batch_size"]
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=config["data_args"]["shuffle"],
)
## optimizer and loss
# AdamW over all model parameters; learning rate and cosine-loss margin both
# come from the training section of the config.
optimizer = AdamW(model.parameters(), lr=config["train_args"]["learning_rate"])
loss_function = CosineEmbeddingLoss(margin=config["train_args"]["margin"])

In [12]:
print('-'*10)
print('Training Start!!')
logger.info(f'Training Start!!')
print('-'*10)

# Move the model to the selected device once; train mode is re-enabled at the
# start of every epoch below.
model.to(device)
model.train()
# NOTE(review): global_acc and es_count are initialised but never updated in this
# loop, so the "early stop count" in the log is always 0 and the configured
# early_stopping value has no effect. Early stopping still needs implementing.
global_acc = 0
es_count = 0
for epoch in range(config["train_args"]["epoch"]):
    logger.info(f'epoch {epoch} ..., early stop count: {es_count}')
    train_loss = 0  # running sum of per-batch losses for this epoch
    cnt = 0         # number of batches processed this epoch

    ## Train
    model.train()
    for data in tqdm(train_dataloader):
        query, refer, label = data['query'], data['refer'], data['labels']
        optimizer.zero_grad()

        # Encode query and reference with the same model.
        # assumes model(...) returns one embedding tensor per input — TODO confirm
        # in models.roberta_encoder
        query_embd = model(input_ids=query['input_ids'].to(device), attention_mask=query['attention_mask'].to(device))
        refer_embd = model(input_ids=refer['input_ids'].to(device), attention_mask=refer['attention_mask'].to(device))

        # Cosine embedding loss between the two outputs, with the labels as target.
        loss = loss_function(query_embd, refer_embd, label.to(device))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        cnt += 1

    # An empty dataloader would otherwise raise ZeroDivisionError here.
    avg_train_loss = train_loss / cnt if cnt else float('nan')
    print(f'epoch: {epoch}, train_loss: {avg_train_loss}')
    logger.info(f'epoch: {epoch}, train_loss: {avg_train_loss}')
    # Save a checkpoint after every epoch, in a per-version, per-epoch directory.
    model.save_pretrained(f'../files/roberta_trained_weight_{config["train_args"]["version"]}/{epoch}/')

print('-'*10)
print('Training Complete!!')
# Logged as well as printed, like the other stage banners in this notebook.
logger.info('Training Complete!!')
print('-'*10)

2024-04-28 21:32:17 - INFO - Training Start!!


----------
Training Start!!
----------


2024-04-28 21:32:18 - INFO - epoch 0 ..., early stop count: 0
  3%|▎         | 23/844 [00:03<02:09,  6.33it/s]


KeyboardInterrupt: 