In [1]:
import os
import sys
import json
import getopt

import torch
import random
import numpy as np
import torch.backends.cudnn as cudnn

from tqdm import tqdm
from collections import defaultdict

from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.nn import CosineEmbeddingLoss

from transformers import AutoTokenizer
from datasets.dataset import CustomDataset, get_csv_data
from sklearn.metrics.pairwise import cosine_similarity

from models.roberta_encoder import RobertaModel
from logger.mylogger import set_logger
import wandb  # Import wandb
import datetime

In [2]:
# Load the experiment configuration (data paths, model ids, training hyper-parameters).
# The JSON contains non-ASCII (Korean) keys and values, so the encoding is pinned to
# UTF-8 instead of relying on the platform default, which differs between systems.
with open('configs/sample.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
print(config)

{'data_args': {'path': 'generated_questions_single_q_test.csv', 'eval_data_path': 'generated_questions_single_q_train.csv', 'mask_eval_data_path': '../test_data/unk_mrn_test.csv', 'eval_att_list': {'수학 용어': ['설명', '페이지', '단원 번호', '용어'], '수학 단원': ['대단원', '소단원', '학습 목표', '페이지', '단원 번호'], '수학 문제': ['문제 내용', '정답 페이지', '풀이', '페이지', '단원 번호']}, 'shuffle': True}, 'model_args': {'backbone': 'klue/roberta-base', 'tokenizer': 'klue/roberta-base'}, 'train_args': {'version': 'v20', 'masking_token': '[UNK]', 'batch_size': 32, 'learning_rate': 5e-06, 'margin': 0.5, 'epoch': 20, 'early_stopping': 3}, 'gpu': {'type': 'single', 'number': '1'}, 'Desc': '777, [UNK] 활용, GN: 5개, HN: 최대 20개, Dev 데이터 추가, learning_rate 1e-6로 변경'}


In [None]:
# Start a Weights & Biases run.
# - the entity is read from the WANDB_ENTITY environment variable (None when it is unset)
# - the run name carries a minute-resolution timestamp taken when this cell executes
wandb.init(
    project="roberta_training",
    entity=os.getenv('WANDB_ENTITY'),
    name=f'roberta_training_{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")}',
)

In [4]:
# Build the project logger from the training-data file name and the run version,
# then record the full configuration so the run can be reconstructed from the log.
logger = set_logger(
    config["data_args"]["path"],
    config["train_args"]["version"],
)
logger.info(f"config content: {config}")

2024-04-29 23:16:48 - INFO - config content: {'data_args': {'path': 'generated_questions_single_q_test.csv', 'eval_data_path': 'generated_questions_single_q_train.csv', 'mask_eval_data_path': '../test_data/unk_mrn_test.csv', 'eval_att_list': {'수학 용어': ['설명', '페이지', '단원 번호', '용어'], '수학 단원': ['대단원', '소단원', '학습 목표', '페이지', '단원 번호'], '수학 문제': ['문제 내용', '정답 페이지', '풀이', '페이지', '단원 번호']}, 'shuffle': True}, 'model_args': {'backbone': 'klue/roberta-base', 'tokenizer': 'klue/roberta-base'}, 'train_args': {'version': 'v20', 'masking_token': '[UNK]', 'batch_size': 32, 'learning_rate': 5e-06, 'margin': 0.5, 'epoch': 20, 'early_stopping': 3}, 'gpu': {'type': 'single', 'number': '1'}, 'Desc': '777, [UNK] 활용, GN: 5개, HN: 최대 20개, Dev 데이터 추가, learning_rate 1e-6로 변경'}


In [5]:
# Sanity check: displays whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [6]:
# Fix every random number generator used in this notebook to the same seed.
seed = 777

# Python and NumPy generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU and CUDA).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN: deterministic mode on, benchmark mode off.
cudnn.deterministic = True
cudnn.benchmark = False
# Note: results vary by roughly 3-4% depending on the seed.

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
logger.info(f'device: {device}')
print('device:', device)

2024-04-29 23:16:48 - INFO - device: cuda:0


device: cuda:0


In [7]:
# Backbone and tokenizer identifiers come from the "model_args" section of the config.
model_id, tokenizer_id = (
    config['model_args']['backbone'],
    config['model_args']['tokenizer'],
)
# Display both identifiers as the cell output.
(model_id, tokenizer_id)

('klue/roberta-base', 'klue/roberta-base')

In [8]:
# Instantiate the encoder (the project's RobertaModel from models.roberta_encoder)
# from the backbone identifier selected in the config.
model = RobertaModel.from_pretrained(
    model_id
)
# Load the tokenizer matching the tokenizer identifier selected in the config.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_id
)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# --- Load the training data -------------------------------------------------
print('-' * 10)
print('Data Loading Start!!')
logger.info('Data Loading Start!!')
print('-' * 10)

# The training CSV lives under ./data/; its file name comes from the config.
path = "./data/" + config["data_args"]["path"]
logger.info(f'Base Data: {config["data_args"]["path"]}')

# CustomDataset receives the CSV path and the tokenizer.
train_dataset = CustomDataset(path, tokenizer)

print('-' * 10)
print('Data Loading Complete!!')
logger.info('Data Loading Complete!!')
print('-' * 10)

2024-04-29 23:16:49 - INFO - Data Loading Start!!
2024-04-29 23:16:49 - INFO - Base Data: generated_questions_single_q_test.csv


----------
Data Loading Start!!
----------
origin_data: (2189, 6)


2024-04-29 23:17:08 - INFO - Data Loading Complete!!


----------
Data Loading Complete!!
----------


In [10]:
## data loader
# The batch size comes from the config. Previously a hard-coded `batch_size = 16`
# was assigned here but never used, which contradicted the value actually passed
# to the DataLoader; the variable now reflects the real batch size.
batch_size = config["train_args"]["batch_size"]
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=config["data_args"]["shuffle"],
)
## optimizer and loss
# AdamW over all model parameters, learning rate taken from the config.
optimizer = AdamW(model.parameters(), lr=config["train_args"]["learning_rate"])
# Cosine-embedding loss between query and reference embeddings, margin taken from the config.
loss_function = CosineEmbeddingLoss(margin=config["train_args"]["margin"])

In [11]:
class EarlyStopping:
    """Track the validation loss and flag when training should stop.

    Call the instance once per epoch with the validation loss. An epoch counts
    as an improvement only when the loss drops by at least ``min_delta`` below
    the best loss seen so far. After ``patience`` consecutive non-improving
    epochs, ``early_stop`` is set to True.

    Attributes:
        patience (int): Allowed number of consecutive non-improving epochs.
        min_delta (float): Minimum decrease of the loss that counts as an improvement.
        counter (int): Current number of consecutive non-improving epochs.
        best_score (float | None): Negated best validation loss seen so far,
            or None before the first valid loss is recorded.
        early_stop (bool): True once ``counter`` has reached ``patience``.
    """

    def __init__(self, patience=3, min_delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 3
            min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                                Default: 0
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record the validation loss of one epoch and update the stopping state.

        Args:
            val_loss (float): Validation loss of the epoch that just finished.
                A NaN loss is treated as a non-improving epoch.
        """
        import math  # local import so the class does not depend on the notebook's import cell

        # A NaN loss compares False against everything. Without this guard it would
        # be stored as the best score and every later epoch would look like an
        # improvement, so early stopping could never trigger.
        if math.isnan(val_loss):
            self._register_no_improvement()
            return

        # Work with the negated loss so that "larger is better".
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.min_delta:
            self._register_no_improvement()
        else:
            self.best_score = score
            self.counter = 0

    def _register_no_improvement(self):
        """Count one non-improving epoch and set ``early_stop`` once patience is used up."""
        self.counter += 1
        print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True
# Early-stopping tracker for the training loop: patience comes from the config,
# and a loss decrease must be at least 0.01 to count as an improvement.
early_stopping = EarlyStopping(
    patience=config["train_args"]["early_stopping"],
    min_delta=0.01,
)

In [12]:
def compute_validation_loss(val_dataloader, model, device, loss_function):
    """Return the mean per-batch loss of ``model`` over ``val_dataloader``.

    The model is switched to evaluation mode and is NOT switched back, so a
    caller that continues training must call ``model.train()`` itself.

    Args:
        val_dataloader: Iterable of batches; each batch is a mapping with the keys
            'query' and 'refer' (each holding 'input_ids' and 'attention_mask'
            tensors) and 'labels'.
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning the embeddings passed to ``loss_function``.
        device: Device the input tensors and labels are moved to.
        loss_function: Callable ``(query_embd, refer_embd, labels)`` returning a
            scalar tensor.

    Returns:
        The unweighted average of the batch losses, or 0 when the dataloader
        yields no batches.
    """
    model.eval()
    batch_losses = []

    # Gradients are not needed for validation.
    with torch.no_grad():
        for batch in val_dataloader:
            query, refer, label = batch['query'], batch['refer'], batch['labels']

            # Encode both sides of the pair on the target device.
            query_embd = model(
                input_ids=query['input_ids'].to(device),
                attention_mask=query['attention_mask'].to(device),
            )
            refer_embd = model(
                input_ids=refer['input_ids'].to(device),
                attention_mask=refer['attention_mask'].to(device),
            )

            batch_loss = loss_function(query_embd, refer_embd, label.to(device))
            batch_losses.append(batch_loss.item())

    # Guard against an empty dataloader.
    if len(batch_losses) == 0:
        return 0
    return sum(batch_losses) / len(batch_losses)


In [13]:
# Validation data: the CSV under ./data/ named by config["data_args"]["eval_data_path"],
# processed with the same CustomDataset class and tokenizer as the training data.
# NOTE(review): in the config dump printed earlier in this notebook, 'path' (used for
# training) names a *_test.csv file and 'eval_data_path' names a *_train.csv file -
# confirm the two files are not swapped.
path = "./data/" + config["data_args"]["eval_data_path"]
val_dataset = CustomDataset(path, tokenizer)

# Same batch size as training; validation batches are iterated in fixed order.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=config["train_args"]["batch_size"],
    shuffle=False,
)

origin_data: (677, 6)


In [20]:
# --- Training loop ------------------------------------------------------------
# Per epoch: one pass over train_dataloader, then validation, early-stopping check,
# and a checkpoint whenever the validation loss is the best seen so far.
print('-'*10)
print('Training Start!!')
logger.info(f'Training Start!!')
print('-'*10)

model.to(device)
# Not read anywhere in this cell; kept so that any cell relying on these names still works.
global_acc = 0
es_count = 0
# Directory the best checkpoint is written to (computed once, used for save and upload).
best_model_dir = f'files/roberta_trained_weight_{config["train_args"]["version"]}/best_model/'

for epoch in range(config["train_args"]["epoch"]):
    # compute_validation_loss() puts the model in eval mode and does not switch it
    # back, so training mode must be re-enabled at the start of EVERY epoch.
    # Calling model.train() only once before the loop left all epochs after the
    # first running in eval mode.
    model.train()
    train_loss = 0
    cnt = 0
    for batch_idx, data in enumerate(tqdm(train_dataloader)):
        query, refer, label = data['query'], data['refer'], data['labels']
        optimizer.zero_grad()

        # Encode the query and the reference with the same model.
        query_embd = model(input_ids=query['input_ids'].to(device), attention_mask=query['attention_mask'].to(device))
        refer_embd = model(input_ids=refer['input_ids'].to(device), attention_mask=refer['attention_mask'].to(device))

        loss = loss_function(query_embd, refer_embd, label.to(device))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        cnt += 1

    # Log training loss (mean over the batches of this epoch)
    average_train_loss = train_loss / cnt
    wandb.log({"train_loss": average_train_loss})
    print(f'epoch: {epoch}, train_loss: {average_train_loss}')
    logger.info(f'epoch: {epoch}, train_loss: {average_train_loss}')

    # Compute validation loss after each training epoch
    validation_loss = compute_validation_loss(val_dataloader, model, device, loss_function)
    wandb.log({"val_loss": validation_loss})
    print(f'epoch: {epoch}, validation_loss: {validation_loss}')
    logger.info(f'epoch: {epoch}, validation_loss: {validation_loss}')

    # Early stopping based on validation loss
    early_stopping(validation_loss)
    if early_stopping.early_stop:
        print("Early stopping triggered")
        logger.info("Early stopping triggered")
        break

    # Save the model if it's the best so far: the tracker stores the negated best
    # loss, so equality means this epoch's loss was just recorded as the best.
    if early_stopping.best_score == -validation_loss:
        model.save_pretrained(best_model_dir)
        wandb.save(f'{best_model_dir}*')

2024-04-29 23:13:42 - INFO - Training Start!!


----------
Training Start!!
----------


100%|██████████| 844/844 [02:01<00:00,  6.97it/s]
2024-04-29 23:15:43 - INFO - epoch: 0, train_loss: 0.01893337024596489


epoch: 0, train_loss: 0.01893337024596489


NameError: name 'val_dataloader' is not defined

In [None]:
# Announce the end of training and close the Weights & Biases run opened earlier in this notebook.
print('Training Complete!!')
wandb.finish()