In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import preprocess_text
import random
import tqdm
from sklearn.model_selection import train_test_split

from models.transformer import Transformer
from data.collate_fn import collate_fn

import torch

from data.dataset import AihubTranslationDataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


#### 데이터 전처리

In [2]:
# Training dataset built from the filtered AI Hub translation CSV.
# Each raw text is passed through preprocess_text by the dataset class.
# NOTE(review): max_len=60 presumably caps the token sequence length and
# add_special_tokens=True presumably adds the tokenizer's start/end tokens;
# confirm against data/dataset.py.
train_ds_options = dict(
    csv_path="ai_hub_dataset/train_filtered.csv",
    preprocess_fn=preprocess_text,
    max_len=60,
    add_special_tokens=True,
)
train_ds = AihubTranslationDataset(**train_ds_options)

In [3]:
# Spot-check one example: show the target-side token ids of item 1972.
sample_item = train_ds[1972]
print(sample_item["tgt_ids"])

tensor([  101,  2009,  2003,  2036,  2691,  2005,  1037,  7968,  2158,  2000,
         2022,  8040, 27479,  2030,  2191, 13219,  6567,  1012,   102])


In [4]:
# Wrap the training dataset in a DataLoader that yields shuffled batches of two
# examples, assembled by the project's collate_fn.
# The shuffle order is drawn from a dedicated, explicitly seeded generator so
# that a fresh "Restart Kernel -> Run All" reproduces the same batches. The
# generator's state advances as it is consumed, so successive passes over the
# loader still see different orderings.
SHUFFLE_SEED = 42
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

In [5]:
# Peek at the first batch only, to eyeball what collate_fn produces.
# An empty loader prints nothing and leaves `batch` unassigned.
batch_iter = iter(train_loader)
try:
    batch = next(batch_iter)
except StopIteration:
    pass
else:
    print(batch)

{'encoder_input_ids': tensor([[ 7102,  2255,  2116,  5419,  2470,  5586,  2259,  3814, 13513,  2121,
         27135,  4306,    26,  2517,  3135,  9742,  2507,  2062,    18,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [14034,  2069,  1343,  2259,  6442, 15351,  1154,  2093,  2138,  1343,
          2227, 26914,  2259,    23,  2134,  2515,  2466,  2115,  2116,  4454,
         31302,  2259,  1902,  3683,  1504,  2031,  4081, 27135,  4477, 13582,
          2069,  4976,  2205,  2259,   842,  2259,  3919,  2119, 15804, 18699,
          2155,  2079,  4480,  2470, 11117,  2052,  4807, 11187,  3677,  2205,
          2062,    18]]), 'decoder_input_ids': tensor([[  101,  1996,  2522,  5428,  3366,  2679,  2001,  1996,  4369,  1997,
          2049,  2785,  2012,  1996, 2

#### 모델 학습

In [13]:
from torch.optim import Adam
from train.lr_scheduler import NoamLR

In [None]:
# Device selection: use CUDA when it is available, otherwise fall back to CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the Transformer and move it to the selected device.
transformer_config = {
    "src_vocab_size": 32000,  # vocab_size of the "klue/bert-base" tokenizer
    "tgt_vocab_size": 30522,  # vocab_size of the "bert-base-uncased" tokenizer
    "src_len": 60,  # encoder input length limit
    "tgt_len": 60,  # decoder input length limit
    "d_model": 512,
    "d_ff": 2048,
    "n_heads": 8,
    "num_encoder_layers": 6,
    "num_decoder_layers": 6,
    "dropout": 0.3,
}
model = Transformer(**transformer_config).to(device)

In [None]:
# Define the optimizer and the learning-rate scheduler.
# NOTE(review): lr=1 presumably acts only as a base value that NoamLR scales or
# overrides on each step; confirm against train/lr_scheduler.py.
adam_options = {"lr": 1, "betas": (0.9, 0.98), "eps": 1e-9}
optimizer = torch.optim.Adam(model.parameters(), **adam_options)

# d_model here repeats the value passed to the Transformer (512).
scheduler = NoamLR(optimizer, d_model=512, warmup_steps=4000)