In [1]:
import os

# Turn off the tokenizers library's internal parallelism for this process.
# NOTE(review): presumably set to avoid the Hugging Face "tokenizers" fork
# warning when the DataLoader below spawns worker processes (num_workers=4)
# — confirm. It is set in the first cell, before the tokenizer-using modules
# are imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from utils import preprocess_text
from models.transformer import Transformer
from data.collate_fn import collate_fn
from train.lr_scheduler import NoamLR
from train.loss import loss_function
from train.train import train_loop

import torch
from torch.optim import Adam

from data.dataset import AihubTranslationDataset
from torch.utils.data import DataLoader

2.4.0


  from .autonotebook import tqdm as notebook_tqdm


#### 데이터 전처리

In [3]:
# Training dataset, built from the filtered AI Hub training CSV.
TRAIN_CSV_PATH = "ai_hub_dataset/train_filtered.csv"

train_ds = AihubTranslationDataset(
    csv_path=TRAIN_CSV_PATH,
    max_len=60,  # presumably the per-sentence token limit — confirm in data/dataset.py
    add_special_tokens=True,
    preprocess_fn=preprocess_text,  # text normalisation hook imported from utils
)

In [13]:
# Show the encoded source ids ("src_ids") of one training example (index 1972).
sample = train_ds[1972]
print(sample["src_ids"])

tensor([    2,  9283,  2470,  3611,  2052,  6263,  2138,  5340,  9253,  9790,
         2073,  3794,  2069,  1889,  2259,  4411,  2119, 15136,  2062,    18,
            3])


In [15]:
from transformers import AutoTokenizer

# Load the "klue/bert-base" tokenizer so the source ids can be turned back
# into readable text.
src_tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

# Decode the same training example (index 1972) that was printed as raw ids.
sample_src_ids = train_ds[1972]["src_ids"]
print(src_tokenizer.decode(sample_src_ids))

[CLS] 현명한 사람이 사기를 당하거나 어리석은 결정을 하는 사례도 흔하다. [SEP]


#### 모델 학습

In [5]:
# Hyperparameters shared by the DataLoader, model, scheduler and training loop.
SRC_VOCAB_SIZE = 32000  # "klue/bert-base" tokenizer vocab_size
TGT_VOCAB_SIZE = 30522  # "bert-base-uncased" tokenizer vocab_size
EPOCHS = 8  # number of passes of the training loop over train_loader
BATCH_SIZE = 128  # batch_size handed to the training DataLoader
D_MODEL = 256  # passed as d_model to both Transformer and NoamLR
D_FF = 1024  # passed as d_ff to Transformer

In [6]:
# Shuffled mini-batch loader over the training dataset.
# Pinned host memory only helps host-to-GPU copies. This notebook also supports
# a CPU fallback, where pinning is useless and makes PyTorch emit a warning,
# so it is enabled only when CUDA is actually available.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=torch.cuda.is_available(),
    num_workers=4,  # batches are prepared in 4 worker processes
    collate_fn=collate_fn,  # project collate function from data.collate_fn
)

In [7]:
# Pick the compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model, then move it to the selected device.
MAX_SEQ_LEN = 60  # same value as the max_len given to the training dataset

model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    src_len=MAX_SEQ_LEN,  # encoder input length limit
    tgt_len=MAX_SEQ_LEN,  # decoder input length limit
    d_model=D_MODEL,
    d_ff=D_FF,
    n_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dropout=0.3,
)
model = model.to(device)

In [8]:
# Define the optimizer and the learning-rate scheduler.
# NOTE(review): lr=1 looks like a base factor that NoamLR rescales every step
# — confirm against train/lr_scheduler.py.
adam_betas = (0.9, 0.98)
optimizer = Adam(model.parameters(), lr=1, betas=adam_betas, eps=1e-9)
scheduler = NoamLR(optimizer, warmup_steps=4000, d_model=D_MODEL)

In [9]:
# Train for EPOCHS epochs and save the model weights after each one.
CHECKPOINT_PATH = "checkpoints/aihub-ko2en-transformer.pt"

# torch.save does not create missing parent directories. Create the checkpoint
# directory up front so a missing folder cannot raise after a multi-hour epoch
# and throw away the trained weights.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_loader, model, loss_function, optimizer, scheduler, device)
    # The same file is overwritten every epoch, so it always holds the weights
    # from the most recently completed epoch.
    torch.save(model.state_dict(), CHECKPOINT_PATH)

Epoch 1
-------------------------------


 17%|█▋        | 1001/5782 [31:39<2:43:50,  2.06s/it]

[train] epoch_end | loss 6.0566 | avg 7.3803


 35%|███▍      | 2001/5782 [1:02:40<2:04:07,  1.97s/it]

[train] epoch_end | loss 5.1696 | avg 6.5086


 52%|█████▏    | 3001/5782 [1:33:58<1:33:22,  2.01s/it]

[train] epoch_end | loss 4.8742 | avg 6.0248


 69%|██████▉   | 4001/5782 [2:04:49<1:02:38,  2.11s/it]

[train] epoch_end | loss 4.5762 | avg 5.6921


 86%|████████▋ | 5000/5782 [2:35:48<13:42,  1.05s/it]  

[train] epoch_end | loss 4.2437 | avg 5.4402


100%|██████████| 5782/5782 [3:00:17<00:00,  1.87s/it]


Epoch 2
-------------------------------


 17%|█▋        | 1001/5782 [31:01<2:39:50,  2.01s/it]

[train] epoch_end | loss 4.0457 | avg 4.0883


 35%|███▍      | 2001/5782 [1:01:55<2:07:54,  2.03s/it]

[train] epoch_end | loss 4.0008 | avg 4.0373


 52%|█████▏    | 3001/5782 [1:32:50<1:34:25,  2.04s/it]

[train] epoch_end | loss 4.0024 | avg 3.9915


 69%|██████▉   | 4001/5782 [2:03:52<59:25,  2.00s/it]  

[train] epoch_end | loss 3.8116 | avg 3.9498


 86%|████████▋ | 5000/5782 [2:34:13<13:33,  1.04s/it]  

[train] epoch_end | loss 3.6690 | avg 3.9129


100%|██████████| 5782/5782 [2:57:48<00:00,  1.85s/it]


Epoch 3
-------------------------------


 17%|█▋        | 1000/5782 [55:50<1:23:20,  1.05s/it]

[train] epoch_end | loss 3.7140 | avg 3.6355


 35%|███▍      | 2000/5782 [1:26:27<1:05:23,  1.04s/it]

[train] epoch_end | loss 3.5393 | avg 3.6201


 52%|█████▏    | 3000/5782 [1:56:58<48:45,  1.05s/it]  

[train] epoch_end | loss 3.6114 | avg 3.6024


 69%|██████▉   | 4000/5782 [2:27:24<31:21,  1.06s/it]  

[train] epoch_end | loss 3.5696 | avg 3.5871


 86%|████████▋ | 5000/5782 [2:57:49<17:30,  1.34s/it]  

[train] epoch_end | loss 3.5046 | avg 3.5719


100%|██████████| 5782/5782 [3:21:38<00:00,  2.09s/it]


Epoch 4
-------------------------------


 17%|█▋        | 1001/5782 [30:32<1:23:28,  1.05s/it]

[train] epoch_end | loss 3.4267 | avg 3.4248


 35%|███▍      | 2001/5782 [1:01:02<1:31:42,  1.46s/it]

[train] epoch_end | loss 3.5282 | avg 3.4176


 52%|█████▏    | 3001/5782 [1:31:25<1:03:23,  1.37s/it]

[train] epoch_end | loss 3.2737 | avg 3.4111


 69%|██████▉   | 4001/5782 [2:02:02<34:36,  1.17s/it]  

[train] epoch_end | loss 3.4556 | avg 3.4045


 86%|████████▋ | 5001/5782 [2:32:45<13:46,  1.06s/it]  

[train] epoch_end | loss 3.4027 | avg 3.3960


100%|██████████| 5782/5782 [2:56:53<00:00,  1.84s/it]


Epoch 5
-------------------------------


 17%|█▋        | 1001/5782 [30:54<2:32:11,  1.91s/it]

[train] epoch_end | loss 3.1943 | avg 3.2918


 35%|███▍      | 2000/5782 [1:01:48<2:40:43,  2.55s/it]

[train] epoch_end | loss 3.3027 | avg 3.2922


 52%|█████▏    | 3001/5782 [1:32:29<1:34:53,  2.05s/it]

[train] epoch_end | loss 3.2898 | avg 3.2891


 69%|██████▉   | 4001/5782 [2:03:03<58:24,  1.97s/it]  

[train] epoch_end | loss 3.2992 | avg 3.2848


 86%|████████▋ | 5001/5782 [2:33:35<18:24,  1.41s/it]  

[train] epoch_end | loss 3.2744 | avg 3.2802


100%|██████████| 5782/5782 [2:57:44<00:00,  1.84s/it]


Epoch 6
-------------------------------


 17%|█▋        | 1001/5782 [31:23<1:55:57,  1.46s/it]

[train] epoch_end | loss 3.1899 | avg 3.1991


 35%|███▍      | 2000/5782 [1:02:02<1:12:58,  1.16s/it]

[train] epoch_end | loss 3.2133 | avg 3.2009


 42%|████▏     | 2448/5782 [1:16:02<1:43:33,  1.86s/it]


KeyboardInterrupt: 

#### 번역문 생성

In [26]:
from inference import get_bleu_score
import pandas as pd

In [27]:
# Load the "번역문" column of the filtered test CSV.
# Despite the name, target_df is a single column (a Series), not a DataFrame.
TEST_CSV_PATH = "ai_hub_dataset/test_filtered.csv"
target_df = pd.read_csv(TEST_CSV_PATH)["번역문"]

In [28]:
# Sanity check of the metric: both arguments are the same column, so this is
# not a model evaluation and the printed score should be close to 1.0.
self_bleu = get_bleu_score(target_df, target_df)
print(self_bleu)

0.9999997417575798
