In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import preprocess_text
import random
import tqdm
from sklearn.model_selection import train_test_split

from models.transformer import Transformer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


#### 데이터 전처리

In [2]:
# Location of the filtered training CSV that the rest of the notebook works on.
train_data_dir = "data/train_filtered.csv"
train_df = pd.read_csv(filepath_or_buffer=train_data_dir)

# Display five randomly drawn rows as this cell's output.
train_df.sample(n=5)

Unnamed: 0,원문,번역문
390804,극중에서 박중훈이 부른 노래 ‘비와 당신’은 음악 예능 프로그램 등에서 다시 불리며...,"The song ""Rain and You"" sung by Park Joong-hoo..."
388723,음악을 들으면 들을수록 묘하게 빠져드는 리듬과 가사들이 평소 아이돌 노래에 관심도 ...,The rhythms and lyrics that fall strangely as ...
289681,매 작품마다 시청자들을 몰입시키는 강한 흡입력을 가진 임수향이 보여줄 드라마 ‘내 ...,"Expectations are high the drama ""My ID is Gang..."
434982,과목별로 포트폴리오 활동과 다양한 논·서술형 등을 포함하고 있으며 수행평가 1개당 ...,It includes portfolio activities and various t...
321428,당초 하남 감북지구가 유력하게 꼽혔으나 교산지구로 최종 낙점됐다.,"Originally, Hanam Gambuk District was prominen..."


In [3]:
# Number of rows in the loaded training frame.
print(train_df.shape[0])

740073


In [4]:
# Run the project's `preprocess_text` helper over every source/target sentence.
# Iterating the columns directly replaces a per-row `train_df[col][idx]` label
# lookup: it is much cheaper on a frame this size and keeps working when the
# frame's index is not the default 0..N-1 range.
processed_ko_text = [preprocess_text(text, lang="ko") for text in train_df["원문"]]
processed_en_text = [preprocess_text(text, lang="en") for text in train_df["번역문"]]

In [6]:
# Pretrained tokenizers loaded by checkpoint name: one for the Korean text and
# one for the English text.
ko_tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
en_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Report each tokenizer's vocabulary size, one per line (Korean first).
print(ko_tokenizer.vocab_size, en_tokenizer.vocab_size, sep="\n")

32000
30522


In [7]:
# Encode every preprocessed sentence and keep only the "input_ids" entry of
# the tokenizer's output.
# The previous extra `tokenize()` calls were removed: their results were never
# read, so each sentence was being tokenized twice. The loops now walk the
# preprocessed lists themselves instead of re-deriving the length from train_df.
tokenized_ko_text = [ko_tokenizer(text)["input_ids"] for text in processed_ko_text]
tokenized_en_text = [en_tokenizer(text)["input_ids"] for text in processed_en_text]

In [8]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Padding id, read from the tokenizer rather than hard-coded. One constant is
# shared by the encoder and decoder tensors, so fail loudly if the two
# tokenizers ever disagree instead of silently padding with the wrong id.
PAD_ID = ko_tokenizer.pad_token_id
assert PAD_ID is not None, "ko_tokenizer does not define a pad token"
assert en_tokenizer.pad_token_id == PAD_ID, (
    "ko_tokenizer and en_tokenizer use different pad_token_id values; "
    "a single PAD_ID cannot be shared"
)

# tokenized_*_text are lists of variable-length id lists, e.g.
# [[101, 2003, 102], [101, 1234, 5678, 102], ...]
# pad_sequence right-pads every sequence with PAD_ID up to the longest one in
# the list; batch_first=True puts the sentence dimension first.
enc_train = pad_sequence(
    [torch.tensor(ids, dtype=torch.long) for ids in tokenized_ko_text],
    batch_first=True,
    padding_value=PAD_ID,
)
dec_train = pad_sequence(
    [torch.tensor(ids, dtype=torch.long) for ids in tokenized_en_text],
    batch_first=True,
    padding_value=PAD_ID,
)

In [9]:
# Sanity check: dimensions of the padded encoder and decoder id tensors.
print(enc_train.size(), dec_train.size())

torch.Size([740073, 60]) torch.Size([740073, 60])


In [10]:
# Inspect one padded example (row 1121): encoder ids first, then decoder ids.
print(enc_train[1121], dec_train[1121], sep="\n")

tensor([    2,  3693,  2052,  3666, 17378,  2170,  7258,  2138,  6274,  2067,
         3739, 17378,  3816,  2116, 10315,  2138,  1122,  2069,   575,  2052,
         2241,  7450,  2052,  4763,  2205,  2259,   575,  6233,  7483,  2897,
         2062,    18,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
tensor([  101,  2009,  2003, 10009,  2008,  2065,  2859, 17607,  2015, 26269,
         2006,  1057,  1012,  1055,  1012,  7975,  2009,  2097,  5770,  4759,
         7975,  3316,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


#### 모델 학습

In [None]:
# Device setup: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model and move it to the selected device.
# The sequence lengths are read from the padded training tensors instead of
# being repeated as a literal 60, so they cannot drift from the data.
model = Transformer(
    src_vocab_size=ko_tokenizer.vocab_size,
    tgt_vocab_size=en_tokenizer.vocab_size,
    src_len=enc_train.shape[1],  # encoder input length
    tgt_len=dec_train.shape[1],  # decoder input length
    d_model=512,
    d_ff=2048,
    n_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dropout=0.3,
).to(device)