# N2M - 날짜 정규화 모델 학습




**N2M 실습 : 다양한 형태의 날짜 데이터를 입력받아 YYYY-MM-DD 형태로 생성하는 모델**

**실습 개요**
- Transformers Encoder-Decoder 모델을 활용한 Sequence to Sequence 문제 실습

**실습 배경 및 목적**
- N2M 태스크 이해도 향상
- Huggingface를 활용한 최신 Encoder-Decoder 모델 사용
- WandB 사용 방법 학습

**데이터셋(https://github.com/htw5295/Neural_date_translation_dataset)**
- [Faker](https://faker.readthedocs.io/en/master/) 라이브러리(MIT License)로 직접 생성한 날짜 표기 데이터
- 입력 : 다양한 형태의 날짜 표기 데이터
- 출력 : yyyy-mm-dd 형태의 날짜 표기 데이터
- 학습 데이터 : 24,000개
- 검증 데이터 : 3,000개
- 평가 데이터 : 3,000개

**모델**
- [facebook/bart-base](https://huggingface.co/facebook/bart-base)의 Tokenizer와 Config 활용
- Huggingface의 [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForSeq2SeqLM) 모델 활용

**평가**
- 예측 데이터를 yyyy-mm-dd 형식으로 디코딩한 뒤 정답 데이터와 비교하여 일치, 불일치를 판단함

# 라이브러리 설치 및 임포트

In [2]:
# !pip install pytorch-lightning
# !pip install tqdm
# !pip install transformers
# !pip install wandb -qU

In [3]:
from tqdm.auto import tqdm

import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

import transformers

import random

import pandas as pd



# W&B 로그인

In [4]:
## 아래 Cell 을 수행하기 전에, wandb 에 반드시 가입이 되어 있어야 합니다.
## https://wandb.ai/authorize 에 있는 key 값을 복사 후 입력
!wandb login 88e24c21ad0b69301576313c09b37ec29bf6a0d0

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/kingstar/.netrc


# 데이터

In [5]:
# https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])

    def __len__(self):
        return len(self.data)

In [6]:
# https://pytorch-lightning.readthedocs.io/en/stable/extensions/datamodules.html?highlight=datamodule
class Dataloader(pl.LightningDataModule):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

        # 페이스북 bart-base 모델의 토크나이저 불러오기
        self.tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/bart-base')

        # github에 업로드된 데이터셋 다운로드
        # index 컬럼이 없으므로 index_col= False
        # 데이터를 토큰화
        self.train_data = self.tokenizing(pd.read_csv('https://raw.githubusercontent.com/htw5295/Neural_date_translation_dataset/main/train.csv', index_col=False))
        self.val_data   = self.tokenizing(pd.read_csv('https://raw.githubusercontent.com/htw5295/Neural_date_translation_dataset/main/val.csv', index_col=False))
        self.test_data  = self.tokenizing(pd.read_csv('https://raw.githubusercontent.com/htw5295/Neural_date_translation_dataset/main/test.csv', index_col=False))

    def tokenizing(self, dataframe):
        tokenized_data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='text tokenizing', total=len(dataframe)):
            # 최대길이 16, truncation -> 최대길이에 맞게 데이터 자르기, padding -> 최대길이에 맞게 패딩토큰 추가하기
            input_date  = self.tokenizer(item['inputs'],  padding='max_length', truncation=True, max_length=16)
            target_date = self.tokenizer(item['targets'], padding='max_length', truncation=True, max_length=16)
            # 텍스트를 숫자로 변환한 input_ids를 리스트 형식으로 저장
            tokenized_data.append([input_date['input_ids'], target_date['input_ids']])

        return tokenized_data

    def setup(self, stage=None):
        if stage == 'fit':
            self.train_dataset = Dataset(self.train_data)
            self.val_dataset   = Dataset(self.val_data)
        else:
            self.test_dataset  = Dataset(self.test_data)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=32)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

# 모델

In [7]:
# https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html
class Model(pl.LightningModule):
    def __init__(self, tokenizer):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = tokenizer

        # 페이스북 bart-base 모델의 설정값 불러오기
        self.config = transformers.BartConfig.from_pretrained('facebook/bart-base')

        # 불러온 설정값을 토대로 AutoModelForSeq2SeqLM(BART) 모델 생성
        self.encoder_decoder = transformers.AutoModelForSeq2SeqLM.from_config(self.config)

    def forward(self, x, y):
        outputs  = self.encoder_decoder(input_ids=x, labels=y)
        return outputs.loss, outputs.logits

    def training_step(self, batch, batch_idx):
        x, y = batch

        # loss 측정
        loss, logits = self(x, y)
        self.log("train_loss", loss)

        # 훈련/검증 단계 때는 LOSS 등만 확인하지만, 
        # 최종 예측은 beam search 를 통해 수행되기 때문에 
        # 훈련 중간 중간의 결과물을 확인하기 위해서 아래 코드를 사용할 수 있습니다.
        # (일반적으로는 속도가 매우 느려지기 때문에 훈련단계에서는 포함하지 않습니다)

        # Beam search의 N=3으로, 3개의 문장을 생성하고, 가장 좋은 1개의 문장을 받아옴
        pred_ids = self.encoder_decoder.generate(x, num_beams=3, min_length=0, max_length=16, num_return_sequences=1)
        # 토큰 -> 텍스트 변환
        pred     = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        target   = self.tokenizer.batch_decode(y, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        # 예측값과 정답값이 일치하는지 측정
        accuracy = []
        for p, t in zip(pred, target):
            if p == t:
                accuracy.append(1)
            else:
                accuracy.append(0)
        accuracy = sum(accuracy) / len(accuracy)
        self.log("train_acc", accuracy, prog_bar=True, on_step=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch

        # loss 측정
        loss, logits = self(x, y)
        self.log("val_loss", loss)

        # 훈련/검증 단계 때는 LOSS 등만 확인하지만, 
        # 최종 예측은 beam search 를 통해 수행되기 때문에 
        # 훈련 중간 중간의 결과물을 확인하기 위해서 아래 코드를 사용할 수 있습니다.
        # (일반적으로는 속도가 매우 느려지기 때문에 훈련단계에서는 포함하지 않습니다)

        # 예측값과 정답값이 일치하는지 비교하기 위해 예측 토큰 생성
        # Beam search의 N=3으로, 3개의 문장을 생성하고, 가장 좋은 1개의 문장을 받아옴
        pred_ids = self.encoder_decoder.generate(x, num_beams=3, min_length=0, max_length=16, num_return_sequences=1)
        # 토큰 -> 텍스트 변환
        pred = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        target = self.tokenizer.batch_decode(y, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        # 예측값과 정답값이 일치하는지 측정
        accuracy = []
        for p, t in zip(pred, target):
            if p == t:
                accuracy.append(1)
            else:
                accuracy.append(0)
        accuracy = sum(accuracy) / len(accuracy)
        self.log("val_acc", accuracy, prog_bar=True, on_step=True)

        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch

        # 예측값과 정답값이 일치하는지 비교하기 위해 예측 토큰 생성
        # Beam search의 N=3으로, 3개의 문장을 생성하고, 가장 좋은 1개의 문장을 받아옴
        pred_ids = self.encoder_decoder.generate(x, num_beams=3, min_length=0, max_length=16, num_return_sequences=1)
        # 토큰 -> 텍스트 변환
        pred = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        target = self.tokenizer.batch_decode(y, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        # 예측값과 정답값이 일치하는지 측정
        accuracy = []
        for p, t in zip(pred, target):
            if p == t:
                accuracy.append(1)
            else:
                accuracy.append(0)
        accuracy = sum(accuracy) / len(accuracy)
        self.log("test_acc", accuracy, prog_bar=True)

        return accuracy

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001)
        return optimizer

# 학습 및 평가

In [8]:
batch_size = 128
max_epoch = 1

# W&B 로그 설정, 생성한 프로젝트 이름 입력
# https://wandb.ai/{유저닉네임}/{프로젝트이름} 에서 확인가능
# You want to use `wandb` logger which is not installed yet, install it with `pip install wandb`. 에러가 발생하면 런타임 다시시작을 해주세요
wandb_logger = WandbLogger(project="date")

dataloader = Dataloader(batch_size)
model = Model(dataloader.tokenizer)

# 학습한 모델의 저장을 위한 코드입니다.
# 학습 종료 후"model" 폴더에 저장된 모델이 6강 Prediction service에서 활용됩니다!
checkpoint_callback = pl.callbacks.ModelCheckpoint(dirpath="model", save_top_k=1, monitor="val_loss")
# https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=max_epoch, logger=wandb_logger, callbacks=[checkpoint_callback], log_every_n_steps=1)
trainer.fit(model=model, datamodule=dataloader)

trainer.test(model=model, datamodule=dataloader) # <- test set 에 대한 평가 진행됨

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtraintogpb[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

text tokenizing:   0%|          | 0/24000 [00:00<?, ?it/s]

text tokenizing:   0%|          | 0/3000 [00:00<?, ?it/s]

text tokenizing:   0%|          | 0/3000 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type                         | Params
-----------------------------------------------------------------
0 | encoder_decoder | BartForConditionalGeneration | 139 M 
-----------------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
2023-04-11 14:45:35.640969: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.9940000176429749
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc': 0.9940000176429749}]

# WandB

src에 자신의 wandb 프로젝트 링크를 입력해주세요.
찾는 법은 아래와 같습니다.

1. wandb 홈페이지에 로그인 한다.
2. 왼쪽 상단의 wandb logo 를 클린하다.
3. 화면에 현재 wandb에서 실험되고 있는 각 실험이 'Runs' 아래에 보인다. 
4. 내가 보고 싶은 실험(아마도 가장 상단의 것) id 를 클릭한다.
5. 보이는 실험 페이지의 URL 을 복사한다.
4. 아래의 iframe src="URL_을_여기에_복사_한다" ... 


In [10]:
%%html
<iframe src="https://wandb.ai/traintogpb/date/runs/wpvazdst?workspace=user-traintogpb" width="900" height="1000"></iframe>

# 테스트

In [11]:
# 생성 테스트
x = 'May 18, 2022'

model    = model.cpu()
inputs   = model.tokenizer(x, padding='max_length', truncation=True, max_length=16, return_tensors='pt')
pred_ids = model.encoder_decoder.generate(inputs['input_ids'], num_beams=3, min_length=0, max_length=16, num_return_sequences=3)
pred     = model.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

# 상위 3개의 결과
print(pred)

['2022-06-18', '2022-07-18', '2022-12-18']


###**콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다.

