### Install library

In [1]:
# !pip install transformers[sentencepiece]
# !pip install torch/xla
# !pip install pandas
# !pip install numpy
# !pip install torch
# !pip install -q soynlp emoji==1.7.0

### Imports

In [1]:
import re
import os
import time
import random
import torch
import numpy as np
import pandas as pd
import emoji
from soynlp.normalizer import repeat_normalize
print(torch.__version__)
print(emoji.__version__)

random.seed(2022)
torch.manual_seed(2022)
np.random.seed(2022)

1.10.0+cu102
1.7.0


In [2]:
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
# from transformers import TrainingArguments
# from pytorch_lightning import Trainer

2022-09-14 02:12:50.430391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-14 02:12:50.614575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/lib
2022-09-14 02:12:50.614599: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-14 02:12:50.648708: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-14 02:12:51.508328

In [3]:
# using TPU through torch
import torch_xla
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.utils.serialization as xser
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

print(torch_xla.__version__)

1.10


### TPU setting

In [4]:
## .py돌릴 때,
#!export XRT_TPU_CONFIG="localservice;0;localhost:51011"
import os
os.environ['XRT_TPU_CONFIG'] = "localservice;0;localhost:51011"

In [5]:
device = xm.xla_device()

2022-09-14 02:12:53.743677: E tensorflow/core/framework/op_kernel.cc:1676] OpKernel ('op: "TPURoundRobin" device_type: "CPU"') for unknown op: TPURoundRobin
2022-09-14 02:12:53.743729: E tensorflow/core/framework/op_kernel.cc:1676] OpKernel ('op: "TpuHandleToProtoKey" device_type: "CPU"') for unknown op: TpuHandleToProtoKey


In [6]:
device

device(type='xla', index=1)

In [7]:
xm.xla_real_devices([str(device)])[0]

'TPU:0'

### Initialize model

In [8]:
model = AutoModelForMaskedLM.from_pretrained("klue/roberta-large")

In [9]:
klue_roberta_large_parameters = model.num_parameters() / 1_000_000
print(f"'>>>  KLUE_RoBERTa_large number of parameters : {round(klue_roberta_large_parameters)}M'")

'>>>  KLUE_RoBERTa_large number of parameters : 337M'


### Load tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [11]:
model.config

RobertaConfig {
  "_name_or_path": "klue/roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

In [12]:
tokenizer.tokenize

<bound method PreTrainedTokenizerFast.tokenize of PreTrainedTokenizerFast(name_or_path='klue/roberta-large', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

### normal Langauge Data load

In [6]:
conv = pd.read_csv('conversation_data.csv',encoding='utf-8')
conv

Unnamed: 0,text
0,이름만 빼고 다잠옷이다
1,중간에 신의 판결이 더 지니어 퍽퍽
2,이름 목소리가 좀 달라진건 기분탖
3,중간에 평창 말할때 평창올림픽 하는줄 영미퍼버버버버벅
4,씨그마가 시그널로
...,...
604648,저런거 먹으면 귀여워지나
604649,내까까 귀염
604650,나도 비속어 좋아하는데
604651,저절로 웃음이 나옴 넘 귀여워


In [7]:
def preprocess(df): #전처리 코드
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    
    x_list = []
    for x in conv.text:
        x = str(x)
        x = pattern.sub(' ', x)
        x = url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2) # ㅋㅋㅋㅋㅋㅋ .. -> ㅋㅋ 같이 반복되는 문자에 대해 변환 
        x_list.append(x)
    return x_list
pre_conv = preprocess(conv)

In [8]:
result = []
for i in pre_conv:
    if len(i) <= 200:
        result.append(i)

In [9]:
len(result)

592017

In [10]:
res = pd.DataFrame(result)
res.columns = ['text']
res.to_csv('conv.csv',index=False,encoding='utf-8')

### Build the dataset

In [24]:
# default loading option = "utf-8"
block_size = 512        # 256, 384, 512 
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='conv.csv',
    block_size=block_size
)



In [25]:
dataset

<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x7f5fd94a8700>

### Define the data collator

In [26]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Model to tpu

In [27]:
model = model.to(device)

### Train model

In [28]:
start = time.time()
batch_size = 16        # 8, 16, 32
num_train_epochs = 8
trained_model_path = f"test_mlm/{batch_size}_{block_size}_20000_e8"

# os.mkdir(trained_model_path)

'''
TrainingArguments parameters
https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py
'''
training_args = TrainingArguments(
    output_dir=trained_model_path,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,                 # total number of training epochs
    per_device_train_batch_size=batch_size,      # batch size per device during training
    save_total_limit=2,
    weight_decay = 0.01,
    tpu_num_cores = 85,
    seed = 2022,
    data_seed = 2022,
    dataloader_pin_memory = True,
    max_steps = 12_500
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()
print("time :", time.time() - start)

max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 592018
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12500


Step,Training Loss
500,4.1149
1000,3.7927
1500,3.7089
2000,3.6701
2500,3.6549
3000,3.5641
3500,3.579
4000,3.6044
4500,3.4095
5000,3.4127


Saving model checkpoint to test_mlm/16_512_20000_e8/checkpoint-500
Configuration saved in test_mlm/16_512_20000_e8/checkpoint-500/config.json
Model weights saved in test_mlm/16_512_20000_e8/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_mlm/16_512_20000_e8/checkpoint-1000
Configuration saved in test_mlm/16_512_20000_e8/checkpoint-1000/config.json
Model weights saved in test_mlm/16_512_20000_e8/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_mlm/16_512_20000_e8/checkpoint-1500
Configuration saved in test_mlm/16_512_20000_e8/checkpoint-1500/config.json
Model weights saved in test_mlm/16_512_20000_e8/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [test_mlm/16_512_20000_e8/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to test_mlm/16_512_20000_e8/checkpoint-2000
Configuration saved in test_mlm/16_512_20000_e8/checkpoint-2000/config.json
Model weights saved in test_mlm/16_512_20000_e8/checkpoint-2000/pytorch_model.bin
De

time : 14824.021312475204


### Save model

In [29]:
trainer.save_model(trained_model_path)

Saving model checkpoint to test_mlm/16_512_20000_e8
Configuration saved in test_mlm/16_512_20000_e8/config.json
Model weights saved in test_mlm/16_512_20000_e8/pytorch_model.bin


#### The impact of Block size

| Block size | Epochs | Batch size | Total step | Loss | Time |
| - | - | - | - | - | - |
| 256 | 2 | 16 | 10336 | 2.? | |
| 384 |  | | | | |
| 512 |  | | | | |

#### The impact of Batch size

| Block size | Epochs | Batch size | Total step | Loss | Time |
| - | - | - | - | - | - |
| 512 |  | 8 |  |  | |
| 512 |  | 16 | | |  |
| 512 |  | 32 | | | |

In [13]:
## repo
MODEL_SAVE_REPO = 'klue_batch16_block512_con200' # ex) 'my-bert-fine-tuned'
HUGGINGFACE_AUTO_TOKEN = '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' # https://huggingface.co/settings/token
 
## Push to huggingface-hub
model.push_to_hub(
			'test_mlm/16_512_20000_e8/checkpoint-12500', 
			use_temp_dir=True, 
			use_auth_token=HUGGINGFACE_AUTO_TOKEN
)
tokenizer.push_to_hub(
			'test_mlm/16_512_20000_e8/checkpoint-12500', 
			use_temp_dir=True, 
			use_auth_token=HUGGINGFACE_AUTO_TOKEN
)

Cloning https://huggingface.co/qlqqqk/checkpoint-12500 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/1.25G [00:00<?, ?B/s]