### Install library

In [1]:
# !pip install transformers[sentencepiece]
# !pip install openpyxl
# !pip install pandas
# !pip install numpy

### Imports

In [2]:
import re
import os
import time
import random
import torch
import numpy as np
import pandas as pd

random.seed(2022)
torch.manual_seed(2022)
np.random.seed(2022)

In [3]:
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

2022-09-12 14:23:19.893678: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-12 14:23:20.065875: I tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: libtpu.so
I0912 14:23:20.181225316  959549 ev_epoll1_linux.cc:121]     grpc epoll fd: 66
D0912 14:23:20.181242334  959549 ev_posix.cc:141]            Using polling engine: epoll1
D0912 14:23:20.181279362  959549 lb_policy_registry.cc:48]   registering LB policy factory for "grpclb"
D0912 14:23:20.181293728  959549 lb_policy_registry.cc:48]   registering LB policy factory for "rls_experimental"
D0912 14:23:20.181299857  959549 lb_policy_registry.cc:48]   registering LB policy factory for "priority_experimental"
D0912 14:23:20



[percpu.cc : 535] RAW: rseq syscall failed with errno 22 after membarrier sycall succeeded.


In [4]:
# using TPU through torch
import torch_xla
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.utils.serialization as xser
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

print(torch_xla.__version__)

1.12


### TPU setting

In [5]:
## .py돌릴 때,
#!export XRT_TPU_CONFIG="localservice;0;localhost:51011"
import os
os.environ['XRT_TPU_CONFIG'] = "localservice;0;localhost:51011"

In [6]:
device = xm.xla_device()

In [7]:
device

device(type='xla', index=1)

### Initialize model

In [8]:
model = AutoModelForMaskedLM.from_pretrained("klue/roberta-large")

In [9]:
klue_roberta_large_parameters = model.num_parameters() / 1_000_000
print(f"'>>>  KLUE_RoBERTa_large number of parameters : {round(klue_roberta_large_parameters)}M'")

'>>>  KLUE_RoBERTa_large number of parameters : 337M'


### Load tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [11]:
model.config

RobertaConfig {
  "_name_or_path": "klue/roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

In [12]:
tokenizer.tokenize

<bound method PreTrainedTokenizerFast.tokenize of PreTrainedTokenizerFast(name_or_path='klue/roberta-large', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

### Data load

In [13]:
data = pd.read_excel("data_processing_re(82681).xlsx")
print(len(data))
data.head()

82681


Unnamed: 0,content,emotion,label
0,아내가 드디어 출산하게 되어서 정말 신이 나.,기쁨,0
1,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야.,긴장,1
2,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워.,긴장,1
3,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야. 너무 행복해.,기쁨,0
4,빚을 드디어 다 갚게 되어서 이제야 안도감이 들어.,평화,2


### Preprocessing and Count by label

In [14]:
data = data.dropna()
data.reset_index(drop=True,inplace = True)

In [15]:
data_pro = data[['content', 'label']]
data_count = data[['content', 'emotion']]

In [16]:
data_count.groupby(by=['emotion']).count()

Unnamed: 0_level_0,content
emotion,Unnamed: 1_level_1
기쁨,11087
긴장,20222
분노,20541
슬픔,23831
중립,4827
평화,2172


In [17]:
data_pro = data_pro.iloc[:20000,:]

In [18]:
len(data_pro)

20000

In [19]:
data_pro.to_csv('data_pro', encoding='utf-8')

### Build the dataset

In [20]:
# default loading option = "utf-8"
block_size = 512        # 256, 384, 512 
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="data_pro",
    block_size=block_size
)



### Define the data collator

In [21]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Model to tpu

In [22]:
model = model.to(device)

### Train model

In [23]:
start = time.time()
batch_size = 64        # 8, 16, 32
num_train_epochs = 8
trained_model_path = f"test_mlm/{batch_size}_{block_size}_20000_e8"

os.mkdir(trained_model_path)

'''
TrainingArguments parameters
https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py
'''
training_args = TrainingArguments(
    output_dir=trained_model_path,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,                 # total number of training epochs
    per_device_train_batch_size=batch_size,      # batch size per device during training
    save_total_limit=2,
    weight_decay = 0.01,
    tpu_num_cores = 85,
    seed = 2022,
    data_seed = 2022,
    dataloader_pin_memory = True,
    max_steps = 12_500
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()
print("time :", time.time() - start)

max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 20001
  Num Epochs = 40
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 12500


Step,Training Loss
500,1.653
1000,1.45
1500,1.3756
2000,1.295
2500,1.2412
3000,1.1957
3500,1.1489
4000,1.1071
4500,1.0772
5000,1.0352


Saving model checkpoint to test_mlm/64_512_20000_e8/checkpoint-500
Configuration saved in test_mlm/64_512_20000_e8/checkpoint-500/config.json
Model weights saved in test_mlm/64_512_20000_e8/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_mlm/64_512_20000_e8/checkpoint-1000
Configuration saved in test_mlm/64_512_20000_e8/checkpoint-1000/config.json
Model weights saved in test_mlm/64_512_20000_e8/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_mlm/64_512_20000_e8/checkpoint-1500
Configuration saved in test_mlm/64_512_20000_e8/checkpoint-1500/config.json
Model weights saved in test_mlm/64_512_20000_e8/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [test_mlm/64_512_20000_e8/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to test_mlm/64_512_20000_e8/checkpoint-2000
Configuration saved in test_mlm/64_512_20000_e8/checkpoint-2000/config.json
Model weights saved in test_mlm/64_512_20000_e8/checkpoint-2000/pytorch_model.bin
De

time : 7890.072662115097


### Save model

In [24]:
trainer.save_model(trained_model_path)

Saving model checkpoint to test_mlm/64_512_20000_e8
Configuration saved in test_mlm/64_512_20000_e8/config.json
Model weights saved in test_mlm/64_512_20000_e8/pytorch_model.bin


#### The impact of Block size

| Block size | Epochs | Batch size | Total step | Loss | Time | Max. Mem |
| - | - | - | - | - | - | - |
| 256 | 2 | 16 | 10336 | 1.837 | 2h | 72G |
| 384 | 2 | 16 | 10336 | 1.837 | 2h | 72G |
| 512 | 2 | 16 | 10336 | 1.837 | 2h | 72G |

#### The impact of Batch size

| Block size | Epochs | Batch size | Total step | Loss | Time | Max. MeM |
| - | - | - | - | - | - | - |
| 512 | 2 | 8 | 20672 | 1.8864  | 3h 15m | 67G |
| 512 | 2 | 16 | 10336 | 1.837 | 2h | 72G |
| 512 | 4 | 32 | 10336 | 1.6349 | 2h | 76G |
| 512 | 8 | 32 | 12500 | 1.6068 | 2h 20m | 78G |
| 512 | 8 | 64 | 10336 | | | hbm explo |

#### The impact of Data size

| Data size | Epochs | Batch size | Total step | Loss | Time | Max. MeM |
| - | - | - | - | - | - | - |
| 10000 | 8 | 64 | 12500 | 0.4174  | 2h 7m | 53G |
| 20000 | 8 | 64 | 12500 | 0.7297  | 2h 20m | 54G  |
|  |  |  |  |  |  |  |
|  |  |  |  |  | |  |
|  |  |  |  | | |   |