# DAPT 가사 - 일상어 - 감성어

### Install library

### Imports

In [3]:
import re
import os
import time
import random
import torch
import numpy as np
import pandas as pd

random.seed(2022)
torch.manual_seed(2022)
np.random.seed(2022)

In [4]:
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

2022-09-17 08:50:45.551250: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-17 08:50:45.795834: I tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: libtpu.so
I0917 08:50:45.899345334 1419266 ev_epoll1_linux.cc:121]     grpc epoll fd: 61
D0917 08:50:45.899362416 1419266 ev_posix.cc:141]            Using polling engine: epoll1
D0917 08:50:45.899392480 1419266 lb_policy_registry.cc:48]   registering LB policy factory for "grpclb"
D0917 08:50:45.899400610 1419266 lb_policy_registry.cc:48]   registering LB policy factory for "rls_experimental"
D0917 08:50:45.899408147 1419266 lb_policy_registry.cc:48]   registering LB policy factory for "priority_experimental"
D0917 08:50:45



[percpu.cc : 535] RAW: rseq syscall failed with errno 22 after membarrier sycall succeeded.


In [5]:
# using TPU through torch
import torch_xla
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.utils.serialization as xser
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

print(torch_xla.__version__)

1.12


### TPU setting

In [6]:
## Google cloud project에서 TPU 셋팅

# .py로 실행할 때 TPU 셋팅 명령어
#!export XRT_TPU_CONFIG="localservice;0;localhost:51011"

# 주피터 노트 또는 주피터 랩에서 실행할 때, TPU 셋팅 명령어
import os
os.environ['XRT_TPU_CONFIG'] = "localservice;0;localhost:51011"

In [7]:
device = xm.xla_device()

In [8]:
device

device(type='xla', index=1)

### Initialize model

In [9]:
# huggingface에서 사전에 가사-일상어 DAPT 완료된 모델 load
model = AutoModelForMaskedLM.from_pretrained("JUNEYEOB/DAPT_batch512_lyric_con")

tcmalloc: large alloc 1855389696 bytes == 0x93532000 @  0x7fbf60140680 0x7fbf60161824 0x7fbf60161b8a 0x7fbf041e432e 0x7fbf041cfda2 0x7fbf37593451 0x7fbf4dcb0409 0x7fbf4d9598d5 0x5f6929 0x5f74f6 0x50c383 0x570b26 0x569dba 0x5f6eb3 0x5f6082 0x56d2d5 0x569dba 0x5f6eb3 0x56cc1f 0x5f6cd6 0x56bacd 0x569dba 0x5f6eb3 0x50bc2c 0x5f6082 0x56d2d5 0x569dba 0x50bca0 0x570b26 0x569dba 0x6902a7
tcmalloc: large alloc 1855389696 bytes == 0x101ea2000 @  0x7fbf60140680 0x7fbf60161824 0x5fb391 0x7fbf4dcb0422 0x7fbf4d9598d5 0x5f6929 0x5f74f6 0x50c383 0x570b26 0x569dba 0x5f6eb3 0x5f6082 0x56d2d5 0x569dba 0x5f6eb3 0x56cc1f 0x5f6cd6 0x56bacd 0x569dba 0x5f6eb3 0x50bc2c 0x5f6082 0x56d2d5 0x569dba 0x50bca0 0x570b26 0x569dba 0x6902a7 0x6023c4 0x5c6730 0x56bacd


In [10]:
klue_roberta_large_parameters = model.num_parameters() / 1_000_000
print(f"'>>>  KLUE_RoBERTa_large number of parameters : {round(klue_roberta_large_parameters)}M'")

# freeze layers
for param in model.roberta.parameters():
    param.requires_grad = False

'>>>  KLUE_RoBERTa_large number of parameters : 337M'


### Load tokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [12]:
model.config

RobertaConfig {
  "_name_or_path": "JUNEYEOB/DAPT_batch512_lyric_con",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

In [13]:
tokenizer.tokenize

<bound method PreTrainedTokenizerFast.tokenize of PreTrainedTokenizerFast(name_or_path='klue/roberta-large', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

### Data load

In [14]:
data = pd.read_excel("sentimental_data.xlsx")
print(len(data))
data.head()

82610


Unnamed: 0,content,emotion,label
0,아내가 드디어 출산하게 되어서 정말 신이 나,기쁨,0
1,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야,긴장,1
2,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워,긴장,1
3,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야 너무 행복해,기쁨,0
4,이제 곧 은퇴할 시기가 되었어 내가 먼저 은퇴를 하고 육 개월 후에 남편도 은퇴를 ...,긴장,1


### Preprocessing and Count by label

In [15]:
data = data.dropna()
data.reset_index(drop=True,inplace = True)

In [16]:
data_pro = data[['content', 'label']]
data_count = data[['content', 'emotion']]

In [17]:
data_count.groupby(by=['emotion']).count()

Unnamed: 0_level_0,content
emotion,Unnamed: 1_level_1
기쁨,10975
긴장,19875
분노,20926
슬픔,23725
중립,5020
평화,2089


In [18]:
len(data_pro)

82610

### Save with utf-8

In [19]:
data_pro.to_csv('data_pro', encoding='utf-8')

### Load dataset

In [20]:
len(pd.read_csv('data_pro', sep= ',', index_col = 0))

82610

### Build the dataset

In [21]:
# default loading option = "utf-8"
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="data_pro",
    block_size=512
)



### Define the data collator

In [22]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Model to tpu

In [27]:
model = model.to(device)

### Train model

In [28]:
start = time.time()
batch_size = 512
num_train_epochs = 200
trained_model_path = f"test_mlm/DAPT_batch{batch_size}_lyric_con_sent"

#os.mkdir(trained_model_path)

os.environ["WANDB_DISABLED"] = "true"

'''
TrainingArguments parameters
https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py
'''
training_args = TrainingArguments(
    output_dir=trained_model_path,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,                 # total number of training epochs
    per_device_train_batch_size=batch_size,      # batch size per device during training
    save_total_limit=2,
    weight_decay = 0.01,
    tpu_num_cores = 85,
    seed = 2022,
    data_seed = 2022,
    dataloader_pin_memory = True,
    max_steps = 12_500,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()
print("time :", time.time() - start)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 82611
  Num Epochs = 78
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 12500


Step,Training Loss
500,2.7911
1000,2.7783
1500,2.778
2000,2.7633
2500,2.7615
3000,2.7536
3500,2.7523
4000,2.7442
4500,2.7376
5000,2.7413


Saving model checkpoint to test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-500
Configuration saved in test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-500/config.json
Model weights saved in test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-8500] due to args.save_total_limit
Saving model checkpoint to test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-1000
Configuration saved in test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-1000/config.json
Model weights saved in test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-9000] due to args.save_total_limit
Saving model checkpoint to test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-1500
Configuration saved in test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-1500/config.json
Model weights saved in test_mlm/DAPT_batch512_lyric_con_sent/checkpoint-1500/pyto

time : 20777.952058553696


### Save model

In [29]:
trainer.save_model(trained_model_path)

Saving model checkpoint to test_mlm/DAPT_batch512_lyric_con_sent
Configuration saved in test_mlm/DAPT_batch512_lyric_con_sent/config.json
Model weights saved in test_mlm/DAPT_batch512_lyric_con_sent/pytorch_model.bin


### Upload hugging face

In [30]:
MODEL_SAVE_REPO = 'DAPT_batch512_lyric_con_sent' # ex) 'my-bert-fine-tuned'
HUGGINGFACE_AUTO_TOKEN = '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' # https://huggingface.co/settings/token
 
## Push to huggingface-hub
model.push_to_hub(
    MODEL_SAVE_REPO, 
    use_temp_dir=True, 
    use_auth_token=HUGGINGFACE_AUTO_TOKEN
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/JUNEYEOB/DAPT_batch512_lyric_con_sent into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Configuration saved in /tmp/tmpr62e0sc7/config.json
tcmalloc: large alloc 2190458880 bytes == 0x175f454000 @  0x7fbf60140680 0x7fbf60160da2 0x5f8dfc 0x64f870 0x527012 0x5c64c0 0x5f4cc1 0x5f4f85 0x486664 0x539ccb 0x539bf9 0x66321b 0x53a821 0x53a01f 0x6632cc 0x53a164 0x53a01f 0x66321b 0x53a164 0x53a8d8 0x66134d 0x6615f0 0x505166 0x56bbfa 0x569dba 0x5f6eb3 0x56bacd 0x569dba 0x5f6eb3 0x56bacd 0x569dba
Model weights saved in /tmp/tmpr62e0sc7/pytorch_model.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin:   0%|          | 32.0k/1.73G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/JUNEYEOB/DAPT_batch512_lyric_con_sent
   3fe078a..0132ab7  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/JUNEYEOB/DAPT_batch512_lyric_con_sent/commit/0132ab7fcc5b8df7a94031e9b4604e471f65ad5b'