In [None]:
# Turn off Jedi-based tab completion for this IPython session.
%config Completer.use_jedi = False

In [None]:
from transformers import BertForMaskedLM, AutoTokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
import random
import pandas as pd
import os

In [None]:
# Seed Python's and PyTorch's random number generators so runs are repeatable.
seed = 2022
random.seed(seed)
torch.manual_seed(seed)

# Number of CUDA devices visible to PyTorch; 0 means a CPU-only machine.
gpus = torch.cuda.device_count()
if gpus > 0:
    torch.cuda.manual_seed_all(seed)

# Fall back to the CPU when no GPU is visible. Hard-coding 'cuda:0' makes the
# later `model.to(device)` call fail on CPU-only machines.
device = torch.device('cuda:0' if gpus > 0 else 'cpu')

In [None]:
# Directory holding the pretrained Finnish BERT checkpoint.
fin_bert_path = '../../tools/bert-base-finnish-uncased-v1/'

# Text files used for masked-LM pre-training (train split, then eval split).
train_pretrain_data_path, eval_pretrain_data_path = (
    '../../data/pretrain_data/pretrain_text_train.txt',
    '../../data/pretrain_data/pretrain_text_eval.txt',
)

In [None]:
# Load the tokenizer and the masked-LM model from the checkpoint at `fin_bert_path`.
tokenizer = AutoTokenizer.from_pretrained(fin_bert_path)

# Collator for the masked-LM objective; mlm_probability is the fraction of
# tokens selected for masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

model = BertForMaskedLM.from_pretrained(fin_bert_path)

# Deliberately NOT wrapped in torch.nn.DataParallel here: Trainer wraps the
# model itself when more than one GPU is visible. Wrapping it beforehand
# double-wraps the model and hides the PreTrainedModel interface
# (config, save_pretrained, ...) that Trainer relies on.
model = model.to(device)

In [None]:
# Build the train and eval datasets from the text files (train first, then
# eval), both with the same tokenizer and block_size=128.
train_set, val_set = (
    LineByLineTextDataset(tokenizer=tokenizer, file_path=corpus_path, block_size=128)
    for corpus_path in (train_pretrain_data_path, eval_pretrain_data_path)
)

In [None]:
# Hyper-parameters for the masked-LM training run.
training_args = TrainingArguments(
    output_dir='../../save_models/pretrain_weight/',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    # Evaluate and log every 20k optimisation steps.
    evaluation_strategy="steps",
    eval_steps=20000,
    logging_steps=20000,
    # load_best_model_at_end can only reload a checkpoint that was actually
    # saved, and transformers requires the save strategy to match the
    # evaluation strategy. The original save_strategy='no' contradicted that,
    # so checkpoints are now saved on the evaluation schedule.
    save_strategy="steps",
    save_steps=20000,
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
# Bundle the model, arguments, datasets and MLM collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
)

In [None]:
# Run training; the returned value is left as the cell's last expression so
# the notebook displays it.
trainer.train()

In [None]:
# NOTE(review): bare integer literal with no visible use in this notebook;
# looks like leftover scratch — confirm and delete this cell.
10148