In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!git clone https://github.com/Taeksu-Kim/Pre-training.git

fatal: destination path 'Pre-training' already exists and is not an empty directory.


In [1]:
cd Pre-training/Electra

/content/Pre-training/Electra


In [4]:
!mkdir my_data

mkdir: cannot create directory ‘my_data’: File exists


In [5]:
# 전체적인 동작 확인을 위한 작은 데이터셋
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_20190620_small.txt

file="./my_data/wiki_20190620_small.txt"

awk: cannot open ./cookie (No such file or directory)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1323k  100 1323k    0     0  1360k      0 --:--:-- --:--:-- --:--:-- 1360k


In [2]:
import gc
import os
import numpy as np
import random
from tqdm import tqdm
from time import time

import torch
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LambdaLR

from transformers import ElectraModel, ElectraConfig, ElectraForMaskedLM, ElectraTokenizerFast

# custom_lib
from custom_tokenizer import Build_Tokenizer
from processor import TextDatasetForNextSentencePrediction
from model import custom_Electra

In [3]:
def set_seed(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

def set_cuda(deterministic=True):
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = deterministic
        torch.backends.cudnn.benchmark = not deterministic

set_seed(42)
set_cuda()

In [4]:
input_courpus_files = ['./my_data/wiki_20190620_small.txt']
tokenizer_save_dir = './Tokenizer'
model_max_input_len = 512

Build_Tokenizer(input_courpus_files,
                tokenizer_save_dir,
                vocab_size=35000,
                special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
                model_max_input_len=512,
                clean_text=True,
                handle_chinese_chars=True,
                strip_accents=False,
                lowercase=False)

vocab.txt is saved in ./Tokenizer
Electra Tokenizer is saved in ./Tokenizer


In [5]:
tokenizer = ElectraTokenizerFast.from_pretrained(tokenizer_save_dir)

In [6]:
model_max_input_len = 512

dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path='/content/Pre-training/Electra/my_data/wiki_20190620_small.txt',
    block_size=model_max_input_len,
    num_min_lines=5,
    overwrite_cache=False,
    full_seq_probability=0.65,
)

In [43]:
batch_size = 4
epochs = 100
learning_rate = 1e-5
weight_decay = 1e-2
early_stopping_patience = 10

use_scaler = False
use_scheduler = False

save_name = 'electra_pretraining'

distributed_enabled = False
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [44]:
train_dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=True)

In [45]:
for batch in train_dataloader:
  break

In [46]:
batch

{'input_ids': tensor([[    2, 25287, 24311,  ...,     0,     0,     0],
         [    2,  9100,  1066,  ...,     0,     0,     0],
         [    2,  1928,    33,  ...,     0,     0,     0],
         [    2,   707,  4638,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [47]:
from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining

hidden_size = 768
embedding_size = 768

# generator config
gen_config = ElectraConfig()
gen_config.hidden_size = hidden_size
gen_config.embedding_size = embedding_size
gen_config.num_attention_heads = 4
gen_config.vocab_size = tokenizer.vocab_size

# discriminator config
disc_config = ElectraConfig()
disc_config.hidden_size = hidden_size
disc_config.embedding_size = embedding_size
disc_config.num_attention_heads = 12
disc_config.vocab_size = tokenizer.vocab_size

In [48]:
def tie_weights(generator, discriminator):
    generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings
    generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings
    generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings

In [49]:
generator = ElectraForMaskedLM(gen_config)
discriminator = ElectraForPreTraining(disc_config)
tie_weights(generator, discriminator)

In [50]:
model = custom_Electra(
    generator,
    discriminator,
    num_tokens = tokenizer.vocab_size,
    mask_token_id = tokenizer.mask_token_id,
    pad_token_id = tokenizer.pad_token_id,
    mask_prob = 0.15,
    mask_ignore_token_ids = [tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']],
    random_token_prob = 0.0)

if distributed_enabled:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)

model.to(device)

custom_Electra(
  (generator): ElectraForMaskedLM(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(25722, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_features=768, out_feat

In [51]:
def get_params_without_weight_decay_ln(named_params, weight_decay):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay,
        },
        {
            'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    return optimizer_grouped_parameters

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    def lr_lambda(current_step):
        learning_rate = max(0.0, 1. - (float(current_step) / float(num_training_steps)))
        learning_rate *= min(1.0, float(current_step) / float(num_warmup_steps))
        return learning_rate
    return LambdaLR(optimizer, lr_lambda, last_epoch)

In [52]:
num_training_steps = len(train_dataloader) * epochs
num_warmup_steps = num_training_steps * 0.05

In [53]:
optimizer = torch.optim.AdamW(get_params_without_weight_decay_ln(model.named_parameters(), weight_decay=weight_decay), lr=learning_rate)

In [54]:
scaler = torch.cuda.amp.GradScaler(enabled=True)

In [55]:
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [56]:
# class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'

In [57]:
def train_step(batch, epoch, training):
    batch = {key: value.to(device) for key, value in batch.items()}

    if training is True:
        model.train()
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():

            loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model(**batch)

        if use_scaler:
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()

        else:
            loss.backward()
            optimizer.step()

        if use_scheduler:
            lr = scheduler.get_last_lr()[0]

        else:
            lr = optimizer.param_groups[0]["lr"]

        return loss, loss_mlm, loss_disc, acc_gen, acc_disc, round(lr, 10)

In [58]:
%%time
# train

loss_plot = []
lrs = []

check_list = []

best_loss = float('inf')

best_epoch = 0
patience = 0

step = 0

for epoch in range(epochs):
    gc.collect()
    total_loss = 0
    total_loss_mlm = 0
    total_loss_disc = 0
    total_acc_gen = 0
    total_acc_disc = 0

    
    tqdm_dataset = tqdm(enumerate(train_dataloader), total=train_dataloader.__len__())
    training = True
    for batch_idx, batch in tqdm_dataset:
        step += 1

        loss, loss_mlm, loss_disc, acc_gen, acc_disc, lr = train_step(batch, epoch, training)
        total_loss += loss
        total_loss_mlm += loss_mlm
        total_loss_disc += loss_disc
        total_acc_gen += acc_gen
        total_acc_disc += acc_disc

        
        tqdm_dataset.set_postfix({
            'step' : step,
            'Epoch': epoch + 1,
            GREEN + 'Loss' : '{:.4f}'.format(total_loss/(batch_idx+1)) + END,
            'mlm_loss' : '{:.4f}'.format(total_loss_mlm/(batch_idx+1)),
            'mlm_acc' : '{:.4f}'.format(total_acc_gen/(batch_idx+1)),
            'disc_loss' : '{:.4f}'.format(total_loss_disc/(batch_idx+1)),
            'disc_acc' : '{:.4f}'.format(total_acc_disc/(batch_idx+1)),
            'LR' : lr,
        })
            
    loss_plot.append(total_loss/(batch_idx+1))
    
    cur_loss = round(float((total_loss/(batch_idx+1)).detach().cpu()), 3)

    if cur_loss < best_loss:
        print(YELLOW + 'Best_loss is updated from {:>5} to {:>5} on epoch {}'.format(best_loss, cur_loss, epoch+1) + END)
        best_loss = cur_loss
        best_epoch = epoch+1
        torch.save(model.state_dict(), './'+save_name+'.ckpt')
        patience = 0
    else:
        patience += 1
    
    lrs.append(lr)
    
    if patience == early_stopping_patience:
        break

100%|██████████| 184/184 [01:35<00:00,  1.92it/s, step=184, Epoch=1, [92mLoss=40.7193[0m, mlm_loss=10.3590, mlm_acc=0.0000, disc_loss=0.6072, disc_acc=0.5017, LR=0]


[93mBest_loss is updated from   inf to 40.719 on epoch 1[0m


100%|██████████| 184/184 [01:35<00:00,  1.92it/s, step=368, Epoch=2, [92mLoss=40.7275[0m, mlm_loss=10.3662, mlm_acc=0.0000, disc_loss=0.6072, disc_acc=0.5015, LR=0]
100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=552, Epoch=3, [92mLoss=40.7169[0m, mlm_loss=10.3581, mlm_acc=0.0001, disc_loss=0.6072, disc_acc=0.5014, LR=0]


[93mBest_loss is updated from 40.719 to 40.717 on epoch 3[0m


100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=736, Epoch=4, [92mLoss=40.7099[0m, mlm_loss=10.3681, mlm_acc=0.0001, disc_loss=0.6068, disc_acc=0.5018, LR=0]


[93mBest_loss is updated from 40.717 to 40.71 on epoch 4[0m


100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=920, Epoch=5, [92mLoss=40.7161[0m, mlm_loss=10.3656, mlm_acc=0.0000, disc_loss=0.6070, disc_acc=0.5020, LR=0]
100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=1104, Epoch=6, [92mLoss=40.7225[0m, mlm_loss=10.3629, mlm_acc=0.0000, disc_loss=0.6072, disc_acc=0.5017, LR=0]
100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=1288, Epoch=7, [92mLoss=40.7172[0m, mlm_loss=10.3560, mlm_acc=0.0000, disc_loss=0.6072, disc_acc=0.5015, LR=0]
100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=1472, Epoch=8, [92mLoss=40.7142[0m, mlm_loss=10.3567, mlm_acc=0.0001, disc_loss=0.6071, disc_acc=0.5013, LR=0]
100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=1656, Epoch=9, [92mLoss=40.7138[0m, mlm_loss=10.3553, mlm_acc=0.0000, disc_loss=0.6072, disc_acc=0.5021, LR=0]
100%|██████████| 184/184 [01:35<00:00,  1.93it/s, step=1840, Epoch=10, [92mLoss=40.7176[0m, mlm_loss=10.3612, mlm_acc=0.0000, disc_loss=0.6071, disc_acc=0.5018

CPU times: user 16min 29s, sys: 6min 1s, total: 22min 30s
Wall time: 22min 23s





In [23]:
model.discriminator.save_pretrained('./electra_model')

In [24]:
mlm_model = ElectraForMaskedLM.from_pretrained('./electra_model')
mlm_model.eval()

inputs = tokenizer("한국의 수도는 [MASK]이다.", return_tensors="pt")

logits = mlm_model(**inputs)['logits']

tokenizer.decode(torch.argmax(logits[0,inputs.input_ids[0].tolist().index(tokenizer.mask_token_id)], dim=-1))

Some weights of the model checkpoint at ./electra_model were not used when initializing ElectraForMaskedLM: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at ./electra_model and are newly initialized: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', '

'자생한다'

In [25]:
disc_model = ElectraForPreTraining.from_pretrained('./electra_model')
disc_model.eval()

inputs = tokenizer("한국의 수도는 부산이다.", return_tensors="pt")

logits = disc_model(**inputs)['logits']
logits = torch.sigmoid(logits)

In [26]:
for i in range(len(inputs.input_ids[0].tolist())):
  print('{} : {}'.format(tokenizer.convert_ids_to_tokens(inputs.input_ids[0].tolist()[i]), round(float(logits[0][i]), 3)))

[CLS] : 0.417
한국의 : 0.415
수도는 : 0.412
부산 : 0.428
##이다 : 0.446
. : 0.399
[SEP] : 0.435
