# 1. LOAD DATASET

In [4]:
import torch

from datasets import load_dataset
from tqdm.auto import tqdm, trange

dataset = load_dataset('bookcorpus', split='train')
dataset.set_format('pandas', ['text'])

Reusing dataset bookcorpus (C:\Users\tmuds\.cache\huggingface\datasets\bookcorpus\plain_text\1.0.0\af844be26c089fb64810e9f2cd841954fd8bd596d6ddd26326e4c70e2b8c96fc)


# 2. TRAIN TOKENIZER

In [4]:
from tokenizers.implementations import BertWordPieceTokenizer, ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Customize training
tokenizer.train(files='bookcorpus.txt', vocab_size=50000, min_frequency=2, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])


In [5]:
tokenizer.save_model("BonzLM")

['BonzLM\\vocab.json', 'BonzLM\\merges.txt']

# 3. INIT LANGUAGE MODELING

## Load tokenizer

In [5]:
from transformers import BertTokenizerFast, RobertaTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('./tokenzier/WordPiece-10k/', max_len=256)

In [6]:
import torch

def create_position_ids_from_input_ids(input_ids, padding_idx):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx

## Reformer Config

In [7]:
from transformers import ReformerConfig

config = ReformerConfig(attention_head_size=64,
                        num_attention_heads=6,
                        num_hidden_layers=6,
                        num_buckets=8,
                        num_hashes=4,
                        axial_pos_embds_dim=[64, 64],
                        hidden_size=128,
                        axial_pos_shape=[16, 16],
                        max_position_embeddings= 256,
                        bos_token_id=2,
                        eos_token_id=3,
                        pad_token_id=0,
                        feed_foward_size=512,
                        hash_seed=None,
                        local_attn_chunk_length=32,
                        lsh_attn_chunk_length=32,
                        vocab_size=tokenizer.vocab_size
                       )
config

ReformerConfig {
  "attention_head_size": 64,
  "attn_layers": [
    "local",
    "lsh",
    "local",
    "lsh",
    "local",
    "lsh"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": true,
  "axial_pos_embds_dim": [
    64,
    64
  ],
  "axial_pos_shape": [
    16,
    16
  ],
  "bos_token_id": 2,
  "chunk_size_lm_head": 0,
  "eos_token_id": 3,
  "feed_forward_size": 512,
  "feed_foward_size": 512,
  "hash_seed": null,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.05,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "local_attention_probs_dropout_prob": 0.05,
  "local_attn_chunk_length": 32,
  "local_num_chunks_after": 0,
  "local_num_chunks_before": 1,
  "lsh_attention_probs_dropout_prob": 0.0,
  "lsh_attn_chunk_length": 32,
  "lsh_num_chunks_after": 0,
  "lsh_num_chunks_before": 1,
  "max_position_embeddings": 256,
  "model_type": "reformer",
  "num_attention_heads": 6,
  "num_buckets": 8,
  "num_hashes": 4,
  "num_hidden_layers": 6,
  "pad_to

## Init Reformer Model

In [8]:
from transformers import ReformerForMaskedLM

model = ReformerForMaskedLM(config=config)
model.num_parameters()

5678096

### Benchmark

In [5]:
from transformers import PyTorchBenchmarkArguments, PyTorchBenchmark

new_config = config
new_config.num_buckets = 16
benchmark_args = PyTorchBenchmarkArguments(sequence_lengths=[256], batch_sizes=[16,32,64,128,256,512,1024], models=["Reformer"], multi_process=False)
benchmark = PyTorchBenchmark(configs=[new_config], args=benchmark_args,)
result = benchmark.run()

1 / 1


KeyboardInterrupt: 

## Preparing Dataset

In [9]:
import pandas as pd

df = pd.read_csv('bookcorpus.txt', names=['text'])

In [10]:
import torch
from torch.utils.data import Dataset

class BonzDataset(Dataset):
    def __init__(self, df, tokenizer):
        super(BonzDataset, self).__init__()
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        input_ids = self.tokenizer.encode(self.df['text'][idx], padding='max_length', truncation=True, return_tensors='pt').squeeze(0)
        for i in range(1,1000):
            concated_text = ' '.join(df['text'][idx: idx+i+1].tolist())
            temp = self.tokenizer.encode(concated_text, padding='max_length', truncation=True, return_tensors='pt').squeeze(0)
            if temp[-1] == self.tokenizer.pad_token_id:
                input_ids = temp
            else:
                break
        return input_ids

dataset = BonzDataset(df, tokenizer)
dataset[0]

tensor([   2, 1476, 2530,   17, 6739, 2491, 1633, 1497, 1476, 2497, 1495, 3121,
        3204, 8052, 4837, 7228, 1506, 1015, 1577, 2201, 6235, 1011, 7559, 7185,
        7228, 1506, 1015, 1577, 2201, 6235, 1011, 1597, 5411, 8684,   18, 1547,
        1027, 1010,   30, 3668, 1033, 1042, 1033, 9398, 1044, 1034, 1032, 1547,
        1027, 1010,   17, 3601,   30,   29, 8568,   17, 3668, 1033, 1042, 1033,
        9398, 1044, 1034, 1034, 1533, 1559, 2411,   16, 1684, 9343, 1550, 1484,
        1896, 2399, 4396, 1533, 1559, 4712, 2365,   21, 4602, 9564, 1013, 3543,
        1484, 1501, 2645,   16, 1814,   35,   51, 3310,   51, 1561,   43, 2228,
        2860, 1484, 1521, 2573,   18, 3338, 2102, 1013,   16, 1901, 4838, 1547,
        1563, 1476, 2093, 1502, 1018, 3194, 1903, 1484, 3222,   18, 1968,   43,
        2161, 2802, 2909,   16, 1476, 2321, 1839, 2476, 3365, 1649, 2338,   18,
        1968,   43, 2093, 1839, 1649, 3075, 2547, 1014, 1778, 2247, 1638, 1502,
        4415, 1599, 3182, 4287, 1623, 15

## Defining DataCollator & Trainer

In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

In [13]:
from transformers import Trainer, TrainingArguments

BATCH_SIZE = 256
MAX_STEP = (100000 * 256) // BATCH_SIZE
WARMUP_STEP = (MAX_STEP*0.05) // 1
LEARNING_RATE = 5e-5

training_args = TrainingArguments(
    output_dir="./BonzLM",
    overwrite_output_dir=True,
    # Training step
    max_steps=MAX_STEP,
    warmup_steps=WARMUP_STEP,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=10000,
    save_total_limit=2,
    # mixed precision
    fp16=True,
    fp16_opt_level='O2',
    seed=42,
    # Learning rate setup
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    max_grad_norm=0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

In [None]:
%%time
trainer.train()

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
./BonzLM


wandb: Currently logged in as: duyduc1110 (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.11 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0




Step,Training Loss
500,9.264078
1000,9.264594
1500,9.264578
2000,9.264437
2500,9.264188
3000,9.264375
3500,9.264391
4000,9.264297
4500,9.264188
5000,9.264328


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


In [20]:
trainer.save_model('test2')

In [19]:
trainer.tokenizer = tokenizer

# 4. TEST MODEL WITH MASK TOKEN

In [31]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./test2",
    tokenizer=tokenizer
)

In [34]:
fill_mask("Today is a <mask> day.")

[{'sequence': '<s>Today is a`` day.</s>',
  'score': 0.2213928997516632,
  'token': 308,
  'token_str': '``'},
 {'sequence': '<s>Today is a very day.</s>',
  'score': 0.0527605339884758,
  'token': 832,
  'token_str': 'Ġvery'},
 {'sequence': '<s>Today is athis day.</s>',
  'score': 0.03341570124030113,
  'token': 954,
  'token_str': 'this'},
 {'sequence': '<s>Today is awhat day.</s>',
  'score': 0.0221271850168705,
  'token': 780,
  'token_str': 'what'},
 {'sequence': '<s>Today is athe day.</s>',
  'score': 0.01886608637869358,
  'token': 350,
  'token_str': 'the'}]

In [43]:
%env WANDB_PROJECT=huggingface-demo
%env GLUE_DIR=glue_data
%env TASK_NAME=MRPC

!python run_glue.py \
  --model_name_or_path ./test2 \
  --task_name MRPC \
  --do_train \
  --do_eval \
  --max_seq_length 256 \
  --per_device_train_batch_size 32 \
  --learning_rate 2e-4 \
  --num_train_epochs 3 \
  --output_dir /tmp/$TASK_NAME/ \
  --overwrite_output_dir \
  --logging_steps 50

env: WANDB_PROJECT=huggingface-demo
env: GLUE_DIR=glue_data
env: TASK_NAME=MRPC


11/24/2020 14:30:01 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/tmp/$TASK_NAME/', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0002, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs\\Nov24_14-30-01_DS3', logging_first_step=False, logging_steps=50, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=0, past_index=-1, run_nam

In [40]:
trainer.model.save_pretrained('./test2')

In [42]:
tokenizer.save_pretrained('./test2')

('./test2\\tokenizer_config.json',
 './test2\\special_tokens_map.json',
 './test2\\vocab.json',
 './test2\\merges.txt',
 './test2\\added_tokens.json')