# Data preprocessing and investigation

## Preprocess data (preprocess_data.py)
- Data contains the first 3GB of Oscar-Corpus-VN ('./data/oscar-text-3g.txt') and ~0.5GB Vietnews (in './data/train_tokenized')
- Pre-tokenize using nltk.word_tokenize and split first 3GB of Oscar-Corpus-VN (into './data/oscar-corpus')
- Rename (from *.txt.seg to *.txt) and Reformat Vietnews data to line-by-line dataset: title-summary-body text-captions (of image)

# Tokenizer

In [1]:
import os

from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import RobertaProcessing

tokenizer = Tokenizer(BPE())

tokenizer.normalizer = Sequence([
    NFKC()
])

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.post_processor = RobertaProcessing(sep=('</s>', 2), cls=('<s>', 0))
tokenizer.decoder = ByteLevelDecoder()

special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
# special_tokens += ["<unused{}>".format(i) for i in range(50)]
trainer = BpeTrainer(vocab_size=30005, special_tokens=special_tokens, min_frequency=10, show_progress=True)

In [2]:
train_files = [os.path.join('data/vietnews', file) for file in os.listdir('data/vietnews')]
train_files += [os.path.join('data/oscar-corpus', file) for file in sorted(os.listdir('data/oscar-corpus'))]
for file in train_files:
    if not os.path.isfile(file):
        raise IsADirectoryError(file)

tokenizer.train(train_files, trainer=trainer)






In [3]:
tokenizer.save('./save/tokenizer.json')

In [4]:
from underthesea import word_tokenize as underthesea_word_tokenize

def word_tokenize(text, format='text'):
    return underthesea_word_tokenize(text, format=format)

text = word_tokenize("Đây là một ví dụ khoong đảm bảo", format='list')
# text[3] = '<mask>'
# text[0] = '<mask>'
text = ' '.join([t.replace(' ', '_') for t in text])
encoding = tokenizer.encode(text)
print(encoding.tokens)
tokenizer.decode(encoding.ids)
# This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not
# So it has space at the beginning to avoid that.

2021-10-05 15:20:28.010151: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64
2021-10-05 15:20:28.010185: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


['<s>', 'ĠÄĲÃ¢y', 'ĠlÃł', 'Ġmá»Ļt', 'ĠvÃŃ', '_', 'dá»¥', 'Ġkho', 'ong', 'ĠÄĳáº£m', '_', 'báº£o', '</s>']


' Đây là một ví_dụ khoong đảm_bảo'

In [5]:
# convert to Roberta tokenizer
from transformers import RobertaTokenizerFast
tokenizer_ = RobertaTokenizerFast.from_pretrained('./save')
tokenizer_.save_pretrained('./save/viRoberta-l6-h384-word-cased/')

file ./save/config.json not found


('./save/viRoberta-l6-h384-word-cased/tokenizer_config.json',
 './save/viRoberta-l6-h384-word-cased/special_tokens_map.json',
 './save/viRoberta-l6-h384-word-cased/vocab.json',
 './save/viRoberta-l6-h384-word-cased/merges.txt',
 './save/viRoberta-l6-h384-word-cased/added_tokens.json',
 './save/viRoberta-l6-h384-word-cased/tokenizer.json')

# Test scripts

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from transformers import RobertaModel, RobertaTokenizerFast, RobertaTokenizer, RobertaConfig

tokenizer = RobertaTokenizer.from_pretrained('./save/viRoberta-l6-h384-word-cased')
tokenizer.model_max_length = 512
tokenizer.padding_side='right'

roberta_config = RobertaConfig()
roberta_config.max_position_embeddings = 514
roberta_config.hidden_act = "gelu"
roberta_config.hidden_size = 384
roberta_config.intermediate_size = 1536
roberta_config.num_hidden_layers = 6
roberta_config.vocab_size = tokenizer.vocab_size

roberta = RobertaModel(roberta_config)

In [4]:
from datasets import load_dataset

dataset = load_dataset('text', data_files={'train': [os.path.join('data/vietnews', file) for file in sorted(os.listdir('data/vietnews'))[:1000]],
                                           }, streaming=True)
# train_datasets = dataset['train'].shuffle(buffer_size=10000, seed=22)
# val_datasets = dataset['val'].shuffle(buffer_size=10000, seed=22)

Using custom data configuration default-26e92df873d26a89


Downloading and preparing dataset text/default to /home/aimenext/.cache/huggingface/datasets/text/default-26e92df873d26a89/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/aimenext/.cache/huggingface/datasets/text/default-26e92df873d26a89/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# from transformers.data.data_collator import DataCollatorForWholeWordMask
from transformers import TrainingArguments, Trainer

data_collator = DataCollatorForWholeWordMask(tokenizer, mlm_probability=0.15)

# approximate 5M sentences
training_arguments = TrainingArguments(
    output_dir='save/checkpoint-viRoberta-l6-h384-word-cased',
    do_train=True,
    # do_eval=True,
    # evaluation_strategy='steps',
    per_device_train_batch_size=8,
    gradient_accumulation_steps=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    num_train_epochs=10,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    save_strategy='epoch',
    save_total_limit=3,
    seed=22,
    
)


trainer = Trainer(
    model=roberta,
    args=training_arguments,
    data_collator=data_collator,
    train_dataset=dataset['train'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
%tensorboard --logdir save/checkpoint-viRoberta-l6-h384-word-cased/runs

UsageError: Line magic function `%tensorboard` not found.


# Test model

In [1]:
from transformers import pipeline, RobertaModel, RobertaTokenizer

model_name = 'save/checkpoint-viRoberta-l6-h384-word-cased'

fill_mask = pipeline(
    "fill-mask",
    model = model_name,
    tokenizer=model_name,
)

# The sun <mask>.
# =>


2021-10-11 08:53:12.177656: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64
2021-10-11 08:53:12.177704: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [9]:
from underthesea import word_tokenize as underthesea_word_tokenize

def word_tokenize(text, format='text'):
    return underthesea_word_tokenize(text, format=format)

text = word_tokenize("Xin chào, tôi không còn là sinh viên đại học Bách Khoa.", format='text')
text

'Xin chào , tôi không còn là sinh_viên đại_học Bách_Khoa .'

In [23]:
text = 'Xin chào , tôi không còn là sinh_viên đại_học Bách_Khoa .'
# text = 'Tôi là sinh_viên tại đại_học Công_nghệ .'
# fill_mask(text)
model(**tokenizer(text, return_tensors='pt'))

MaskedLMOutput(loss=None, logits=tensor([[[ -0.1150, -11.5632,  -0.4403,  ...,  -2.8943,  -2.4880,  -3.1276],
         [ -1.1978,  -6.0072,  -0.9945,  ...,  -0.7873,  -1.2897,  -1.5818],
         [ -1.7333,  -6.4730,  -0.9527,  ...,  -1.1052,  -1.8015,  -2.9654],
         ...,
         [ -1.4610,  -7.2221,  -1.3914,  ...,  -1.5793,  -1.0386,  -0.8753],
         [ -2.4627, -10.9411,  -1.2275,  ...,  -3.0386,  -2.1031,  -2.2256],
         [ -1.0328, -10.2561,   3.2921,  ...,  -2.7305,  -1.8566,  -2.8576]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [27]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, AutoTokenizer, AutoModelForMaskedLM

model_name = 'save/checkpoint-viRoberta-l6-h384-word-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

model.push_to_hub('Zayt/viRoberta-l6-h384-word-cased', use_auth_token='api_nqAApltLEQsjqZMYYScqBxiEVHbXuMOSlL')
tokenizer.push_to_hub('Zayt/viRoberta-l6-h384-word-cased', use_auth_token='api_nqAApltLEQsjqZMYYScqBxiEVHbXuMOSlL')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin:   0%|          | 32.0k/86.0M [00:00<?, ?B/s]

git: 'credential-wincred' is not a git command. See 'git --help'.
To https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased
   e3313d8..743adbc  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

git: 'credential-wincred' is not a git command. See 'git --help'.
To https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased
   743adbc..4015e10  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased/commit/4015e106bb424334e2558f9f4746eb84dddc221c'

# Compare Speed

In [20]:
import time
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

roberta_name = 'vinai/phobert-base'
roberta_small_name = 'save/vietnews-checkpoint-viRoberta-l6-h384-word-cased'

def get_all_speed(model, tokenizer, docs):
    start = time.time()
    for doc in tqdm(docs):
        inputs = tokenizer(doc, truncation=True, max_length=256, return_tensors='pt')
        model(**inputs)
    return time.time() - start

with open('data/vietnews_merged/0.txt', encoding='utf8') as f:
    docs = f.readlines(300000)

In [21]:
roberta_small = AutoModel.from_pretrained(roberta_small_name)
roberta_small_tokenizer = AutoTokenizer.from_pretrained(roberta_small_name)

get_all_speed(roberta_small, roberta_small_tokenizer, docs)

100%|██████████| 491/491 [00:15<00:00, 31.45it/s]


15.616876363754272

In [22]:
roberta = AutoModel.from_pretrained(roberta_name)
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_name)

get_all_speed(roberta, roberta_tokenizer, docs)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 491/491 [00:39<00:00, 12.57it/s]


39.04964780807495