# Data preprocessing and investigation

## Preprocess data (preprocess_data.py)
- The data consists of the first 3 GB of Oscar-Corpus-VN ('./data/oscar-text-3g.txt') and ~0.5 GB of Vietnews (in './data/train_tokenized').
- Pre-tokenize the first 3 GB of Oscar-Corpus-VN with nltk.word_tokenize and split it into files under './data/oscar-corpus'.
- Rename the Vietnews files (from *.txt.seg to *.txt) and reformat them into a line-by-line dataset: title, summary, body text, and image captions.

# Tokenizer

In [1]:
import os

from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import RobertaProcessing

# Special tokens in the order the rest of the pipeline expects: the
# post-processor below pins "<s>" to id 0 and "</s>" to id 2, which are
# exactly their positions in this list.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
# special_tokens += ["<unused{}>".format(i) for i in range(50)]

# Byte-level BPE tokenizer: NFKC normalization, byte-level pre-tokenization
# and decoding, and RoBERTa-style "<s> ... </s>" wrapping of every sequence.
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
tokenizer.post_processor = RobertaProcessing(
    sep=("</s>", special_tokens.index("</s>")),
    cls=("<s>", special_tokens.index("<s>")),
)

# Trainer configuration: target vocabulary size, and the minimum pair
# frequency required for a merge to be learned.
trainer = BpeTrainer(
    vocab_size=30005,
    special_tokens=special_tokens,
    min_frequency=10,
    show_progress=True,
)

In [2]:
# Collect the training files in a deterministic order. os.listdir() returns
# entries in arbitrary order, so each corpus directory is sorted; Vietnews
# files come first, followed by the Oscar corpus shards.
train_files = [
    os.path.join(corpus_dir, name)
    for corpus_dir in ('data/vietnews', 'data/oscar-corpus')
    for name in sorted(os.listdir(corpus_dir))
]

# Fail early on any entry that is not a regular file (e.g. a nested
# directory) instead of handing it to the tokenizer trainer.
for file in train_files:
    if not os.path.isfile(file):
        raise IsADirectoryError(file)

tokenizer.train(train_files, trainer=trainer)






In [3]:
# Make sure the output folder exists before writing the serialized tokenizer;
# nothing earlier in the notebook creates './save'.
os.makedirs('./save', exist_ok=True)
tokenizer.save('./save/tokenizer.json')

In [4]:
from underthesea import word_tokenize as underthesea_word_tokenize

def word_tokenize(text, format='text'):
    """Segment ``text`` into words using underthesea.

    Thin wrapper that forwards ``text`` and ``format`` unchanged to
    ``underthesea.word_tokenize`` and returns whatever it produces
    (this notebook uses ``format='list'`` and ``format='text'``).
    """
    segmented = underthesea_word_tokenize(text, format=format)
    return segmented

# Word-segment a sample sentence (it contains the deliberate typo "khoong").
words = word_tokenize("Đây là một ví dụ khoong đảm bảo", format='list')
# words[3] = '<mask>'
# words[0] = '<mask>'

# Join the syllables of each multi-syllable word with "_" and separate the
# words with single spaces, then run the result through the trained tokenizer.
text = ' '.join(word.replace(' ', '_') for word in words)
encoding = tokenizer.encode(text)
print(encoding.tokens)

# NOTE: this tokenizer treats a space as part of the token that follows it
# (a bit like sentencepiece), so a word would be encoded differently at the
# beginning of a sentence (no space) than anywhere else. To avoid that, a
# space is added at the beginning of the input, which is why the decoded
# text starts with a space.
tokenizer.decode(encoding.ids)

2021-10-05 15:20:28.010151: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64
2021-10-05 15:20:28.010185: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


['<s>', 'ĠÄĲÃ¢y', 'ĠlÃł', 'Ġmá»Ļt', 'ĠvÃŃ', '_', 'dá»¥', 'Ġkho', 'ong', 'ĠÄĳáº£m', '_', 'báº£o', '</s>']


' Đây là một ví_dụ khoong đảm_bảo'

In [5]:
# Re-load the trained tokenizer from './save' through the Transformers
# fast-tokenizer API and export it as a RoBERTa tokenizer directory.
from transformers import RobertaTokenizerFast

roberta_tokenizer_dir = './save/viRoberta-l6-h384-word-cased/'
tokenizer_ = RobertaTokenizerFast.from_pretrained('./save')
tokenizer_.save_pretrained(roberta_tokenizer_dir)

file ./save/config.json not found


('./save/viRoberta-l6-h384-word-cased/tokenizer_config.json',
 './save/viRoberta-l6-h384-word-cased/special_tokens_map.json',
 './save/viRoberta-l6-h384-word-cased/vocab.json',
 './save/viRoberta-l6-h384-word-cased/merges.txt',
 './save/viRoberta-l6-h384-word-cased/added_tokens.json',
 './save/viRoberta-l6-h384-word-cased/tokenizer.json')

# Test scripts

In [3]:
import os

# Restrict this process to GPU index 1. This only takes effect if it is set
# before CUDA is initialised in the process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from transformers import RobertaForMaskedLM, RobertaModel, RobertaTokenizerFast, RobertaTokenizer, RobertaConfig

# Tokenizer exported by the tokenizer section of this notebook.
tokenizer = RobertaTokenizer.from_pretrained('./save/viRoberta-l6-h384-word-cased')
tokenizer.model_max_length = 512
tokenizer.padding_side = 'right'

# Small RoBERTa: 6 layers, hidden size 384, feed-forward size 1536.
roberta_config = RobertaConfig(
    # len(tokenizer) also counts added tokens, whereas tokenizer.vocab_size
    # covers only the base vocabulary; the embedding matrix must cover both.
    vocab_size=len(tokenizer),
    # 512 usable positions plus the 2 extra slots RoBERTa's position ids
    # need, because they start after the padding index.
    max_position_embeddings=514,
    hidden_act="gelu",
    hidden_size=384,
    intermediate_size=1536,
    num_hidden_layers=6,
)

# Masked-LM pre-training needs the language-modelling head: the bare
# RobertaModel neither accepts `labels` nor returns a loss, and the
# fill-mask pipeline can only use a checkpoint that has an LM head.
roberta = RobertaForMaskedLM(roberta_config)

In [4]:
from datasets import load_dataset

# Use the first 1000 Vietnews files (in sorted name order) as a streaming,
# line-by-line text dataset with a single 'train' split.
vietnews_files = [
    os.path.join('data/vietnews', name)
    for name in sorted(os.listdir('data/vietnews'))[:1000]
]
dataset = load_dataset('text', data_files={'train': vietnews_files}, streaming=True)
# train_datasets = dataset['train'].shuffle(buffer_size=10000, seed=22)
# val_datasets = dataset['val'].shuffle(buffer_size=10000, seed=22)

Using custom data configuration default-26e92df873d26a89


Downloading and preparing dataset text/default to /home/aimenext/.cache/huggingface/datasets/text/default-26e92df873d26a89/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/aimenext/.cache/huggingface/datasets/text/default-26e92df873d26a89/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# DataCollatorForWholeWordMask is imported explicitly here; without this
# import the collator construction below raises NameError.
from transformers import DataCollatorForWholeWordMask, TrainingArguments, Trainer

# Whole-word masking collator with a 15% masking probability.
data_collator = DataCollatorForWholeWordMask(tokenizer, mlm_probability=0.15)

# NOTE(review): the collator works on tokenized examples, while the dataset
# loaded above yields raw text lines - confirm a tokenization step is applied
# before training.
# NOTE(review): the training dataset is streamed and has no length; Trainer
# may require `max_steps` in that case - confirm against the installed
# transformers version.
# NOTE(review): the model passed below must return a loss when given
# `labels` (i.e. have a masked-LM head) - confirm.

# approximate 5M sentences
training_arguments = TrainingArguments(
    output_dir='save/checkpoint-viRoberta-l6-h384-word-cased',
    do_train=True,
    # do_eval=True,
    # evaluation_strategy='steps',
    # Effective batch per device: 8 sequences x 64 accumulation steps = 512.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    num_train_epochs=10,
    # Linear schedule with warm-up over the first 10% of the steps.
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    # Checkpoint once per epoch and keep only the 3 most recent checkpoints.
    save_strategy='epoch',
    save_total_limit=3,
    seed=22,
)


trainer = Trainer(
    model=roberta,
    args=training_arguments,
    data_collator=data_collator,
    train_dataset=dataset['train'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
%tensorboard --logdir save/checkpoint-viRoberta-l6-h384-word-cased/runs

UsageError: Line magic function `%tensorboard` not found.


# Test model

In [11]:
from transformers import pipeline, RobertaModel, RobertaTokenizer

# Both the model weights and the tokenizer are loaded from the training
# output directory.
checkpoint_dir = "./save/checkpoint-viRoberta-l6-h384-word-cased"

# Fill-mask pipeline: call fill_mask("... <mask> ...") to get the candidate
# replacements for the masked position.
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=checkpoint_dir)

# The sun <mask>.
# =>


In [9]:
from underthesea import word_tokenize as underthesea_word_tokenize

def word_tokenize(text, format='text'):
    """Segment ``text`` into words using underthesea.

    Thin wrapper that forwards ``text`` and ``format`` unchanged to
    ``underthesea.word_tokenize`` and returns whatever it produces.
    """
    segmented = underthesea_word_tokenize(text, format=format)
    return segmented

# Word-segment a raw sentence; with format='text' the result is one string in
# which the syllables of a multi-syllable word are joined by "_".
raw_sentence = "Xin chào, tôi không còn là sinh viên đại học Bách Khoa."
text = word_tokenize(raw_sentence, format='text')
text

'Xin chào , tôi không còn là sinh_viên đại_học Bách_Khoa .'

In [12]:
# Word-segmented sentence with one word replaced by the mask token; the
# pipeline returns candidate fillers together with their scores.
text = 'Xin chào , tôi không còn là <mask> đại_học Bách_Khoa .'
predictions = fill_mask(text)
predictions

[{'sequence': 'Xin chào, tôi không còn là_ đại_học Bách_Khoa.',
  'score': 0.17441324889659882,
  'token': 67,
  'token_str': '_'},
 {'sequence': 'Xin chào, tôi không còn là, đại_học Bách_Khoa.',
  'score': 0.026248406618833542,
  'token': 253,
  'token_str': ','},
 {'sequence': 'Xin chào, tôi không còn là. đại_học Bách_Khoa.',
  'score': 0.0072890338487923145,
  'token': 264,
  'token_str': '.'},
 {'sequence': 'Xin chào, tôi không còn là của đại_học Bách_Khoa.',
  'score': 0.004515592474490404,
  'token': 307,
  'token_str': ' của'},
 {'sequence': 'Xin chào, tôi không còn là và đại_học Bách_Khoa.',
  'score': 0.004377501085400581,
  'token': 285,
  'token_str': ' và'}]

In [6]:
# Load the tokenizer stored with the Vietnews checkpoint and show how it
# splits the current `text` into sub-word tokens.
vietnews_checkpoint_dir = 'save/vietnews-checkpoint-viRoberta-l6-h384-word-cased'
tokenizer = RobertaTokenizer.from_pretrained(vietnews_checkpoint_dir)
tokenizer.tokenize(text)

['<mask>',
 'Ġhá»įc',
 '_',
 'phÃŃ',
 'Ġtáº¡i',
 'ĠBÃ¡ch',
 '_',
 'Khoa',
 'Ġcá»©',
 'ĠtÄĥng',
 'Ġ?']