# Training a DistilBERT model from Hugging Face on a Nepali dataset for Masked Language Modelling
### The dataset used here is a mixture of [Oscar Corpus](https://www.kaggle.com/datasets/hsebarp/oscar-corpus-nepali), [NepCov19Tweets dataset](https://www.kaggle.com/datasets/mathew11111/nepcov19tweets), [Nepali News dataset large](https://www.kaggle.com/datasets/ashokpant/nepali-news-dataset-large), [Nepali News dataset](https://www.kaggle.com/datasets/lotusacharya/nepalinewsdataset), [nepali-wikipedia-articles](https://www.kaggle.com/datasets/disisbig/nepali-wikipedia-articles), [urdu-nepali-parallel-corpus](https://www.kaggle.com/datasets/rtatman/urdunepali-parallel-corpus), [cc100](https://huggingface.co/datasets/cc100), [NepQuake15](github.com), [Sahitya](github.com) and health news datasets.
> ### I cleaned Oscar corpus (as much as possible) in this [Notebook](https://www.kaggle.com/code/reganmaharjan/cleaning-oscar-nepali-dataset).
> ### The dataset in the input is merged and taken from this [Notebook](https://www.kaggle.com/code/reganmaharjan/tokenizer-nepcov19tweets/notebook).
### The tokenizers were trained in this [Notebook](https://www.kaggle.com/code/reganmaharjan/nepali-tokenizers-4-transformers)


In [1]:
import os
import random
import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
import datasets
from transformers import set_seed

model_id = 'raygx/distilBERT-Nepali'
rand_seed = 9


def seed_everything(seed=0):
    """Seed every random number source this notebook relies on.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy, TensorFlow
            and the Hugging Face ``set_seed`` helper. It is also written,
            as a string, to the ``PYTHONHASHSEED`` environment variable.
    """
    seed_text = str(seed)

    # Python-level sources
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text

    # NumPy
    np.random.seed(seed)

    # TensorFlow / Keras
    tf.keras.utils.set_random_seed(seed)
    tf.random.set_seed(seed)

    # Hugging Face transformers
    set_seed(seed)


seed_everything(rand_seed)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [18]:
def pushToHub(thing, repo=None, token=None):
    """Push a dataset, model or tokenizer to the Hugging Face Hub.

    Args:
        thing: Object to upload. Its type name must contain 'datasets',
            'models' or 'token', and it must expose ``push_to_hub``.
        repo: Target repository id on the Hub. Required.
        token: Hugging Face access token. When omitted, the token is read
            from the ``HF_TOKEN`` environment variable. Access tokens must
            never be written into the notebook itself.

    Raises:
        ValueError: If ``repo`` is empty, or no token is supplied and
            ``HF_TOKEN`` is not set.
        TypeError: If ``thing`` is not a recognised dataset, model or
            tokenizer type.
    """
    if not repo:
        raise ValueError("Repo name not provided")

    thing_type = str(type(thing))
    if not any(marker in thing_type for marker in ('datasets', 'models', 'token')):
        raise TypeError(
            "Only a Dataset, a Model or a Tokenizer can be pushed to hub.\n"
            "Confirm what you are trying to push!"
        )

    # Resolve the credential before touching the network so a missing token
    # fails fast with an actionable message.
    token = token or os.environ.get("HF_TOKEN")
    if not token:
        raise ValueError(
            "No Hugging Face token provided. Pass token=... or set the "
            "HF_TOKEN environment variable (e.g. from a Kaggle secret)."
        )

    # login require python > 3.9
    from huggingface_hub import login
    login(token)

    thing.push_to_hub(repo)

In [3]:
%%time

## load from input
data = datasets.load_from_disk('/kaggle/input/preparing-bert-training-data/BERT_Training_Data')
## save to working directory - input is readonly
data.save_to_disk('training_data')

CPU times: user 891 ms, sys: 8.21 s, total: 9.1 s
Wall time: 1min 32s


In [4]:
## load data from working directory
data = datasets.load_from_disk('training_data')
data = datasets.concatenate_datasets([data['train'],data['test']])
data

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 861124
})

## Faced a bottleneck
> There was not enough memory to load all the data for MLM at once.

*Note: Remember to change the batch-number variables (`a`, `b`) that define the range passed to `select()`.*

*Note: The memory exhaustion was caused by the batch size of the training and validation sets used in `model.fit()`.*


In [5]:
n_steps = 3  # number of chunks the full dataset is divided into

# Floor division gives the (integer) number of rows per chunk
data_block_size = data.num_rows // n_steps

# Chunk boundaries in units of data_block_size: rows [block*a, block*b).
# `b` doubles as the 1-based batch number printed below.
a, b = 0, 1

chunk_start = data_block_size * a
# The final chunk also takes the remainder rows that floor division would
# otherwise leave out of every chunk.
chunk_stop = data.num_rows if b == n_steps else data_block_size * b
chunk = range(chunk_start, chunk_stop)

print("Chunking data",chunk,"Batch:",b,"out of",n_steps)
data.cleanup_cache_files()

# Seed both the shuffle and the split explicitly so the train/test partition
# does not depend on how much global RNG state earlier cells consumed.
data = (
    data.select(chunk)
    .shuffle(seed=rand_seed)
    .train_test_split(test_size=0.01, seed=rand_seed)
)
gc.collect()
data

Chunking data range(0, 287041) Batch: 1 out of 3


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 284170
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2871
    })
})

In [6]:
#### For testing purposes only -- keep this commented out when saving a new version
# data = datasets.DatasetDict({
#     "train":data['train'].select(
#         range(int(data['train'].num_rows/10))
#     ),
#     "test":data['test'].select(
#         range(int(data['test'].num_rows/10))
#     )
# })
# data

**Loading Tokenizers**

In [7]:
from transformers import AutoTokenizer

context_length = 512

print("Loading Tokenizer")
tokenizer = None

if a:
    # For any chunk after the first (a != 0), first try the tokenizer stored
    # under the model repository id.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
    except Exception as load_error:
        # Report the failure instead of hiding it; KeyboardInterrupt and
        # SystemExit are deliberately not caught here.
        print("Could not load tokenizer from", model_id, "-", load_error)

if tokenizer is None:
    # First chunk (a == 0) or failed load: start from the base Nepali
    # tokenizer and set its maximum sequence length.
    tokenizer = AutoTokenizer.from_pretrained('raygx/BERT_Nepali_Tokenizer')
    tokenizer.model_max_length = context_length

tokenizer

Loading Tokenizer


Downloading (…)okenizer_config.json:   0%|          | 0.00/146 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='raygx/BERT_Nepali_Tokenizer', vocab_size=50000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [8]:
from transformers import DataCollatorForLanguageModeling

print("Initializing Data Collator")

# Masked-language-modelling collator: mlm_probability is the masking
# probability handed to the collator, and batches come back as TF tensors.
collator_options = dict(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.25,
    return_tensors="tf",
)
data_collator = DataCollatorForLanguageModeling(**collator_options)

Initializing Data Collator


In [9]:
from transformers import TFAutoModelForMaskedLM, AutoConfig

## Note: when changing the embedding size, N_EMBED must be evenly divisible by N_HEAD
print("Initializing Model")
model = TFAutoModelForMaskedLM.from_pretrained(model_id)

# Resize the token embeddings to the tokenizer's vocabulary size
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

print(model.config)
model.summary()

Initializing Model


Downloading (…)lve/main/config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/360M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForMaskedLM.

All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at raygx/distilBERT-Nepali.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


DistilBertConfig {
  "_name_or_path": "raygx/distilBERT-Nepali",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "dim": 768,
  "dropout": 0.1,
  "eos_token_id": 2,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 2,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.30.2",
  "vocab_size": 50000
}

Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 81321984  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple           

In [10]:
print("Preparing Training and Testing sets to TRAIN the MODEL")

# Options shared by both splits; only the shuffle flag differs.
shared_options = dict(batch_size=16, collate_fn=data_collator)

tf_train_set = model.prepare_tf_dataset(data["train"], shuffle=True, **shared_options)
tf_test_set = model.prepare_tf_dataset(data["test"], shuffle=False, **shared_options)

gc.collect()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Preparing Training and Testing sets to TRAIN the MODEL


90

In [11]:
# Build the optimizer and learning-rate schedule, then compile the model.
from transformers import create_optimizer

# Number of batches in the prepared training set; used as the total number
# of training steps for the schedule.
num_train_steps = len(tf_train_set)
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile(); presumably the model's built-in loss is
# used -- TODO confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the policy is set after the model has been created and
# compiled. Keras dtype policies are normally picked up when layers are
# constructed -- confirm mixed precision is actually in effect here.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

## Training the Model

In [12]:
%%time

print("Training the model")
history = model.fit(x=tf_train_set, 
          validation_data=tf_test_set,
          #verbose=1,
          epochs=1)

model.save_pretrained(model_id)
print(history.history)

Training the model
{'loss': [4.8605146408081055], 'val_loss': [4.051042556762695]}
CPU times: user 3h 5min 2s, sys: 2min 27s, total: 3h 7min 29s
Wall time: 4h 4min 55s


In [13]:
# from seaborn import lineplot
# from matplotlib import pyplot as plt

# lineplot(history.history['loss'])
# lineplot(history.history['val_loss'])

# plt.plot()

In [14]:
import math

# Perplexity is reported as exp(loss) of the evaluation loss on the test set
eval_loss = model.evaluate(tf_test_set)
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 56.96


In [15]:
import shutil

# Remove the dataset directory from the working area; errors (such as the
# directory not existing) are ignored.
dir_path = r"/kaggle/working/training_data"
shutil.rmtree(path=dir_path, ignore_errors=True)

In [20]:
### Testing: quick fill-mask check of the trained model
from transformers import FillMaskPipeline

# Restrict the tokenizer's declared model inputs to these two names
tokenizer.model_input_names = ['input_ids','attention_mask']

masked_sentence = 'नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच [MASK] वर्ष अघिदेखि'
pipeline = FillMaskPipeline(model=model,tokenizer=tokenizer,device=1)
pipeline(masked_sentence)

[{'score': 0.24879856407642365,
  'token': 5493,
  'token_str': 'सय',
  'sequence': 'नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच सय वर्ष अघिदेखि'},
 {'score': 0.14424605667591095,
  'token': 5175,
  'token_str': 'वर्ष',
  'sequence': 'नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच वर्ष वर्ष अघिदेखि'},
 {'score': 0.09164553135633469,
  'token': 668,
  'token_str': '०',
  'sequence': 'नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच ० वर्ष अघिदेखि'},
 {'score': 0.061376817524433136,
  'token': 5375,
  'token_str': 'हजार',
  'sequence': 'नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि'},
 {'score': 0.030752407386898994,
  'token': 673,
  'token_str': '५',
  'sequence': 'नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच ५ वर्ष अघिदेखि'}]

In [21]:
# Upload the model first, then the tokenizer, to the same hub repository
for artifact in (model, tokenizer):
    pushToHub(artifact, repo=model_id)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
