<a href="https://colab.research.google.com/github/RayGone/SentimentAnalysis/blob/experiments/Experiments/LanguageModels/Nepali_BERT_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT (base-size) Masked Language Model - Nepali

In [1]:
# Install the Hugging Face libraries and the Kaggle CLI that later cells of this notebook call.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases than the
# ones that produced the saved outputs (the saved model config reports transformers 4.30.2).
!pip install transformers tokenizers datasets huggingface_hub --quiet
!pip install -q kaggle

import numpy as np
import random
import os
import tensorflow as tf
from transformers import set_seed
import datasets
import gc

rand_seed = 9

def seed_everything(seed=0):
    """Apply one seed to every random-number source this notebook uses.

    Exports ``PYTHONHASHSEED`` and then calls, with the same ``seed``, the
    seeding functions of Python's ``random``, NumPy, TensorFlow/Keras and
    Hugging Face ``transformers``.

    Args:
        seed: Integer seed handed to each library (defaults to 0).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)  # python environment

    seeders = (
        random.seed,                      # random
        np.random.seed,                   # numpy
        tf.keras.utils.set_random_seed,   # tensorflow (keras helper)
        tf.random.set_seed,               # tensorflow
        set_seed,                         # hugging_face transformer
    )
    for apply_seed in seeders:
        apply_seed(seed)

seed_everything(rand_seed)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:
def pushToHub(thing,repo = None,token = None):
    """Log in to the Hugging Face Hub and push ``thing`` to ``repo``.

    SECURITY: the access token must never be hard-coded in the notebook. Any
    token that was previously committed in this file should be revoked.

    Args:
        thing: Object to upload. The string form of its type must contain
            'datasets', 'models' or 'token', and it must provide
            ``push_to_hub``.
        repo: Hub repository id to push to. Required.
        token: Hugging Face access token. When omitted, the ``HF_TOKEN``
            environment variable is used; if that is unset too, the token is
            requested interactively (input is not echoed).

    Raises:
        ValueError: If ``repo`` is empty or not provided.
        TypeError: If ``thing`` does not look like a dataset, model or tokenizer.
    """
    if not repo:
        raise ValueError("Repo name not provided")

    thing_type = str(type(thing))
    if not any(marker in thing_type for marker in ('datasets', 'models', 'token')):
        raise TypeError("Either a Dataset or a Model or a PreTrainedTokenizer can be pushed to hub.\nConfirm what you are trying to push!")

    # Resolve the credential at call time instead of baking it into the source.
    import os
    from getpass import getpass
    token = token or os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

    # login require python > 3.9
    from huggingface_hub import login
    login(token)

    thing.push_to_hub(repo)

In [3]:
from google.colab import files

## kaggle.json
files.upload()

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
# ! kaggle datasets list

Saving kaggle.json to kaggle.json


In [7]:
%%time
# Download the output files of the Kaggle kernel `reganmaharjan/preparing-bert-training-data`
# into /content/ (the recorded run shows a BERT_Training_Data directory being written there).
!kaggle kernels output reganmaharjan/preparing-bert-training-data -p /content/

Output file downloaded to /content/BERT_Training_Data/dataset_dict.json
Output file downloaded to /content/BERT_Training_Data/test/dataset.arrow
Output file downloaded to /content/BERT_Training_Data/test/dataset_info.json
Output file downloaded to /content/BERT_Training_Data/test/state.json
Output file downloaded to /content/BERT_Training_Data/train/dataset.arrow
Output file downloaded to /content/BERT_Training_Data/train/dataset_info.json
Output file downloaded to /content/BERT_Training_Data/train/state.json
Kernel log downloaded to /content/preparing-bert-training-data.log 
CPU times: user 2.08 s, sys: 330 ms, total: 2.41 s
Wall time: 8min 36s


In [8]:
%%time
import datasets #huggingface datasets

# Load the dataset directory that was downloaded from Kaggle; `data` is used by
# the following cells via its 'train' and 'test' splits.
print("Loading Dataset")
data = datasets.load_from_disk('/content/BERT_Training_Data')
data

Loading Dataset
CPU times: user 41.6 ms, sys: 39.5 ms, total: 81.1 ms
Wall time: 96.5 ms


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 860233
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 891
    })
})

In [9]:
n_steps = 4  # number of chunks the training split is divided into

# Floor division gives the integer block size directly (no float round-trip).
data_block_size = data['train'].num_rows // n_steps
a,b = 0,1  # this run trains on chunk number `b` (1-based): rows [a*block, b*block)
chunk_start = data_block_size * a
# The final chunk runs to the end of the split so the remainder rows are not
# silently dropped when num_rows is not divisible by n_steps.
chunk_stop = data['train'].num_rows if b == n_steps else data_block_size * b
chunk = range(chunk_start, chunk_stop)

print("Chunking data",chunk,"Batch:",b,"out of",n_steps)
data.cleanup_cache_files()
# NOTE: this overwrites data['train'] with the selected chunk, so the cell is not
# idempotent - reload `data` from disk before re-running it with different a/b.
data['train'] = data['train'].select(chunk)
gc.collect()
data

Chunking data range(0, 215058) Batch: 1 out of 4


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 215058
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 891
    })
})

In [10]:
from transformers import AutoTokenizer

print("Loading Tokenizer")
model_id = 'raygx/BERT-Daju'
# Sequence length of the prepared examples, taken from the first test row.
context_length = len(data['test'][0]['input_ids'])

try:
    # Prefer the tokenizer stored alongside our own checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and the reason for the fallback is visible.
    print("Could not load tokenizer from", model_id, f"({type(load_error).__name__}); falling back")
    tokenizer = AutoTokenizer.from_pretrained('raygx/BERT_Nepali_Tokenizer')
    tokenizer.model_max_length = context_length

tokenizer

Loading Tokenizer


Downloading (…)okenizer_config.json:   0%|          | 0.00/146 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='raygx/BERT_Nepali_Tokenizer', vocab_size=50000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [19]:
from transformers import DataCollatorForLanguageModeling

print("Initializing Data Collator")

# Masked-language-modelling collator that emits TensorFlow tensors.
collator_options = dict(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.25,
    return_tensors="tf",
)
data_collator = DataCollatorForLanguageModeling(**collator_options)

Initializing Data Collator


In [20]:
from transformers import TFAutoModelForMaskedLM, AutoConfig

# Special-token ids passed to whichever checkpoint is loaded, so the model
# config agrees with the tokenizer.
special_token_ids = dict(
    sep_token_id=tokenizer.sep_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

try:  # In case of batched training
    model = TFAutoModelForMaskedLM.from_pretrained(model_id, **special_token_ids)
    print("Loading Self Checkpoint",model_id)
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and the reason for the fallback is visible.
    print("Could not load", model_id, f"({type(load_error).__name__})")
    print("Loading Another Checkpoint","Shushant/nepaliBERT")
    model = TFAutoModelForMaskedLM.from_pretrained("Shushant/nepaliBERT",
                                                   from_pt=True,
                                                   **special_token_ids)

# Match the embedding matrix to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
print(model.config)
gc.collect()
model.summary()

Loading Another Checkpoint Shushant/nepaliBERT


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForMaskedLM: ['cls.predictions.decoder.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


BertConfig {
  "_name_or_path": "Shushant/nepaliBERT",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 3,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

Model: "tf_bert_for_masked_lm_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  123850752 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple  

In [21]:
print("Preparing Training and Testing sets to TRAIN the MODEL")

# Settings shared by both splits; only the shuffling differs between them.
tf_dataset_options = {"batch_size": 16, "collate_fn": data_collator}

tf_train_set = model.prepare_tf_dataset(data["train"], shuffle=True, **tf_dataset_options)
tf_test_set = model.prepare_tf_dataset(data["test"], shuffle=False, **tf_dataset_options)

gc.collect()

Preparing Training and Testing sets to TRAIN the MODEL


483

In [22]:
from transformers import create_optimizer

# The schedule length is the number of batches in tf_train_set, i.e. one pass
# over the selected training chunk.
num_train_steps = len(tf_train_set)
# create_optimizer returns both the optimizer and its learning-rate schedule;
# only the optimizer is used below.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile(). NOTE(review): presumably the model falls back
# to its own internal loss - confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): in this notebook's cell order the policy is set after the model
# has been loaded and compiled. Keras layers presumably read the dtype policy
# when they are constructed, so on a fresh kernel this line may not affect the
# already-built model - TODO confirm, and consider setting it before loading.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

## Training the Model

In [23]:
%%time

# Run a single training epoch on tf_train_set, validating against tf_test_set.
print("Training the model")
history = model.fit(x=tf_train_set,
          validation_data=tf_test_set,
          epochs=1)
# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root rather than a path relative to the working directory - confirm intended.
model.save_pretrained("/Bert-nepali-maskedlm")
# history.history holds the per-epoch 'loss' and 'val_loss' lists.
print(history.history)
gc.collect()

Training the model
{'loss': [7.089366436004639], 'val_loss': [6.859375]}
CPU times: user 43min 2s, sys: 13min 45s, total: 56min 48s
Wall time: 44min 14s


4

In [27]:
import math

# Perplexity is the exponential of the evaluation loss on the test set.
eval_loss = model.evaluate(tf_test_set)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

# Publish the model first, then its tokenizer, to the same Hub repository.
for artifact in (model, tokenizer):
    pushToHub(artifact, repo=model_id)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [28]:
### Testing
from transformers import FillMaskPipeline


# Tokenizes the full sentence; the result is neither stored nor displayed
# because this is not the cell's last expression.
tokenizer('नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि')
# NOTE(review): device=1 presumably selects the accelerator with index 1; on a
# single-GPU runtime index 0 would be expected - confirm this is intentional.
pipeline = FillMaskPipeline(model=model,tokenizer=tokenizer,device=1)
# Same sentence with its second word replaced by [MASK]; the cell output is the
# pipeline's ranked candidate fills.
pipeline('नेपाली [MASK] प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि')

[{'score': 0.0350341796875,
  'token': 666,
  'token_str': '।',
  'sequence': 'नेपाली । प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि'},
 {'score': 0.01560211181640625,
  'token': 33,
  'token_str': ',',
  'sequence': 'नेपाली, प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि'},
 {'score': 0.0147705078125,
  'token': 609,
  'token_str': 'छ',
  'sequence': 'नेपाली छ प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि'},
 {'score': 0.01119232177734375,
  'token': 629,
  'token_str': 'र',
  'sequence': 'नेपाली र प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि'},
 {'score': 0.01035308837890625,
  'token': 35,
  'token_str': '.',
  'sequence': 'नेपाली. प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि'}]