# BERT

In [1]:
!pip install transformers datasets evaluate huggingface-hub --quiet

[0m

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import tensorflow as tf
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/nepali-tokenizers/Nepali_Wordpiece.tokenizer
/kaggle/input/nepali-tokenizers/Nepali_BPE.tokenizer


### Decided to use the BPE tokenizer for the following reasons
1. As observed in the [Tokenizer Training Notebook](https://www.kaggle.com/code/reganmaharjan/tokenizer-nepcov19tweets), BPE is faster than WordPiece.
2. As observed in [Testing Tokenizer](https://www.kaggle.com/code/reganmaharjan/testing-tokenizer-nepali/), WordPiece breaks words in more unusual places than BPE does. This observation is not exhaustive, however, and the tokenizer vocabularies were not scanned completely.
3. Surprisingly, the token ids for the tokens in the vocab are almost identical in both BPE and WordPiece.

In [35]:
%%time
import datasets #huggingface datasets

print("Loading Dataset")
# First corpus: the "raygx/Nepali-Text-Corpus" dataset from the Hugging Face Hub.
data1 = datasets.load_dataset("raygx/Nepali-Text-Corpus")
# Drop rows without text. `is not None` is the idiomatic identity check for None
# (an equality comparison can be overridden by the compared object).
data1 = data1.filter(lambda x: x['text'] is not None, num_proc=4)
print(data1)
# Second corpus: the "ne" language configuration of the "cc100" dataset.
data2 = datasets.load_dataset("cc100", lang="ne")
print(data2)

Loading Dataset


  0%|          | 0/1 [00:00<?, ?it/s]

     

#0:   0%|          | 0/474 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/474 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/474 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/474 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1895289
    })
})


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 12732810
    })
})
CPU times: user 1.51 s, sys: 416 ms, total: 1.92 s
Wall time: 9.27 s


In [36]:
# Merge the train splits of both corpora into a single dataset.
data = datasets.concatenate_datasets([data1['train'], data2['train']])
# Hold out 0.1% of the rows for validation. train_test_split shuffles by default, so a
# separate .shuffle() beforehand is redundant; passing the seed to the split itself is
# what makes the train/test partition reproducible across runs (it was unseeded before).
data = data.train_test_split(test_size=0.001, shuffle=True, seed=999)
gc.collect()
data

DatasetDict({
    train: Dataset({
        features: ['text', 'id'],
        num_rows: 14613470
    })
    test: Dataset({
        features: ['text', 'id'],
        num_rows: 14629
    })
})

In [37]:
# #### BAG of words computation
# from tqdm.auto import tqdm

# bag_of_words = set()

# for i in tqdm(range(0,data['train'].num_rows,50000)):
#     j = i+100000
#     j = j if j<data['train'].num_rows else data['train'].num_rows
    
#     bag_of_words = set(list(bag_of_words)+(" ".join(data['train'].select(range(i,j))['text'])).split())
    

# len(bag_of_words)  ## 4966875 words in the bag

  0%|          | 0/293 [00:00<?, ?it/s]

4966875

In [38]:
from transformers import BertTokenizerFast

# NOTE(review): the message below says PreTrainedTokenizerFast, but the tokenizer is
# loaded through BertTokenizerFast. The recorded output of this cell warns that the
# checkpoint's tokenizer class is 'PreTrainedTokenizerFast' and that the mismatch
# "may result in unexpected tokenization" - confirm the class choice is intended.
print("Initializing tokenizer as PreTrainedTokenizerFast")
tokenizer = BertTokenizerFast.from_pretrained('raygx/Nepali-GPT2-CausalLM')
# Register the BERT-style special tokens on the loaded tokenizer. This call is the last
# expression of the cell, so its return value is what the cell displays (0 in the
# recorded run - presumably no new tokens had to be added to the vocabulary).
tokenizer.add_special_tokens({'pad_token': '[PAD]',"eos_token": "[SEP]", "bos_token":"[CLS]", "mask_token":"[MASK]"})

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


Initializing tokenizer as PreTrainedTokenizerFast


0

In [39]:
def preprocess_function(rows):
    """Tokenize the ``text`` column of a batch.

    Args:
        rows: a batch indexable by column name; only ``rows['text']`` is read.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts.
    """
    texts = rows['text']
    encoded = tokenizer(texts)
    return encoded


In [40]:
%%time
print("Tokenizing the data")
# Run preprocess_function over both splits in batches, using 4 worker processes.
# All original columns of the train split are removed from the result, so only the
# columns produced by the tokenizer remain.
tokenized_inputs = data.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=data["train"].column_names
)
# Disabled step: would additionally drop the 'token_type_ids' column.
# tokenized_inputs = tokenized_inputs.remove_columns(['token_type_ids'])
# Last expression of the cell: display the resulting DatasetDict.
tokenized_inputs

Tokenizing the data
     

#0:   0%|          | 0/3654 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/3654 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3654 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/3654 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/4 [00:00<?, ?ba/s]

#1:   0%|          | 0/4 [00:00<?, ?ba/s]

#2:   0%|          | 0/4 [00:00<?, ?ba/s]

#3:   0%|          | 0/4 [00:00<?, ?ba/s]

CPU times: user 31.4 s, sys: 10.4 s, total: 41.8 s
Wall time: 16min 15s


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14613470
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14629
    })
})

In [41]:
# Number of tokens in every example fed to the model.
block_size = 128
gc.collect()

def group_texts(rows):
    """Pack a batch of tokenized rows into blocks of ``block_size`` tokens.

    Every column of ``rows`` (column name -> list of token lists) is flattened
    into one long sequence and cut into consecutive chunks of ``block_size``
    tokens. If tokens are left over after the last full chunk, one extra chunk
    made of the final ``block_size`` tokens of the sequence is appended; it
    overlaps the previous chunk instead of being shorter. A batch holding fewer
    than ``block_size`` tokens in total yields exactly one (short) chunk.

    Args:
        rows: batch mapping each column name to a list of per-row lists. All
            columns are expected to have matching lengths, and an
            ``"input_ids"`` column must be present.

    Returns:
        dict with the same columns, each a list of chunks, plus a ``"labels"``
        column that is a shallow copy of the ``"input_ids"`` chunk list.
    """
    # Local import keeps the function self-contained when it is shipped to
    # worker processes by ``map(..., num_proc=...)``.
    from itertools import chain

    # Flatten each column in linear time (``sum(lists, [])`` is quadratic).
    concatenated_rows = {k: list(chain.from_iterable(rows[k])) for k in rows.keys()}
    total_length = len(concatenated_rows[next(iter(concatenated_rows))])

    # Length covered by whole blocks; anything beyond it is the leftover tail.
    full_length = (total_length // block_size) * block_size
    has_tail = total_length > full_length

    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, full_length, block_size)]
        for k, t in concatenated_rows.items()
    }

    if has_tail:
        # Use block_size (not a hard-coded 128) so the tail follows the setting
        # above. When the batch is shorter than one block this is the only
        # chunk, so the short sequence is no longer emitted twice.
        for k in result.keys():
            result[k].append(concatenated_rows[k][-block_size:])

    result["labels"] = result["input_ids"].copy()
    return result

In [42]:
%%time
print("Grouping Tokens to Model Input Size")
# Apply group_texts batch-wise with 4 worker processes. Each batch of tokenized rows
# is re-cut into blocks, so the number of rows changes (the recorded output shows
# 3398000 train rows and 3342 test rows) and a 'labels' column is added.
lm_data = tokenized_inputs.map(group_texts, batched=True, num_proc=4)
# Last expression of the cell: display the grouped DatasetDict.
lm_data

Grouping Tokens to Model Input Size
     

#0:   0%|          | 0/3654 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/3654 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3654 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/3654 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/4 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/4 [00:00<?, ?ba/s]

#2:   0%|          | 0/4 [00:00<?, ?ba/s]

#3:   0%|          | 0/4 [00:00<?, ?ba/s]

CPU times: user 31.1 s, sys: 10.3 s, total: 41.4 s
Wall time: 25min 13s


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3398000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3342
    })
})

In [45]:
from transformers import DataCollatorForLanguageModeling

print("Initializing Data Collator")
# NOTE(review): this replaces the '[PAD]' pad token registered when the tokenizer was
# set up with the eos token ('[SEP]'); the printed model config accordingly shows
# pad_token_id == eos_token_id == 2. Confirm that sharing one id is intended.
tokenizer.pad_token = tokenizer.eos_token
# Collator for masked language modelling (mlm=True) with mlm_probability=0.25,
# producing TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=True, mlm_probability=0.25,
                                                return_tensors="tf")

Initializing Data Collator


In [44]:
from transformers import TFAutoModelForMaskedLM, AutoConfig

# Load the "bert-large-uncased" checkpoint as a TensorFlow masked-LM model and
# override the bos/eos/pad token ids in its config with those of the tokenizer.
# NOTE(review): the checkpoint was presumably pretrained with a different vocabulary
# than this tokenizer, so its embeddings may not correspond to these token ids -
# confirm that warm-starting from it (rather than training from a fresh config) is intended.
model = TFAutoModelForMaskedLM.from_pretrained("bert-large-uncased",
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id)

# Resize the embedding matrix to the tokenizer's length (the printed config reports
# vocab_size 50000 afterwards).
model.resize_token_embeddings(len(tokenizer))
print(model.config)
gc.collect()
# Last statement of the cell: print the Keras layer/parameter summary.
model.summary()

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


BertConfig {
  "_name_or_path": "bert-large-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  354037760 
                                                                 
 mlm___cls (T

In [47]:
from transformers import create_optimizer, AdamWeightDecay

# AdamWeightDecay with learning rate 2e-5 and weight decay rate 1e-4.
# (create_optimizer is imported above but not used in this cell.)
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.0001)
# No loss is passed to compile(); per the message recorded in the cell output, the
# model's internal loss computation is used as the loss.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [46]:
print("Preparing Training and Testing sets to TRAIN the MODEL")
# Build the training set (shuffled) and then the test set (unshuffled) with the same
# batch size and collator; the generator is consumed in order by the unpacking.
tf_train_set, tf_test_set = (
    model.prepare_tf_dataset(
        lm_data[split_name],
        shuffle=shuffle_split,
        batch_size=16,
        collate_fn=data_collator,
    )
    for split_name, shuffle_split in (("train", True), ("test", False))
)
gc.collect()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Preparing Training and Testing sets to TRAIN the MODEL


0

## Training the Model

In [None]:
%%time

print("Training the model")
# Train for 5 epochs, validating on the held-out set. The recorded output shows
# 212375 steps per epoch and a very long ETA for the first epoch.
history = model.fit(x=tf_train_set, 
          validation_data=tf_test_set,
          verbose=1,
          epochs=5)
# The model is written to the Kaggle working directory only after fit() returns.
# NOTE(review): nothing in this cell saves intermediate checkpoints - confirm that is
# acceptable for a run of this length.
model.save_pretrained("/kaggle/working/LBert-nepali-maskedlm")
# Metric history recorded by fit(); also read by the plotting cell below.
print(history.history)
gc.collect()

Training the model
Epoch 1/5
     3/212375 [..............................] - ETA: 2326:05:06 - loss: 12.2584

In [None]:
from seaborn import lineplot
from matplotlib import pyplot as plt

# Plot the per-epoch training and validation loss recorded by model.fit on one set of
# axes. Each curve is labelled so the two can be told apart, and plt.show() renders
# the figure (the previous bare plt.plot() drew nothing).
train_loss = history.history['loss']
val_loss = history.history['val_loss']

fig, ax = plt.subplots()
lineplot(x=list(range(1, len(train_loss) + 1)), y=train_loss, label="training loss", ax=ax)
lineplot(x=list(range(1, len(val_loss) + 1)), y=val_loss, label="validation loss", ax=ax)
ax.set(xlabel="epoch", ylabel="loss", title="Masked-LM loss per epoch")
ax.legend()

plt.show()

In [None]:
import os

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook. The token that used to be
# written here has been exposed and must be revoked in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable (for example populated
# from the platform's secret store); when it is unset, login() receives None and
# prompts for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))
# Upload the trained model and its tokenizer to the same Hub repository.
model.push_to_hub('raygx/BertL-Nepali')
tokenizer.push_to_hub('raygx/BertL-Nepali')

In [None]:
import shutil

# Delete the dataset directory under the Kaggle working directory.
# NOTE(review): no visible cell of this notebook creates this directory - presumably a
# leftover from an earlier version; ignore_errors=True is passed so that errors during
# removal (such as the directory not existing) do not raise.
dir_path = r"/kaggle/working/corpus"
shutil.rmtree(dir_path, ignore_errors=True)

In [None]:
### Testing
from transformers import FillMaskPipeline


# The result of this call is neither assigned nor the last expression of the cell,
# so it is discarded and not displayed.
tokenizer('नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि')
# Fill-mask pipeline around the trained model and tokenizer.
# NOTE(review): device=1 presumably selects the second accelerator - confirm that one
# exists in the runtime, otherwise a different device index is needed.
pipeline = FillMaskPipeline(model=model,tokenizer=tokenizer,device=1)
# Ask the model to fill the [MASK] position; the result is the cell's output.
pipeline('नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच [MASK] वर्ष अघिदेखि')