<a href="https://colab.research.google.com/github/RayGone/SentimentAnalysis/blob/main/Experiments/Nepali_MaskedLM-BertL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Large Uncased - Nepali

In [None]:
!pip install transformers tokenizers datasets huggingface_hub --quiet

import gc
import os
import random

import datasets
import numpy as np
import tensorflow as tf
from transformers import set_seed

rand_seed = 9


def seed_everything(seed=0):
    """Apply one seed to every random number source used by this notebook.

    Args:
        seed: Integer handed to each seeding call below (default 0).
    """
    # Record the seed in the PYTHONHASHSEED environment variable (as a string).
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python's `random`, NumPy, Keras utilities, TensorFlow, Hugging Face Transformers.
    for apply_seed in (
        random.seed,
        np.random.seed,
        tf.keras.utils.set_random_seed,
        tf.random.set_seed,
        set_seed,
    ):
        apply_seed(seed)


seed_everything(rand_seed)

In [None]:
%%time
import datasets #huggingface datasets

print("Loading Dataset")
data1 = datasets.load_dataset("raygx/Nepali-Text-Corpus")
data1 = data1.filter(lambda x: x['text']!=None,num_proc=4)
print(data1)
data2 = datasets.load_dataset("cc100", lang="ne")
print(data2)

In [None]:
# One fixed seed for both the shuffle and the split, so the held-out rows do
# not depend on whatever state the global random generators happen to be in.
SPLIT_SEED = 999

# Merge the two train splits, shuffle, and hold out 0.1% of the rows for testing.
# NOTE(review): assumes both train splits expose the same columns — confirm.
data = (
    datasets.concatenate_datasets([data1['train'], data2['train']])
    .shuffle(seed=SPLIT_SEED)
    .train_test_split(test_size=0.001, seed=SPLIT_SEED)
)
gc.collect()
data

In [None]:
# #### BAG of words computation
# from tqdm.auto import tqdm

# bag_of_words = set()

# for i in tqdm(range(0,data['train'].num_rows,50000)):
#     j = i+100000
#     j = j if j<data['train'].num_rows else data['train'].num_rows

#     bag_of_words = set(list(bag_of_words)+(" ".join(data['train'].select(range(i,j))['text'])).split())


# len(bag_of_words)  ## 4966875 words in the bag

In [None]:
from transformers import BertTokenizerFast

print("Loading Tokenizer")
# Load the tokenizer files published under 'raygx/Nepali-GPT2-CausalLM'
# through the fast BERT tokenizer class.
tokenizer = BertTokenizerFast.from_pretrained('raygx/Nepali-GPT2-CausalLM')

# Register the special tokens; the call's return value is the cell output.
tokenizer.add_special_tokens(
    {
        'pad_token': '[PAD]',
        "eos_token": "[SEP]",
        "bos_token": "[CLS]",
        "mask_token": "[MASK]",
    }
)

In [None]:
def preprocess_function(rows):
    """Tokenize the 'text' column of a batch of rows.

    Args:
        rows: Batch that exposes a 'text' entry (this function is passed to a
            batched ``map`` call elsewhere in the notebook).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = rows['text']
    return tokenizer(texts)


In [None]:
%%time
print("Tokenizing the data")
tokenized_inputs = data.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=data["train"].column_names
)
# tokenized_inputs = tokenized_inputs.remove_columns(['token_type_ids'])
tokenized_inputs

In [None]:
block_size = 128
gc.collect()

def group_texts(rows):
    """Concatenate a batch of tokenized rows and re-cut it into fixed-size blocks.

    Args:
        rows: Mapping from column name (it must include "input_ids") to a list
            of per-example token lists, as passed by a batched ``map`` call.

    Returns:
        A dict with the same columns, each holding a list of chunks, plus a
        "labels" column that is a shallow copy of the "input_ids" chunks.
        Every chunk has exactly ``block_size`` tokens, except when the whole
        batch is shorter than ``block_size``: it is then returned once as a
        single shorter chunk. An empty batch yields empty lists.
    """
    # Flatten each column in one pass; sum(list_of_lists, []) is quadratic.
    concatenated_rows = {
        k: [token for seq in rows[k] for token in seq] for k in rows.keys()
    }
    total_length = len(concatenated_rows[list(rows.keys())[0]])
    # Largest multiple of block_size that fits (0 when the batch is too short).
    full_length = (total_length // block_size) * block_size

    # Split the evenly divisible prefix into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, full_length, block_size)]
        for k, t in concatenated_rows.items()
    }

    if total_length > full_length:
        # Leftover tokens: append the last block_size tokens so the tail is
        # kept at full length (it overlaps the previous chunk). For a batch
        # shorter than block_size this is the whole batch, emitted only once.
        for k in result.keys():
            result[k].append(concatenated_rows[k][-block_size:])

    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
%%time
print("Grouping Tokens to Model Input Size")
lm_data = tokenized_inputs.map(group_texts, batched=True, num_proc=4)
lm_data

In [None]:
from transformers import DataCollatorForLanguageModeling

print("Initializing Data Collator")
# From here on the padding token is the tokenizer's end-of-sequence token.
# NOTE(review): this overrides any pad token configured earlier — confirm that is intended.
tokenizer.pad_token = tokenizer.eos_token

# Masked-language-modelling collator (mlm_probability=0.25) returning TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.25,
    return_tensors="tf",
)

In [None]:
from transformers import TFAutoModelForMaskedLM, AutoConfig

# Start from the "bert-large-uncased" checkpoint and pass the tokenizer's
# special-token ids through to from_pretrained.
model = TFAutoModelForMaskedLM.from_pretrained(
    "bert-large-uncased",
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Resize the token embeddings to the number of entries in the tokenizer.
model.resize_token_embeddings(len(tokenizer))

print(model.config)
gc.collect()
model.summary()

In [None]:
from transformers import create_optimizer, AdamWeightDecay

# Constant learning rate of 2e-5 with a weight-decay rate of 0.0099.
optimizer = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.0099,
)
# compile() is given only the optimizer; no loss is passed explicitly.
model.compile(optimizer=optimizer)

In [None]:
print("Preparing Training and Testing sets to TRAIN the MODEL")
# Build both tf datasets with identical settings (batch size 16, shared
# collator); only the train split is shuffled. Train is built first, then test.
tf_train_set, tf_test_set = (
    model.prepare_tf_dataset(
        lm_data[split_name],
        shuffle=should_shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )
    for split_name, should_shuffle in (("train", True), ("test", False))
)
gc.collect()

## Training the Model

In [None]:
%%time

print("Training the model")
history = model.fit(x=tf_train_set,
          validation_data=tf_test_set,
          epochs=1)
model.save_pretrained("/LBert-nepali-maskedlm")
print(history.history)
gc.collect()

In [None]:
from seaborn import lineplot
from matplotlib import pyplot as plt

# Draw training and validation loss on one labelled axis. Markers are used so
# that a run with a single epoch (one point per curve) is still visible.
fig, ax = plt.subplots()
for metric_name in ('loss', 'val_loss'):
    metric_values = history.history[metric_name]
    epoch_numbers = list(range(1, len(metric_values) + 1))
    lineplot(x=epoch_numbers, y=metric_values, marker='o', label=metric_name, ax=ax)
ax.set(title='Masked-LM training history', xlabel='epoch', ylabel='loss')

# plt.show() renders the figure; a bare plt.plot() call draws nothing.
plt.show()

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token. Read it from the HF_TOKEN environment variable,
# or prompt for it without echoing when the variable is not set.
# The token that was previously embedded in this cell has been exposed and
# must be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Upload the trained model and its tokenizer to the same hub repository.
model.push_to_hub('raygx/BertL-Nepali')
tokenizer.push_to_hub('raygx/BertL-Nepali')

In [None]:
### Testing
from transformers import FillMaskPipeline


# Tokenizer smoke check; the result is not shown because this is not the
# cell's last expression.
tokenizer('नेपाली भान्सामा प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि')

# Pick the device from what TensorFlow reports instead of hard-coding GPU
# index 1, which does not exist on single-GPU or CPU-only runtimes:
# first GPU when one is visible, otherwise -1 (CPU).
device = 0 if tf.config.list_physical_devices('GPU') else -1
pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device)
pipeline('नेपाली [MASK] प्रयोग हुने सुगन्धित धनियाँ पाँच हजार वर्ष अघिदेखि')