# Training a DistilGPT2 model from Hugging Face on a Nepali dataset for Causal Language Modelling
### The dataset used here is a mixture of the [Oscar Corpus](https://www.kaggle.com/datasets/hsebarp/oscar-corpus-nepali), [NepCov19Tweets dataset](https://www.kaggle.com/datasets/mathew11111/nepcov19tweets), [Nepali News dataset large](https://www.kaggle.com/datasets/ashokpant/nepali-news-dataset-large), [Nepali News dataset](https://www.kaggle.com/datasets/lotusacharya/nepalinewsdataset), [nepali-wikipedia-articles](https://www.kaggle.com/datasets/disisbig/nepali-wikipedia-articles), [urdu-nepali-parallel-corpus](https://www.kaggle.com/datasets/rtatman/urdunepali-parallel-corpus), [cc100](https://huggingface.co/datasets/cc100), [NepQuake15](github.com), [Sahitya](github.com) and health news datasets.
> ### I cleaned Oscar corpus (as much as possible) in this [Notebook](https://www.kaggle.com/code/reganmaharjan/cleaning-oscar-nepali-dataset).
> ### The dataset in the input is merged and taken from this [Notebook](https://www.kaggle.com/code/reganmaharjan/tokenizer-nepcov19tweets/notebook).
### The tokenizers are trained in this [Notebook](https://www.kaggle.com/code/reganmaharjan/nepali-tokenizers-4-transformers)

In [1]:
import os
import random
import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
import datasets
from transformers import set_seed

# Hugging Face Hub repo id of the model being trained. It is passed to
# from_pretrained(), save_pretrained() and pushToHub() further down.
model_id = 'raygx/distilGPTBhai'

# Single seed shared by seed_everything() and the dataset shuffle.
rand_seed = 9

def seed_everything(seed=0):
    """Apply one seed to every random number source this notebook touches.

    Seeds Python's `random`, NumPy, TensorFlow/Keras and the Hugging Face
    `transformers` helper, and exports the same value as PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to every seeding function (default 0).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)  # python environment

    seeders = (
        random.seed,                     # random
        np.random.seed,                  # numpy
        tf.keras.utils.set_random_seed,  # tensorflow / keras
        tf.random.set_seed,              # tensorflow
        set_seed,                        # hugging face transformers
    )
    for apply_seed in seeders:
        apply_seed(seed)


seed_everything(rand_seed)



In [2]:
def pushToHub(thing, repo=None, token=None):
    """Log in to the Hugging Face Hub and push `thing` to `repo`.

    Args:
        thing: Object exposing `push_to_hub(repo)`. Only objects whose type
            string contains 'datasets' or 'models' are accepted.
        repo: Target Hub repository id. Required.
        token: Hub access token. When omitted it is read from the `HF_TOKEN`
            environment variable, so no credential is stored in the notebook.

    Raises:
        Exception: If `repo` is missing, if `thing` is not recognised as a
            dataset or a model, or if no access token is available.
    """
    if not repo:
        raise Exception("Repo name not provided")

    # NOTE(review): the notebook also pushes a tokenizer through this function;
    # presumably it passes this check because its class lives under
    # `transformers.models.*` - confirm.
    thing_type = str(type(thing))
    if not ('datasets' in thing_type or 'models' in thing_type):
        raise Exception("Either a Dataset or a Model can be pushed to hub.\nConfirm what you are trying to push!")

    # SECURITY: an access token used to be hardcoded as the default argument.
    # That token is exposed in the notebook history and should be revoked.
    # Provide the token via the HF_TOKEN environment variable instead.
    if token is None:
        token = os.environ.get('HF_TOKEN')
    if not token:
        raise Exception("Hugging Face token not provided: pass token=... or set the HF_TOKEN environment variable")

    # login require python > 3.9
    from huggingface_hub import login
    login(token)

    thing.push_to_hub(repo)

In [3]:
%%time

## Load the prepared dataset from the Kaggle input mount.
data = datasets.load_from_disk('/kaggle/input/preparing-gpt-training-data/GPT_Training_Data')
## Save a copy to the working directory, because the input mount is read-only.
data.save_to_disk('training_data')

CPU times: user 965 ms, sys: 10.3 s, total: 11.3 s
Wall time: 1min 44s


In [4]:
## Reload the copy that was saved in the working directory.
data = datasets.load_from_disk('training_data')
## Merge the stored 'train' and 'test' splits into one Dataset; a new split is made further down.
data = datasets.concatenate_datasets([data['train'],data['test']])
data

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2903175
})

## Faced a bottleneck
> The dataset is too large: hardly a single epoch could be completed within the 12-hour session runtime provided by Kaggle.<br>
> So, the plan is to train on 1/16th of the data at a time.

*Note: Remember to change the `a, b` block indices that build the `chunk` range passed to `select()`.*

*Note: The memory exhaustion was caused by the batch size of the training and validation sets passed to model.fit().*


In [5]:
## Number of equally sized blocks the full dataset is divided into.
n_steps = 8

## Rows per block. Integer division: any remainder rows at the very end belong to no block.
data_block_size = data.num_rows // n_steps
## Train on block number `b` (1-based), i.e. rows [data_block_size*a, data_block_size*b).
a,b = 2,3
chunk = range(data_block_size*a,data_block_size*b)

print("Chunking data",chunk,"Batch:",b,"out of",n_steps)
data.cleanup_cache_files()
## Seed the split as well as the shuffle, so the train/test partition is
## reproducible and does not depend on the ambient RNG state.
data = (
    data.select(chunk)
        .shuffle(seed=rand_seed)
        .train_test_split(test_size=0.01, seed=rand_seed)
)
gc.collect()
data

Chunking data range(725792, 1088688) Batch: 3 out of 8


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 359267
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3629
    })
})

In [None]:
from transformers import AutoTokenizer

## Maximum sequence length assigned to the fallback tokenizer.
context_length = 512

print("Loading Tokenizer")
try:
    # Prefer the tokenizer stored under the repo of the model being trained.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate, and the fallback reason is visible.
    print(f"Could not load tokenizer from {model_id} "
          f"({type(load_error).__name__}: {load_error}); "
          "falling back to raygx/GPT2_Nepali_Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained('raygx/GPT2_Nepali_Tokenizer')
    tokenizer.add_special_tokens({'pad_token': '<pad>','unk_token':'<unk>'})
    tokenizer.model_max_length = context_length

tokenizer

In [None]:
from transformers import DataCollatorForLanguageModeling

print("Initializing Data Collator")
# mlm=False disables masked-language-model masking (this notebook trains a causal LM);
# return_tensors="tf" asks the collator for TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=False, 
                                                return_tensors="tf")

In [None]:
from transformers import TFAutoModelForCausalLM, AutoConfig

## To change the size of the embedding: N_EMBED must be evenly divisible by N_HEAD
print("Initializing Model")
try:
    # Resume from the model already stored under `model_id`, if it can be loaded.
    model = TFAutoModelForCausalLM.from_pretrained(model_id)
    print('Loading Pretrained')
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate. The reason is printed because silently
    # starting from a different checkpoint would be easy to miss.
    print(f"Could not load {model_id} "
          f"({type(load_error).__name__}: {load_error}); "
          "falling back to raygx/Nepali-DistilGPT2")
    model = TFAutoModelForCausalLM.from_pretrained("raygx/Nepali-DistilGPT2",
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            pad_token_id=tokenizer.pad_token_id)
    print('Loading Previous Checkpoint')

# Keep the embedding matrix in step with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
print(model.config)
model.summary()

In [None]:
print("Preparing Training and Testing sets to TRAIN the MODEL")
# Build both tf.data pipelines with identical settings; only the split name and
# the shuffle flag differ (train is shuffled, test is not).
tf_train_set, tf_test_set = (
    model.prepare_tf_dataset(
        data[split_name],
        shuffle=should_shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )
    for split_name, should_shuffle in (("train", True), ("test", False))
)

In [None]:
from transformers import create_optimizer

# Total optimizer steps for the schedule: the number of batches in the
# training set (the fit() call below runs a single epoch).
num_train_steps = len(tf_train_set)
# create_optimizer returns the optimizer together with its learning-rate schedule.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed here - presumably the model's built-in LM loss is used; TODO confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): Keras applies the global dtype policy to layers created AFTER it
# is set. The model is already built and compiled at this point, so this call
# likely has no effect on this run - confirm, and move it before the model is
# created if mixed precision is actually intended.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

## Training the Model

In [None]:
%%time

print("Training the model")
# Single epoch over the current data chunk, validating on the held-out split.
history = model.fit(x=tf_train_set, 
          validation_data=tf_test_set,
          verbose=2,
          epochs=1)

# Save the trained weights; the path is the repo id, so presumably this creates
# a local directory of that name - confirm.
model.save_pretrained(model_id)
print(history.history)

In [None]:
# from seaborn import lineplot
# from matplotlib import pyplot as plt

# lineplot(history.history['loss'])
# lineplot(history.history['val_loss'])

# plt.plot()

In [None]:
import math

# NOTE(review): math.exp() below requires a scalar; evaluate() presumably
# returns just the loss because compile() was given no metrics - confirm.
eval_loss = model.evaluate(tf_test_set)
# Perplexity is reported as exp(evaluation loss).
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
# Publish the trained model and its tokenizer to the same Hub repo.
pushToHub(model,repo=model_id)
pushToHub(tokenizer,repo=model_id)

In [None]:
import shutil

# Remove the working copy of the dataset. Use the same relative path it was
# saved and loaded under ('training_data'), so cleanup follows the data
# regardless of the working directory, and skip it when the directory is
# already gone so the cell can be re-run safely.
if os.path.isdir('training_data'):
    shutil.rmtree('training_data')