File Source: [Kaggle Notebook](https://www.kaggle.com/code/reganmaharjan/preparing-gpt-training-data)

In [None]:
import os
import numpy as np # linear algebra,
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv),
import tensorflow as tf

import gc

### Loading Dataset

In [None]:
%%time

import datasets  # Hugging Face `datasets` library

# Fetch the Nepali text corpus from the Hugging Face Hub; the resulting
# object is indexed by split name below (e.g. data['train']).
print("Loading Dataset")
data = datasets.load_dataset(
    "raygx/Nepali-Extended-Text-Corpus",
)
# Bare expression: shows the dataset summary as the notebook cell output.
data

In [None]:
##### Bag of words computation
# Collect every distinct whitespace-separated word of the training split.
from tqdm.auto import tqdm

# Rows read from the dataset per iteration. The window width equals the
# stride so that each row is read and split exactly once.
chunk_rows = 50000

bag_of_words = set()
train_split = data['train']

for start in tqdm(range(0, train_split.num_rows, chunk_rows)):
    # Clamp the last window to the end of the split.
    stop = min(start + chunk_rows, train_split.num_rows)

    # Grow the set in place instead of rebuilding it from a list copy of
    # the whole accumulated vocabulary on every iteration.
    for text in train_split.select(range(start, stop))['text']:
        bag_of_words.update(text.split())


len(bag_of_words)  ## 4966875 words in the bag

### Loading Tokenizer

In [None]:
from transformers import AutoTokenizer

# Sequence length in tokens: used as the tokenizer's model_max_length here
# and as the chunk size when the corpus is split in preprocess_function.
context_length = 512

print("Loading Tokenizer")

tokenizer = AutoTokenizer.from_pretrained('raygx/GPT2_Nepali_Tokenizer')

# Register explicit padding and unknown tokens on the loaded tokenizer.
tokenizer.add_special_tokens(
    {
        'pad_token': '<pad>',
        'unk_token': '<unk>',
    }
)
tokenizer.model_max_length = context_length

# Bare expression: shows the tokenizer configuration as the cell output.
tokenizer

In [None]:
%%time

def preprocess_function(rows, *, tok=None, block_size=None):
    """Tokenize a batch of texts and cut the token stream into chunks.

    All texts of the batch are joined with the tokenizer's BOS token and
    tokenized as one string. Every field of the tokenizer output is then
    cut into consecutive chunks of ``block_size`` items. When the stream
    length is not a multiple of ``block_size``, one extra chunk holding the
    last ``block_size`` items is appended so that no trailing token is
    dropped; that chunk overlaps the previous one. A stream shorter than
    ``block_size`` yields a single, shorter chunk, and an empty stream
    yields no chunks.

    Args:
        rows: Batch mapping whose ``'text'`` entry is a list of strings.
        tok: Tokenizer to use. Defaults to the module-level ``tokenizer``.
        block_size: Chunk length. Defaults to the module-level
            ``context_length``.

    Returns:
        A dict mapping each tokenizer output key to a list of chunks.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    # Fall back to the notebook globals so data.map(preprocess_function)
    # keeps working without extra arguments.
    tok = tokenizer if tok is None else tok
    block_size = context_length if block_size is None else block_size
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    encoded = tok(tok.bos_token.join(rows['text']))

    # The first field's length is used for all fields (assumes every
    # tokenizer output field has one entry per token).
    total_length = len(next(iter(encoded.values()), ()))
    # Length covered by whole chunks; integer division avoids float rounding.
    full_length = (total_length // block_size) * block_size

    result = {}
    for key, values in encoded.items():
        # Plain slicing also works when there are zero whole chunks, where
        # np.array_split(..., 0) would raise ValueError.
        chunks = [
            values[start:start + block_size]
            for start in range(0, full_length, block_size)
        ]
        if total_length > full_length:
            # Leftover tokens: keep them via the last `block_size` items.
            chunks.append(values[-block_size:])
        result[key] = chunks

    return result

# Number of worker processes for the map call: the CPU count reported by the OS.
num_proc = os.cpu_count()

print("Tokenizing the data")

# Run preprocess_function over batches of 5000 rows and drop the original
# columns of the 'train' split from the result.
lm_data = data.map(
    preprocess_function,
    batched=True,
    batch_size=5000,
    num_proc=num_proc,
    remove_columns=data["train"].column_names,
)

print(lm_data)

# Persist the processed dataset under the 'GPT_Training_Data' directory.
lm_data.save_to_disk('GPT_Training_Data')