In [None]:
folder_path = "/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik"

In [None]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from tokenizers import ByteLevelBPETokenizer


class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=50000, show_progress=True, inital_alphabet=ByteLevel.alphabet(), special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
        self.tokenizer.train(files = paths, trainer = trainer)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

In [None]:
from pathlib import Path 
paths = [str(x) for x in Path(folder_path).glob("**/*.txt")]
paths[0:5]

['/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/45632.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/46333.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/47044.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/46178.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/46682.txt']

In [None]:
tokenizer = BPE_token()# train the tokenizer model
tokenizer.bpe_train(paths)

In [None]:
# saving the tokenized data in our specified folder 
save_path = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2'
tokenizer.save_tokenizer(save_path)

# Model Initialization

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 6.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 60.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 63.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

In [None]:
# loading tokenizer from the save model path
save_path = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2'
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

0

In [None]:
# creating the configurations from which the model can be made
config = GPT2Config(
    vocab_size = tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [None]:
model = TFGPT2LMHeadModel(config)

In [None]:
from tqdm.auto import tqdm

In [None]:
folder_path = "/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik"

In [None]:
from pathlib import Path 
paths = [str(x) for x in Path(folder_path).glob("**/*.txt")]
paths[0:5]

['/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/45632.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/46333.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/47044.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/46178.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Nepali_Corpus/Nagarik/politics/46682.txt']

In [None]:
single_string = ''
for filename in tqdm(paths):
  with open(filename, "r", encoding='utf-8') as f:
    x = f.read()
    single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

  0%|          | 0/4481 [00:00<?, ?it/s]

In [None]:
examples = []
block_size = 100
BATCH_SIZE = 16
BUFFER_SIZE = 1000

for i in tqdm(range(0, len(string_tokenized) - block_size + 1, block_size)):
  examples.append(string_tokenized[i:i + block_size])

  0%|          | 0/85790 [00:00<?, ?it/s]

In [None]:
len(examples), print(examples[0])

[167, 170, 172, 159, 168, 158, 282, 490, 192, 170, 172, 159, 168, 158, 1855, 159, 204, 162, 168, 195, 162, 206, 159, 584, 159, 306, 162, 176, 159, 198, 200, 171, 160, 159, 1910, 159, 710, 192, 170, 163, 159, 168, 162, 161, 158, 554, 158, 528, 158, 167, 171, 1112, 159, 168, 158, 188, 158, 238, 171, 169, 174, 159, 368, 158, 410, 159, 168, 159, 160, 171, 692, 164, 161, 159, 176, 159, 160, 192, 165, 176, 171, 166, 164, 193, 159, 161, 164, 188, 159, 779, 165, 450, 158, 160, 171, 166, 158, 214, 200, 171]


(85790, None)

In [None]:
inputs, labels = [], []
for ex in tqdm(examples):
  inputs.append(ex[:-1])
  labels.append(ex[1:])

  0%|          | 0/85790 [00:00<?, ?it/s]

In [None]:
print(inputs[0])
print(labels[0])

[167, 170, 172, 159, 168, 158, 282, 490, 192, 170, 172, 159, 168, 158, 1855, 159, 204, 162, 168, 195, 162, 206, 159, 584, 159, 306, 162, 176, 159, 198, 200, 171, 160, 159, 1910, 159, 710, 192, 170, 163, 159, 168, 162, 161, 158, 554, 158, 528, 158, 167, 171, 1112, 159, 168, 158, 188, 158, 238, 171, 169, 174, 159, 368, 158, 410, 159, 168, 159, 160, 171, 692, 164, 161, 159, 176, 159, 160, 192, 165, 176, 171, 166, 164, 193, 159, 161, 164, 188, 159, 779, 165, 450, 158, 160, 171, 166, 158, 214, 200]
[170, 172, 159, 168, 158, 282, 490, 192, 170, 172, 159, 168, 158, 1855, 159, 204, 162, 168, 195, 162, 206, 159, 584, 159, 306, 162, 176, 159, 198, 200, 171, 160, 159, 1910, 159, 710, 192, 170, 163, 159, 168, 162, 161, 158, 554, 158, 528, 158, 167, 171, 1112, 159, 168, 158, 188, 158, 238, 171, 169, 174, 159, 368, 158, 410, 159, 168, 159, 160, 171, 692, 164, 161, 159, 176, 159, 160, 192, 165, 176, 171, 166, 164, 193, 159, 161, 164, 188, 159, 779, 165, 450, 158, 160, 171, 166, 158, 214, 200, 171]


In [None]:
print(inputs[1])
print(labels[1])

[178, 159, 160, 264, 158, 417, 165, 199, 179, 114, 153, 121, 167, 165, 176, 171, 163, 165, 280, 158, 160, 159, 167, 162, 163, 188, 159, 1361, 158, 214, 583, 264, 158, 234, 173, 170, 226, 159, 197, 326, 173, 159, 178, 158, 352, 184, 244, 159, 169, 195, 159, 339, 159, 204, 158, 393, 171, 169, 450, 158, 160, 171, 200, 171, 178, 159, 160, 174, 158, 160, 162, 203, 165, 191, 165, 205, 188, 159, 779, 165, 450, 158, 160, 171, 163, 158, 212, 158, 182, 162, 190, 170, 848, 158, 160, 362, 165, 188, 158]
[159, 160, 264, 158, 417, 165, 199, 179, 114, 153, 121, 167, 165, 176, 171, 163, 165, 280, 158, 160, 159, 167, 162, 163, 188, 159, 1361, 158, 214, 583, 264, 158, 234, 173, 170, 226, 159, 197, 326, 173, 159, 178, 158, 352, 184, 244, 159, 169, 195, 159, 339, 159, 204, 158, 393, 171, 169, 450, 158, 160, 171, 200, 171, 178, 159, 160, 174, 158, 160, 162, 203, 165, 191, 165, 205, 188, 159, 779, 165, 450, 158, 160, 171, 163, 158, 212, 158, 182, 162, 190, 170, 848, 158, 160, 362, 165, 188, 158, 238]


In [None]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Model Training

In [None]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [None]:
num_epoch = 2
history = model.fit(dataset, epochs=num_epoch)

Epoch 1/2
Epoch 2/2


# Save the model

In [None]:
import os

In [None]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model'
# creating directory if it is not present
if not os.path.exists(output_dir):
  os.mkdir(output_dir)
  model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# save model and model configs
model.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)
# save tokenizer
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model/added_tokens.json')

# Loading and testing the model

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 29.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [5]:
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

In [6]:
output_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model'

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(output_dir)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/Text Generation with GPT-2/Saved_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [37]:
def generateSequences(text, tokenizer, model):
  text = text.strip()
  text = text + " "
  # encoding the input text
  input_ids = tokenizer.encode(text, return_tensors='tf')
  # getting out output
  beam_output = model.generate(
    input_ids,
    max_length = 50,
    num_beams = 5,
    temperature = 0.7,
    no_repeat_ngram_size=2,
    num_return_sequences=5
  )

  for i in range(len(beam_output)):
    print(tokenizer.decode(beam_output[i]))
    print("")

In [38]:
text = "अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले "
generateSequences(text = text, tokenizer = tokenizer, model = model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले  सुन तस्करी

अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले  प्रहरी कार

अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले  स्थानीयल

अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले  स्थानीयक

अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले  प्रहरीलाई



In [41]:
text = "उज्यालो कोठामा सुत्ने "
generateSequences(text = text, tokenizer = tokenizer, model = model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


उज्यालो कोठामा सुत्ने  जिल्ला प्रहरी कार्यक्रमका लागि स्थानीय

उज्यालो कोठामा सुत्ने  जिल्लाका प्रहरी कार्यक्रम सञ्चालन गरिएको छ

उज्यालो कोठामा सुत्ने  जिल्लाका प्रहरी कार्यक्रम सञ्चालन गरेको छ

उज्यालो कोठामा सुत्ने  जिल्ला प्रहरी कार्यक्रमका लागि स्थानीयल

उज्यालो कोठामा सुत्ने  जिल्ला प्रहरी कार्यक्रमका लागि स्थानीयक

