In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

In [3]:
# Directory holding the saved tokenizer files.
save_path = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/oscar_gpt2/Tokenizer'
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
# Register the special tokens on the loaded tokenizer. The call is left as the
# cell's last expression so its return value is displayed.
tokenizer.add_special_tokens(dict(
  eos_token="</s>",
  bos_token="<s>",
  unk_token="<unk>",
  pad_token="<pad>",
  mask_token="<mask>",
))

0

In [4]:
# Build the configuration from which the (untrained) model is created.
config = GPT2Config(
    # len(tokenizer) counts the base vocabulary plus any added special tokens,
    # whereas tokenizer.vocab_size reports only the base vocabulary. Using the
    # full length keeps the embedding table large enough for every token id
    # the tokenizer can emit.
    vocab_size = len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [5]:
# Instantiate the GPT-2 LM-head model from the config object only; no
# pretrained checkpoint is loaded in this cell.
model = TFGPT2LMHeadModel(config)

In [6]:
# Directory that is searched for the corpus *.txt files.
oscar_data_path = "/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Oscar_nepali_dataset"

In [7]:
from pathlib import Path 
from tqdm.auto import tqdm
# Collect every .txt file in the corpus directory. The matches are sorted
# because glob order depends on the filesystem, and the files are later
# concatenated in this order; sorting makes the corpus reproducible.
paths = sorted(str(x) for x in Path(oscar_data_path).glob("*.txt"))

In [8]:
# Show the corpus files that were found.
paths

['/content/drive/MyDrive/Colab Notebooks/NLP/Language Modelling/Datasets/Oscar_nepali_dataset/ne_dedup.txt']

In [9]:
# Sanity check: tokenize one sample sentence a single time, then show the
# token ids and how many tokens the sentence was split into.
sample_text = "अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले "
sample_ids = tokenizer.encode(sample_text)
print(sample_ids)
print(len(sample_ids))

[2420, 231, 227, 230, 229, 225, 229, 232, 254, 226, 236, 237, 236, 232, 227, 226, 229, 2745, 238, 1355, 354, 226, 265, 238, 236, 225, 275, 226, 236, 225, 229, 238, 240, 226, 236, 225, 234, 237, 23571, 231, 186]
41


In [10]:
# Read every corpus file and append the tokenizer's EOS token after each one.
# The pieces are collected in a list and joined once at the end: repeated
# `+=` on a string this large creates extra full copies of the corpus.
corpus_parts = []
for filename in tqdm(paths):
  with open(filename, "r", encoding='utf-8') as f:
    corpus_parts.append(f.read())
  corpus_parts.append(tokenizer.eos_token)
single_string = ''.join(corpus_parts)

  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Size of the concatenated corpus in characters (EOS markers included).
len(single_string)

463664657

In [None]:
# Encode the whole concatenated corpus into token ids in a single call.
# NOTE(review): the input is the full corpus string, so this cell is expected
# to be slow and memory-hungry — confirm it fits the runtime before re-running.
string_tokenized = tokenizer.encode(single_string)

In [None]:
# Training hyper-parameters used by this and the following cells.
block_size = 100
BATCH_SIZE = 16
BUFFER_SIZE = 1000

# Cut the token stream into consecutive, non-overlapping windows of
# block_size tokens; a trailing partial window is discarded.
examples = [
  string_tokenized[start:start + block_size]
  for start in tqdm(range(0, len(string_tokenized) - block_size + 1, block_size))
]

In [None]:
# Next-token prediction pairs: the input is each window without its last
# token, the label is the same window shifted left by one position.
inputs = [window[:-1] for window in tqdm(examples)]
labels = [window[1:] for window in examples]

In [None]:
# Wrap the (input, label) pairs in a tf.data pipeline, shuffle with a bounded
# buffer, and emit fixed-size batches (an incomplete final batch is dropped).
dataset = (
  tf.data.Dataset.from_tensor_slices((inputs, labels))
  .shuffle(BUFFER_SIZE)
  .batch(BATCH_SIZE, drop_remainder=True)
)

# Model Training

In [None]:
# Optimizer: Adam with a small learning rate and gradient-norm clipping at 1.0.
optimizer = tf.keras.optimizers.Adam(
  learning_rate=3e-5,
  epsilon=1e-08,
  clipnorm=1.0,
)
# Loss: sparse categorical cross-entropy computed on raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metric reported during training.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# Compile: the loss is attached to the first model output only, followed by one
# None per transformer layer for the remaining outputs (presumably the
# per-layer past key/values — confirm against the transformers version in use).
model.compile(
  optimizer=optimizer,
  loss=[loss] + [None] * model.config.n_layer,
  metrics=[metric],
)

In [None]:
# Number of full passes over the dataset.
num_epoch = 2
# Train the compiled model; the returned object is kept in `history`.
history = model.fit(dataset, epochs=num_epoch)

In [None]:
import os
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/oscar_gpt2/transformer_model'
# Create the output directory together with any missing parents; this is a
# no-op when the directory already exists.
os.makedirs(output_dir, exist_ok=True)
# Resolve the object whose config is written below. This must happen on every
# run: it used to be assigned only when the directory had just been created,
# so re-running the cell with an existing directory raised NameError.
model_to_save = model.module if hasattr(model, 'module') else model
# This path is not used further in this cell; the name is kept so any later
# code relying on it still works.
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# save model and model configs
model.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)
# save tokenizer (left as the last expression so its return value is displayed)
tokenizer.save_pretrained(output_dir)

# Manual Evaluation

In [None]:
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
# Reload the tokenizer and model from the directory they were saved to, so the
# evaluation cells below work on the persisted artefacts.
output_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/Transformer Models/oscar_gpt2/transformer_model'
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(output_dir)

In [None]:
def generateSequences(text, tokenizer, model, max_length=50, num_beams=5,
                      temperature=0.7, no_repeat_ngram_size=2,
                      num_return_sequences=5):
  """Generate continuations of ``text`` with beam search and print them.

  The prompt is stripped of surrounding whitespace and a single trailing
  space is appended before it is encoded. Every returned sequence is decoded
  and printed, followed by an empty line.

  Args:
    text: Prompt string to continue.
    tokenizer: Object providing ``encode(text, return_tensors='tf')`` and
      ``decode(ids)``.
    model: Object providing ``generate(input_ids, **generation_kwargs)``.
    max_length: Forwarded to ``model.generate`` as ``max_length``.
    num_beams: Forwarded to ``model.generate`` as ``num_beams``.
    temperature: Forwarded to ``model.generate`` as ``temperature``.
      NOTE(review): whether this affects beam search without sampling
      depends on the transformers version — confirm.
    no_repeat_ngram_size: Forwarded to ``model.generate``.
    num_return_sequences: Forwarded to ``model.generate``.

  Returns:
    None. The decoded sequences are written to stdout.
  """
  # Normalise the prompt: no surrounding whitespace, exactly one trailing space.
  prompt = text.strip() + " "
  # encoding the input text
  input_ids = tokenizer.encode(prompt, return_tensors='tf')
  # The defaults reproduce the previously hard-coded generation settings.
  beam_output = model.generate(
    input_ids,
    max_length=max_length,
    num_beams=num_beams,
    temperature=temperature,
    no_repeat_ngram_size=no_repeat_ngram_size,
    num_return_sequences=num_return_sequences
  )

  # One decoded sequence per block, separated by a blank line.
  for sequence in beam_output:
    print(tokenizer.decode(sequence))
    print("")

In [None]:
# First evaluation prompt: the same Nepali sentence fragment used for the
# tokenizer sanity check earlier in the notebook.
text = "अमेरिकाको न्युयोर्क सहरकी ३२ वर्षीया ज्याकी स्यामुअलले "
generateSequences(text = text, tokenizer = tokenizer, model = model)

In [None]:
# Second evaluation prompt: another short Nepali sentence fragment.
text = "उज्यालो कोठामा सुत्ने "
generateSequences(text = text, tokenizer = tokenizer, model = model)