# Hindi Text Generator Using GPT-2

## Team - 15 
* _Praneeth G_
* _Rishik TS_
* _Prashanti B_

# Downloading the Dataset

In [None]:
import os

import tensorflow as tf
from gensim.corpora import WikiCorpus

#/content/drive/MyDrive/Colab Notebooks/Datafiles/GPT2_HINDI_TEXT_GENERATOR
# lang = 'bn'

def store(corpus, lang, base_path="/content/drive/MyDrive/Colab Notebooks/Datafiles/GPT2_HINDI_TEXT_GENERATOR"):
    """Write every article of *corpus* to its own UTF-8 text file.

    Files are named ``article_<n>.txt`` (numbered from 1) and are placed in
    the directory ``<base_path>/<lang>_corpus``.

    Args:
        corpus: Object exposing ``get_texts()``, which yields one iterable of
            string tokens per article.
        lang: Language code used to name the output directory.
        base_path: Directory under which the corpus directory is created.
    """
    store_path = os.path.join(base_path, '{}_corpus'.format(lang))
    # makedirs creates missing parent directories and tolerates an existing
    # target, so the function works on a fresh drive and on re-runs alike.
    os.makedirs(store_path, exist_ok=True)
    for file_idx, text in enumerate(corpus.get_texts(), start=1):
        current_file_path = os.path.join(store_path, 'article_{}.txt'.format(file_idx))
        with open(current_file_path, 'w', encoding='utf-8') as file:
            # tokens are already str; the file object handles the encoding
            file.write(' '.join(text))

def tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list:
    """Split *text* on whitespace and keep tokens within the length bounds.

    Args:
        text: Raw article text.
        token_min_len: Minimum token length (inclusive).
        token_max_len: Maximum token length (inclusive).
        lower: When true, the text is lower-cased before splitting.

    Returns:
        The list of tokens whose length lies in
        ``[token_min_len, token_max_len]``.
    """
    if lower:
        # honour the flag instead of silently ignoring it
        text = text.lower()
    return [token for token in text.split() if token_min_len <= len(token) <= token_max_len]

def run(lang):
    """Fetch the latest Wikipedia articles dump for *lang* and store its texts.

    The dump is obtained through ``tf.keras.utils.get_file`` (without
    extraction), wrapped in a ``WikiCorpus`` that uses ``tokenizer_func`` with
    lower-casing disabled, and handed to ``store``.

    Args:
        lang: Wikipedia language code, e.g. ``"hi"``.
    """
    fname='{}wiki-latest-pages-articles.xml.bz2'.format(lang)
    origin='https://dumps.wikimedia.org/{}wiki/latest/{}wiki-latest-pages-articles.xml.bz2'.format(lang,lang)
    dump_path = tf.keras.utils.get_file(fname=fname, origin=origin, untar=False, extract=False)
    wiki = WikiCorpus(dump_path, lower=False, tokenizer_func=tokenizer_func)
    store(wiki, lang)

In [None]:
# fetch the Hindi ("hi") Wikipedia dump and store its articles as text files
run("hi")

# Loading the Data

In [None]:
% cd ./drive/MyDrive/Colab Notebooks/Datafiles/GPT2_HINDI_TEXT_GENERATOR

In [None]:
!ls

Installing the requirements

In [None]:
!pip3 install tokenizers

In [None]:
!pip3 install transformers

Getting the article files

In [None]:
from pathlib import Path
import os
# the folder 'hi_corpus' contains all the article files; the listing is sorted
# because glob order is arbitrary, so that the same (at most) 300 articles are
# selected on every run
paths = sorted(str(x) for x in Path("./hi_corpus/").glob("**/*.txt"))[:300]

Cleaning the articles and storing the cleaned data

In [None]:
# Remove apostrophes and '=' characters from every selected article and write
# the result under ./cleaned_hi_corpus/, mirroring the path relative to
# ./hi_corpus/.
cleaned_root = Path("./cleaned_hi_corpus/")
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
    x = f.read()
  x = x.replace("'", "").replace("=", "")
  # derive the target from the path structure instead of slicing a fixed
  # number of characters off the file name
  target = cleaned_root / Path(filename).relative_to("hi_corpus")
  # the output folder (and any sub-folder) may not exist yet
  target.parent.mkdir(parents=True, exist_ok=True)
  with open(target, "w", encoding='utf-8') as f:
    f.write(x)

# Tokenising the Data and Initialising the Model

In [None]:
from tokenise import BPE_token
from pathlib import Path
import os
# folder in which the trained tokenizer is saved
save_path = 'cleaned_tokenized_data'
# every cleaned article found under ./cleaned_hi_corpus/
new_paths = list(map(str, Path("./cleaned_hi_corpus/").glob("**/*.txt")))
# create the tokenizer and train it on the cleaned articles
tokenizer = BPE_token()
tokenizer.bpe_train(new_paths)
# persist the trained tokenizer
tokenizer.save_tokenizer(save_path)

In [None]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
# register the special tokens used by the model
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# creating the configurations from which the model can be made
# len(tokenizer) also counts tokens added by add_special_tokens, whereas
# tokenizer.vocab_size covers only the base vocabulary; sizing the model from
# the full length keeps every special-token id inside the embedding table.
config = GPT2Config(
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = TFGPT2LMHeadModel(config)

Combining the whole data into a single string

In [None]:
# Read every cleaned article, drop apostrophes and '=' characters, and
# terminate each article with the tokenizer's end-of-sequence token.
articles = []
for filename in new_paths:
  with open(filename, "r", encoding='utf-8') as f:
    x = f.read()
  articles.append(x.replace("'", "").replace("=", "") + tokenizer.eos_token)
# join once instead of growing the string with += on every iteration
single_string = ''.join(articles)
# encode the whole corpus into one flat list of token ids
string_tokenized = tokenizer.encode(single_string)

Batching and making the dataset

In [None]:
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
# cut the token stream into consecutive, non-overlapping chunks of block_size
# tokens; trailing tokens that do not fill a whole chunk are left out
last_start = len(string_tokenized) - block_size + 1
examples = [
  string_tokenized[start:start + block_size]
  for start in range(0, last_start, block_size)
]
# inputs are each chunk without its last token, labels the same chunk
# without its first token (i.e. shifted by one position)
inputs = [chunk[:-1] for chunk in examples]
labels = [chunk[1:] for chunk in examples]
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
# shuffle, then group into fixed-size batches, discarding an incomplete last one
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Compiling and Training the Model

In [None]:
# Adam optimizer with gradient-norm clipping
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# loss computed directly on the logits
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric reported during training
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# the loss applies to the first model output only; one None per layer follows
# NOTE(review): presumably the remaining outputs are per-layer values that
# must not contribute to the loss - confirm for the transformers version used
per_output_losses = [loss] + [None] * model.config.n_layer
model.compile(optimizer=optimizer, loss=per_output_losses, metrics=[metric])

Fitting the dataset to the model

In [None]:
# number of passes over the dataset
num_epoch = 25
#new_dataset = list(dataset.as_numpy_iterator())
# train the model; the returned History object is kept for the plots below
history = model.fit(dataset, epochs=num_epoch)

In [None]:
# print a summary of the trained model
model.summary()

In [None]:
# save the trained model under ./Models/wiki_300_gpt2
model.save_pretrained("./Models/wiki_300_gpt2")

In [None]:
!ls ./Models/wiki_300_gpt2/

Plotting the evaluations

In [None]:
def _history_series(metrics, *names):
    """Return the series stored in *metrics* under the first matching name."""
    for name in names:
        if name in metrics:
            return metrics[name]
    raise KeyError("none of {} found in training history".format(names))


def _save_line_plot(plt, series, legend, legend_loc, title, ylabel, xlabel, path):
    """Draw each sequence in *series* as a line, save the figure to *path*, then show it."""
    for values in series:
        plt.plot(values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.legend(legend, loc=legend_loc)
    plt.savefig(path)
    plt.show()


def plot(history, out_dir="./Models/wiki_300_gpt2/"):
    """Plot the training curves of *history* and save them as PNG files.

    Three figures are produced and written to *out_dir*:
    ``model_acc.png`` (accuracy), ``model_loss.png`` (total loss and logits
    loss) and ``model_acc_loss.png`` (accuracy and logits loss together).

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (as returned by ``model.fit``).
        out_dir: Directory receiving the PNG files; created when missing.

    Raises:
        KeyError: If the expected metric names are absent from the history.
    """
    # imported here so the function does not depend on the order in which the
    # notebook cells were executed
    import matplotlib.pyplot as plt

    metrics = history.history
    # Keras prefixes metric names with the output name only when the model
    # exposes several outputs, so fall back to the unprefixed names.
    # NOTE(review): confirm the key names for the transformers version in use.
    accuracy = _history_series(metrics, 'logits_accuracy', 'accuracy')
    logits_loss = _history_series(metrics, 'logits_loss', 'loss')
    total_loss = metrics['loss']

    os.makedirs(out_dir, exist_ok=True)

    # this figure shows accuracy only, so the title no longer mentions loss
    _save_line_plot(plt, [accuracy], ['train'], 'upper left',
                    'model accuracy', 'accuracy', 'epoch',
                    os.path.join(out_dir, 'model_acc.png'))

    _save_line_plot(plt, [total_loss, logits_loss], ['loss', 'logits_loss'], 'upper right',
                    'model loss', 'loss', 'epoch',
                    os.path.join(out_dir, 'model_loss.png'))

    _save_line_plot(plt, [accuracy, logits_loss], ['logits_accuracy', 'logits_loss'], 'upper right',
                    'model evaluation', 'logits values', 'epochs',
                    os.path.join(out_dir, 'model_acc_loss.png'))

In [None]:
import matplotlib.pyplot as plt

# draw and save the training curves recorded during model.fit
plot(history)

# Generating the Text

In [None]:
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

In [None]:
def generate_text(text,model,max_length):
  """Continue *text* with *model* using beam search and return the result.

  The prompt is encoded with the module-level ``tokenizer``; three sequences
  are requested from ``model.generate`` but only the first one is decoded
  and returned.

  Args:
    text: Prompt to continue.
    model: Model exposing ``generate``.
    max_length: Value forwarded to ``generate`` as ``max_length``.

  Returns:
    The decoded first generated sequence.
  """
  prompt_ids = tokenizer.encode(text, return_tensors='tf')
  # NOTE(review): temperature usually only matters when sampling, and no
  # sampling flag is passed here - confirm whether it has any effect
  generation_options = dict(
    max_length=max_length,
    num_beams=8,
    temperature=0.8,
    no_repeat_ngram_size=3,
    num_return_sequences=3,
  )
  sequences = model.generate(prompt_ids, **generation_options)
  return tokenizer.decode(sequences[0])

# reload the saved tokenizer and the trained model from disk
output_dir = "cleaned_tokenized_data"
model_dir = "./Models/wiki_300_gpt2/"
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(model_dir)

text = "आंध्रप्रदेश की संस्कृति"
print("Given text: ",text)

output_text = generate_text(text,model,60)
# The generated text is Hindi, so write it with an explicit UTF-8 encoding
# rather than the platform default; the context manager closes the file even
# if the write fails.
with open("output.txt", "w", encoding="utf-8") as file1:
  file1.write(output_text)
print("Generated text: ",output_text)