<a href="https://colab.research.google.com/github/PrathamKumar125/NLP-Text-Generator/blob/master/NLP_RNN2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN Text Generator

In [3]:
%tensorflow_version 2.x
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [4]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [6]:
from PyPDF2 import PdfReader

def pdf_to_text(pdf_file, txt_file):
    """Extract the text of every page of a PDF and save it as one text file.

    Args:
        pdf_file: Path of the PDF to read (opened in binary mode).
        txt_file: Path of the text file to write; it is opened in 'w' mode
            with UTF-8 encoding, so an existing file is overwritten.
    """
    with open(pdf_file, 'rb') as pdf_handle:
        # Extract while the PDF handle is still open.
        page_texts = [page.extract_text() for page in PdfReader(pdf_handle).pages]

    # Pages are concatenated back to back, with no separator between them.
    with open(txt_file, 'w', encoding='utf-8') as txt_handle:
        txt_handle.write(''.join(page_texts))

# Provide the file paths (both are located under the mounted Google Drive folder)
pdf_file = "/content/drive/MyDrive/NLP/Constitution.pdf"
txt_file = '/content/drive/MyDrive/NLP/Constitution.txt'

# Convert PDF to text; this writes (or overwrites) txt_file
pdf_to_text(pdf_file, txt_file)


In [7]:
# Text corpus used for training; this is the same path the PDF conversion step writes to.
path_to_file="/content/drive/MyDrive/NLP/Constitution.txt"

In [8]:
# Read the corpus as bytes and decode it explicitly as UTF-8. The context manager
# closes the file handle as soon as the contents have been read.
with open(path_to_file,'rb') as corpus_file:
  text=corpus_file.read().decode(encoding='utf-8')
print("length of text: {} characters".format(len(text)))

length of text: 876554 characters


In [9]:
# Peek at the first 250 characters of the corpus.
print(text[:250])

 
 
 
 
 
 THE CONSTITUTION OF INDIA 
[As on       May , 2022] 
2022 
  
 
PREFACE 
 
This is the  fifth  pocket size edition of the Constitution of 
India in the diglot form. In this edition, the text of the 
Constitution of India has been brought u


In [10]:
# Distinct characters of the corpus in sorted order; a character's position is its integer id.
vocab=sorted(set(text))

# Lookup tables in both directions: id -> character as a NumPy array (so it can be
# indexed with a whole array of ids) and character -> id as a dict.
indx2char=np.array(vocab)
char2indx=dict(zip(vocab,range(len(vocab))))

def text_to_int(text):
  """Encode a string as a NumPy array of character ids.

  Each character is looked up in ``char2indx``; a character that is not a key
  of that dict raises ``KeyError``.
  """
  char_ids=[char2indx[ch] for ch in text]
  return np.array(char_ids)

# The whole corpus encoded as an array of character ids.
text_as_int=text_to_int(text)

In [11]:
# Sanity check: show the first 13 characters next to their integer encoding.
print("Text:",text[:13])
print("Encoded:",text_to_int(text[:13]))

Text:  
 
 
 
 
 TH
Encoded: [ 1  0  1  0  1  0  1  0  1  0  1 43 31]


In [12]:
def int_to_text(ints):
  """Decode a sequence of character ids back into a string.

  Args:
    ints: Ids to decode. If the object has a ``.numpy()`` method it is
      converted with it first; anything else is used as-is to index
      ``indx2char``.

  Returns:
    The characters of ``indx2char`` selected by ``ints``, joined into one string.
  """
  try:
    ints=ints.numpy()
  except AttributeError:
    # No .numpy() available (for example a NumPy array or a list): index with it directly.
    # Only AttributeError is ignored so that real failures and KeyboardInterrupt still surface.
    pass
  return ''.join(indx2char[ints])

# Round-trip check: decoding the first 13 ids should reproduce the first 13 characters of the corpus.
print(int_to_text(text_as_int[:13]))

 
 
 
 
 
 TH


In [13]:
# Length of each input sequence in characters; chunks are cut seq_length+1 long so that
# input and target can be offset by one character.
seq_length=100
# Number of whole (seq_length+1)-character chunks that fit in the corpus.
examples_per_epoch=len(text)//(seq_length+1)
# Dataset built by slicing the encoded corpus along its first axis.
char_dataset=tf.data.Dataset.from_tensor_slices(text_as_int)

In [14]:
# Group the character ids into chunks of seq_length+1, discarding the final incomplete chunk.
sequences=char_dataset.batch(seq_length+1,drop_remainder=True)

In [15]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair shifted by one position.

  The input is the chunk without its last element and the target is the chunk
  without its first element, so ``target[i]`` is the element following ``input[i]``.
  """
  return chunk[:-1],chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset=sequences.map(split_input_target)

In [16]:
# Print the first two (input, target) pairs decoded back to text.
for x,y in dataset.take(2):
  print("\n\nExample\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOutput")
  print(int_to_text(y))



Example

INPUT
 
 
 
 
 
 THE CONSTITUTION OF INDIA 
[As on       May , 2022] 
2022 
  
 
PREFACE 
 
This is the  f

Output

 
 
 
 
 THE CONSTITUTION OF INDIA 
[As on       May , 2022] 
2022 
  
 
PREFACE 
 
This is the  fi


Example

INPUT
fth  pocket size edition of the Constitution of 
India in the diglot form. In this edition, the text

Output
th  pocket size edition of the Constitution of 
India in the diglot form. In this edition, the text 


In [17]:
 # Training hyperparameters.
 BATCH_SIZE=128
 VOCAB_SIZE=len(vocab)  # one output class per distinct character
 EMBEDDING_DIM=256
 RNN_UNITS=1024
 BUFFER_SIZE=10000  # size of the shuffle buffer
 # Shuffle the examples, then group them into batches of BATCH_SIZE (incomplete final batch dropped).
 data=dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE,drop_remainder=True)

In [18]:
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
  """Assemble the character model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: Number of distinct character ids; used as the embedding's input
      size and as the number of units of the final Dense layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of LSTM units.
    batch_size: Batch size baked into the model through ``batch_input_shape``;
      the sequence-length axis is left as ``None``.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model. The Dense layer has no
    activation, so its outputs are raw scores over the vocabulary.
  """
  embedding=tf.keras.layers.Embedding(
      vocab_size,embedding_dim,batch_input_shape=[batch_size,None])
  lstm=tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer="glorot_uniform")
  scores=tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding,lstm,scores])

# Build the training model with the training batch size and print its layer summary.
model=build_model(VOCAB_SIZE,EMBEDDING_DIM,RNN_UNITS,BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (128, None, 256)          23040     
                                                                 
 lstm (LSTM)                 (128, None, 1024)         5246976   
                                                                 
 dense (Dense)               (128, None, 90)           92250     
                                                                 
Total params: 5362266 (20.46 MB)
Trainable params: 5362266 (20.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
def loss(labels,logits):
  """Sparse categorical cross-entropy of integer ``labels`` against raw ``logits``.

  ``from_logits=True`` is passed because the scores are not normalised into
  probabilities before reaching this function.
  """
  crossentropy=tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels,logits,from_logits=True)

In [20]:
# Configure training with the Adam optimizer and the custom loss function defined in this notebook.
model.compile(optimizer="adam",loss=loss)

In [21]:
# Directory (relative to the current working directory) where checkpoints are stored.
checkpoint_dir = './training_checkpoints'

# File name template for the checkpoints; contains an {epoch} placeholder.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model) to checkpoint_prefix.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint( filepath=checkpoint_prefix, save_weights_only=True)

In [22]:
# Train for 100 epochs on the batched dataset, checkpointing through checkpoint_callback.
history=model.fit(data,epochs=100,callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [23]:
# Rebuild the same architecture with batch_size=1 so a single sequence can be fed at generation time.
# Note: this rebinds `model`, replacing the training model object.
model=build_model(VOCAB_SIZE,EMBEDDING_DIM,RNN_UNITS,batch_size=1)

In [24]:
# Restore the most recently saved weights into the batch-size-1 model.
# tf.train.latest_checkpoint returns None when the directory holds no checkpoint,
# so fail with a clear message instead of passing None on to load_weights.
latest_checkpoint_path=tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
  raise FileNotFoundError(
      "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir))
model.load_weights(latest_checkpoint_path)
model.build(tf.TensorShape([1,None]))

In [25]:
def generate_text(model,start_string,num_generate=500,temperature=1.0):
  """Generate text by repeatedly sampling the next character from ``model``.

  Args:
    model: Callable model that receives a batch containing one sequence of
      character ids and returns one row of scores per input position. It must
      provide ``reset_states()``.
    start_string: Non-empty seed text. Every character must be a key of
      ``char2indx``.
    num_generate: Number of characters to generate after the seed (default 500).
    temperature: Positive value the scores are divided by before sampling
      (default 1.0). Values below 1 sharpen the distribution and values above
      1 flatten it.

  Returns:
    ``start_string`` followed by the ``num_generate`` sampled characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: If ``start_string`` contains a character missing from ``char2indx``.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature<=0:
    raise ValueError("temperature must be positive")
  unknown_chars=sorted(ch for ch in set(start_string) if ch not in char2indx)
  if unknown_chars:
    raise KeyError(
        "start_string contains characters not in the vocabulary: {!r}".format(unknown_chars))

  # Encode the seed and add a leading batch axis of size 1.
  input_eval=tf.expand_dims([char2indx[s] for s in start_string],0)

  text_generated=[]

  # Clear any recurrent state left over from a previous call.
  model.reset_states()

  for _ in range(num_generate):
    predictions=model(input_eval)

    # Drop the batch axis, then scale the scores by the temperature.
    predictions=tf.squeeze(predictions,0)/temperature
    # Sample one id per position and keep the sample for the last position.
    predicted_id=tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

    # Only the newly sampled character is fed back in on the next step.
    input_eval=tf.expand_dims([predicted_id],0)

    text_generated.append(indx2char[predicted_id])

  return start_string+''.join(text_generated)


In [26]:
# Ask for a seed string interactively and print the text generated from it.
input_txt=input("Type starting string:\n")
print(generate_text(model,input_txt))

Type starting string:
Constitution of India
Constitution of India as by 
law established, 1[that I will uphold the sovereignty and inthirgures havendation of the autonomous State unless the Governor in the interests of 
the said twenty-
sixthenty Assembly, 
suct paragraph, and by 
reason the working of members of the Legislative Council of 
a State shall have the rissolvement and Deulthe 
Government of any State or under any other law for the time being in force. 
73. The Vice-President  to be elected under sub-clauses ( a), (b) and (c) of 
clause (3) of arti
