In [13]:
from numpy.matrixlib.defmatrix import matrix
import spacy
import os
import re
from tensorflow.python.keras.backend import categorical_crossentropy
from keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.layers.core import Activation
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
import math
import numpy as np
from pickle import dump, load

In [14]:
nlp = spacy.load('en')
file = open('/home/tanmay/Desktop/NLP Workflow/Working with a big data/collected_data.txt','r')
minidata = file.readline()
club = [text.lower() for text in re.findall('[A-Za-z]+',minidata)]

## Using the Tokenizer

In [15]:
tk = Tokenizer(num_words=len(club), lower= True)
tk.fit_on_texts(club)
sequence = tk.texts_to_sequences(club)

## Now making the 2D matrix

In [16]:
training_length =math.trunc(math.sqrt(len(sequence)))

new_Matrix = []
for i in range(training_length,len(sequence)):
    row = sequence[i-training_length:i]
    new_Matrix.append(row)

## Making the Matrix into a numpy array and also seperating the test case with the result set

### Premodelling the data

In [17]:
Matrix = np.array(new_Matrix)
vocabulary_size =len(club)
X=Matrix[:,:-1]
y =Matrix[:,-1]
y = to_categorical(y, num_classes= vocabulary_size +1)
seq_len = X.shape[1]

## Creating a function whose perpose is to create our LSTM Sequential model and also compile that model.

In [18]:
def create_model(vocab_Size, seq_Len):
    model = Sequential()
    model.add(Embedding(vocab_Size,seq_Len,input_length=seq_Len))
    model.add(LSTM(seq_Len*2,return_sequences=True))
    model.add(LSTM(seq_Len*2))
    model.add(Dense(seq_Len*2,activation='relu'))
    model.add(Dense(vocab_Size,activation = 'softmax'))

    model.compile(loss ='categorical_crossentropy', optimizer='adam', metrics= ['accuracy'])
    print("Summary :-")
    print(model.summary())
    return model

## Creating the model, training it, saving it and dumping the train model into our  initialized tokenizer 

In [19]:
model = create_model(vocabulary_size+ 1, seq_len)
model.fit(X,y,batch_size = 128, verbose=2, epochs=3)

model.save('Training sample.h5')
dump(tk,open('Training sample.h5','wb'))

Summary :-
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 7)              490       
_________________________________________________________________
lstm_2 (LSTM)                (None, 7, 14)             1232      
_________________________________________________________________
lstm_3 (LSTM)                (None, 14)                1624      
_________________________________________________________________
dense_2 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 70)                1050      
Total params: 4,606
Trainable params: 4,606
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
1/1 - 2s - loss: 4.2486 - accuracy: 0.0164
Epoch 2/3
1/1 - 0s - loss: 4.2476 - accuracy: 0.0

## Defining the function that generate the text

### Characterstics : 
- We have just train the tokenized
- We already have knowledge about the vocabulary
- This is robust to have shorter seen text of longer the seen text than the seuqence length
- Seen text must be equal to the the text of model that has been used to tained on. Otherwise we have to pad it.


In [20]:
def generate_text(model,tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range (num_gen_words):
        encoded_text = tokenizer.text_to_sequences([input_text])[0]
        pad_encoding = pad_sequences([encoded_text], max_len =seq_len,truncating='pre')
        #Here the max length is whatever the sequence length that was passed in this
        '''
            Moreover: This essentially makes sure that if you passed in a long seen text.
            Thus we have taken very small set to train upon thus we have to pad it to make 
            sure that it has only small number of tokens or if the seen text happens to be too short we are going to pad it to fill uo that allocated place

            In order to get better result, it is recommended just passing a seen text thatt is actually the same expected length thta model has.
        '''
        pred_word = model.predict_model(pad_encoding, verbose = 2)
        input_text +=" "+ pred_word
    return ' '.join(output_text)

# Using the function and then testing the data