<a href="https://colab.research.google.com/github/SheikLaisha/Text-generation/blob/main/text/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importing dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# loading the data
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it to the garbage collector.
# NOTE(review): the text is assumed to be UTF-8 encoded - confirm against the data file.
with open("the_adventures_of_sherlock_holmes_1661-0.txt", encoding="utf-8") as corpus_file:
  file = corpus_file.read()

In [3]:
# tokenization
# standardization
# Tokenization - the process of breaking a stream of text up into words, phrases, symbols or other meaningful elements
def tokenize_words(input):
  """Lowercase the text, split it into word tokens and drop English stopwords.

  Args:
    input: the raw text to preprocess (a string).

  Returns:
    A single string made of the remaining tokens concatenated together.
  """
  # lowercase everything to standardize it
  input = input.lower()
  # split on runs of word characters (letters, digits, underscore)
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(input)
  # Build the stopword collection once, as a set: this avoids calling
  # stopwords.words('english') for every token and makes each membership test O(1).
  english_stopwords = set(stopwords.words('english'))
  filtered = (token for token in tokens if token not in english_stopwords)
  # NOTE(review): tokens are joined with an empty separator, so word boundaries
  # are not present in the returned string - confirm this is intended.
  return "".join(filtered)

# preprocess the input data, make tokens
processed_inputs = tokenize_words(file)

In [4]:
# chars to numbers
# Collect every distinct character of the processed text in sorted order, then
# give each character the integer equal to its position in that ordering.
chars = sorted(set(processed_inputs))
# lookup table: character -> integer code
char_to_num = {symbol: code for code, symbol in enumerate(chars)}

In [5]:
# sanity check of the character encoding step:
# report how many characters the processed text holds and how many of them are distinct
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 281313
Total vocab: 44


In [6]:
# sequence length
# number of characters that make up one input window
seq_length = 100
# containers for the encoded input windows (x_data) and their target characters (y_data)
x_data, y_data = [], []

In [7]:
# build the training windows
# Slide a window of seq_length characters over the processed text one character
# at a time. Each window (encoded as integers) is an input sequence; the single
# character that follows it is the target.
# x_data and y_data are rebuilt from scratch here rather than appended to, so
# re-running this cell does not duplicate the patterns.
window_starts = range(0, input_len - seq_length, 1)
# input: the seq_length characters starting at each position
x_data = [
  [char_to_num[char] for char in processed_inputs[i:i + seq_length]]
  for i in window_starts
]
# output: the character immediately after each window
y_data = [char_to_num[processed_inputs[i + seq_length]] for i in window_starts]

# check to see how many total input sequences we have
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 281213


In [8]:
# turn the integer windows into an array of shape (n_patterns, seq_length, 1)
# and divide by the vocabulary size to scale the character codes
x = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [9]:
# one-hot encoding our label data
# Pass the vocabulary size explicitly: without num_classes the width is inferred
# from the largest label present in y_data, which would come out too small if the
# highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
# creating the model
# A sequential stack of three LSTM layers, each followed by dropout (used to
# prevent overfitting), ending in a softmax layer with one unit per label column.
model = Sequential([
  # the first two LSTM layers return the full sequence so the next LSTM can consume it
  LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True),
  Dropout(0.2),
  LSTM(256, return_sequences=True),
  Dropout(0.2),
  # the last LSTM layer returns only its final output
  LSTM(128),
  Dropout(0.2),
  Dense(y.shape[1], activation='softmax'),
])

In [11]:
# compile the model: Adam optimizer, categorical cross-entropy on the one-hot labels
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [12]:
# saving weights
# checkpoint callback: after an epoch, write the model to `filepath` only when
# the monitored training loss has improved (lower is better)
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
  filepath,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
desired_callbacks = [checkpoint]

In [13]:
# train the model for 4 epochs in batches of 256, checkpointing through the callbacks
model.fit(x, y, batch_size=256, epochs=4, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.96044, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.96044 to 2.94245, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.94245 to 2.92204, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.92204 to 2.87187, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f862734f510>

In [14]:
# restore the weights written by the checkpoint callback, then compile the model again
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [15]:
# reverse lookup used to turn model output back into characters:
# integer code -> character, following the order of `chars`
num_to_char = dict(enumerate(chars))

In [16]:
# random seed to help generate
# numpy.random.randint excludes its upper bound, so len(x_data) is passed
# to make every window, including the last one, eligible as the seed.
start = numpy.random.randint(0, len(x_data))
# Take a copy of the chosen window so that later changes to `pattern`
# cannot alter the training data held in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" eservedwarmestthankscouldexplaintruestateaffairswithoutbetrayingonecertainlydeservedlittleenoughcons "


In [17]:
# generate the text
# Repeatedly feed the current window to the model, write out the most likely
# next character, then slide the window forward by one character.
for i in range(100):
  # a dedicated name is used for the model input so the training tensor `x` is left intact
  x_input = numpy.reshape(pattern, (1, len(pattern), 1))
  x_input = x_input / float(vocab_len)
  prediction = model.predict(x_input, verbose=0)
  # greedy choice: the class with the highest predicted probability, as a plain int
  index = int(numpy.argmax(prediction))
  result = num_to_char[index]
  sys.stdout.write(result)
  # build a new list (drop the oldest code, add the new one) instead of appending
  # in place, so the list `pattern` was taken from is never mutated
  pattern = pattern[1:] + [index]

erererererererererererererererererererererererererererererererererererererererererererererererererer