In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the lab folder on the mounted Drive so relative file names resolve there.
%cd '/content/drive/MyDrive/MLOM Labs/lab 5'

/content/drive/MyDrive/MLOM Labs/lab 5


In [3]:
#import libraries
from pathlib import Path

import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [9]:
# Load the text corpus and convert it to lowercase

filename = "data.txt"
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the kernel.
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read().lower()

In [10]:
# Preview the first 100 characters of the lowercased corpus.
raw_text[:100]

"project gutenberg's alice's adventures in wonderland, by lewis carroll\n\nthis ebook is for the use of"

In [11]:
# Build the character vocabulary and its integer encoding.
chars = sorted(set(raw_text))  # every distinct character, in sorted order
print(chars)

# Lookup table: character -> its position in `chars`
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

['\n', ' ', '!', '"', '#', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [12]:
# Dataset summary: corpus length and vocabulary size.
n_chars, n_vocab = len(raw_text), len(chars)

print("Total characters: ", n_chars)
print("Total Vocab(Unique characters): ", n_vocab)

Total characters:  163780
Total Vocab(Unique characters):  58


In [13]:
# Build integer-encoded (input window, next character) training pairs.
seq_length = 15

# Encode the whole corpus once, then slice it into windows.
encoded_text = [char_to_int[ch] for ch in raw_text]

# dataX[k] holds the seq_length codes starting at position k;
# dataY[k] is the code of the character that immediately follows that window.
dataX = [encoded_text[begin:begin + seq_length]
         for begin in range(n_chars - seq_length)]
dataY = encoded_text[seq_length:]

n_patterns = len(dataY)
print("Total Patterns: " , n_patterns)

Total Patterns:  163765


In [14]:
# Show one encoded input window followed by its target character code.
print(dataX[1], dataY[1], sep="\n")

[49, 46, 41, 36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49]
38


In [15]:
# Arrange the encoded windows as [samples, time steps, features], the layout the
# LSTM layer is given below, with a single feature (the character code) per step.
# Dividing by the vocabulary size rescales every code into the range [0, 1).
X = numpy.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
print(X)

[[[0.81034483]
  [0.84482759]
  [0.79310345]
  ...
  [0.77586207]
  [0.56896552]
  [0.62068966]]

 [[0.84482759]
  [0.79310345]
  [0.70689655]
  ...
  [0.56896552]
  [0.62068966]
  [0.84482759]]

 [[0.79310345]
  [0.70689655]
  [0.62068966]
  ...
  [0.62068966]
  [0.84482759]
  [0.65517241]]

 ...

 [[0.55172414]
  [0.56896552]
  [0.79310345]
  ...
  [0.79310345]
  [0.79310345]
  [0.72413793]]

 [[0.56896552]
  [0.79310345]
  [0.89655172]
  ...
  [0.79310345]
  [0.72413793]
  [0.86206897]]

 [[0.79310345]
  [0.89655172]
  [0.87931034]
  ...
  [0.72413793]
  [0.86206897]
  [0.22413793]]]


In [20]:
# Convert the output values (single characters converted to integers) into a one-hot encoding.
# The number of classes is pinned to the vocabulary size; otherwise it is inferred from the
# largest code present in dataY, which would leave the output layer narrower than the
# vocabulary whenever the highest-coded character never occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [21]:
# Network: a 256-unit LSTM layer, 20% dropout, then a softmax layer with one
# unit per column of the one-hot targets.
n_timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(256, input_shape=(n_timesteps, n_features)),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer="adam")

# Checkpoint callback: saves a file whenever the training loss reaches a new
# minimum; the file name records the epoch number and that epoch's loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [22]:
# Training hyperparameters: adjust these two values to experiment.
epochs, batch_size = 10, 128
model.fit(X, y, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list)

Epoch 1/10
Epoch 1: loss improved from inf to 2.96595, saving model to weights-improvement-01-2.9659.hdf5
Epoch 2/10
Epoch 2: loss improved from 2.96595 to 2.79203, saving model to weights-improvement-02-2.7920.hdf5
Epoch 3/10
Epoch 3: loss improved from 2.79203 to 2.71513, saving model to weights-improvement-03-2.7151.hdf5
Epoch 4/10
Epoch 4: loss improved from 2.71513 to 2.66121, saving model to weights-improvement-04-2.6612.hdf5
Epoch 5/10
Epoch 5: loss improved from 2.66121 to 2.61506, saving model to weights-improvement-05-2.6151.hdf5
Epoch 6/10
Epoch 6: loss improved from 2.61506 to 2.56786, saving model to weights-improvement-06-2.5679.hdf5
Epoch 7/10
Epoch 7: loss improved from 2.56786 to 2.52080, saving model to weights-improvement-07-2.5208.hdf5
Epoch 8/10
Epoch 8: loss improved from 2.52080 to 2.47810, saving model to weights-improvement-08-2.4781.hdf5
Epoch 9/10
Epoch 9: loss improved from 2.47810 to 2.43562, saving model to weights-improvement-09-2.4356.hdf5
Epoch 10/10
Ep

<keras.callbacks.History at 0x7a3e3b282410>

In [24]:
# Load the network weights from the most recently written checkpoint.
# Checkpoint names embed the epoch's loss, which differs from one training run to
# the next, so a hard-coded file name stops matching as soon as the model is retrained.
checkpoint_paths = sorted(
    Path(".").glob("weights-improvement-*.hdf5"),
    key=lambda path: path.stat().st_mtime,  # oldest first, newest last
)
if not checkpoint_paths:
    raise FileNotFoundError(
        "No 'weights-improvement-*.hdf5' checkpoint found in the working directory; "
        "train the model first."
    )
filename = str(checkpoint_paths[-1])
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [25]:
# Reverse lookup: integer code -> character
int_to_char = {idx: ch for idx, ch in enumerate(chars)}
# generate a random seed
print(len(dataX))
# randint's upper bound is exclusive, so passing len(dataX) allows every pattern,
# including the last one, to be selected.
start = numpy.random.randint(0, len(dataX))
print(start)
# Copy the selected pattern so that later in-place edits to `pattern`
# cannot alter the corresponding entry of dataX.
pattern = list(dataX[start])
print("Seed: ")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

163765
44879
Seed: 
"  under its feet "


In [27]:
# generate characters
length = 10
final = []
# Slide a private copy of the seed: `pattern` may be the very list stored in dataX,
# and appending to it in place would lengthen that training sample and break any
# later reshape of dataX. Working on a copy also makes this cell repeatable.
window = list(pattern)
for _ in range(length):
    # Shape the current window as a single sample of [time steps, 1 feature] and
    # apply the same scaling as the training inputs. A separate name is used so the
    # training tensor X is not overwritten.
    x_input = numpy.reshape(window, (1, len(window), 1)) / float(n_vocab)
    prediction = model.predict(x_input, verbose=0)
    # Greedy decoding: take the class with the highest predicted probability
    index = int(numpy.argmax(prediction))
    final.append(int_to_char[index])
    # Drop the oldest code and append the new one so the window length stays constant
    window = window[1:] + [index]
print(final)

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
