In [1]:
# Mount Google Drive so the dataset folder is reachable from this runtime
from google.colab import drive
drive.mount('/content/drive')
# Change directory.
# An absolute path (rooted at the mount point used above) keeps this cell safe
# to re-run: a relative path only resolves while the working directory is
# still the one it was written for, i.e. on the first run.
import os
os.chdir("/content/drive/My Drive/Machine Learning/Datasets")
# Print out the current directory
print(os.getcwd())

Mounted at /content/drive
/content/drive/My Drive/Machine Learning/Datasets


In [2]:
# libraries used
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
import pickle
from google.colab import files


In [3]:
# get text file to be used
# Only ask for an upload when the file is not already in the working
# directory. Uploading a name that already exists makes Colab store the new
# copy under a different name (e.g. "Final Data (1).txt"), which the loading
# cell never reads; the upload prompt also waits for user interaction.
if os.path.exists("Final Data.txt"):
  uploaded = {}  # nothing uploaded in this run; same (empty) shape as an upload result
else:
  uploaded = files.upload()

Saving Final Data.txt to Final Data (1).txt


In [4]:
# read the corpus; the with-block closes the file handle as soon as it is read
with open("Final Data.txt", "r", encoding = "utf8") as file:
  # store file in list (one entry per line, line endings included)
  lines = file.readlines()

# convert list to string (a single join; no need to repeat it once per line)
data = ' '.join(lines)

# strip characters that should not reach the tokenizer:
# new line, carriage return, unicode BOM character, double quotes.
# They are removed outright; words stay separated by the spaces inserted by
# the join above.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('"', '')

# collapse runs of whitespace into single spaces
data = ' '.join(data.split())
data[:500]

'Dragon Ball Z is an iconic anime series that continues the adventures of Son Goku, a Saiyan warrior, after the events of the original Dragon Ball series. Set five years after the conclusion of Dragon Ball, Goku has settled down with his family but is soon pulled back into the world of martial arts and intergalactic conflicts. The series primarily revolves around Goku and his friends defending Earth from powerful adversaries who threaten its safety. These foes often possess immense strength and a'

In [5]:
# fit a word-level tokenizer on the whole corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function
# (with-block so the pickle file is flushed and closed right away)
with open('token.pkl', 'wb') as token_file:
  pickle.dump(tokenizer, token_file)

# convert the corpus into its sequence of word indices
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[6, 7, 12, 13, 41, 18, 19, 9, 14, 42, 1, 43, 3, 20, 5]

In [6]:
# Vocabulary size: number of distinct words known to the tokenizer, plus one
# extra slot for index 0, which the Keras Tokenizer does not assign to a word.
word_count = len(tokenizer.word_index)
vocab_size = word_count + 1
print (vocab_size)

178


In [7]:
# Slide a 4-token window over the corpus: in every window the first 3 tokens
# are the input and the 4th token is the output.
window_count = len(sequence_data) - 3
sequences = [sequence_data[start : start + 4] for start in range(window_count)]

print ("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  292


array([[ 6,  7, 12, 13],
       [ 7, 12, 13, 41],
       [12, 13, 41, 18],
       [13, 41, 18, 19],
       [41, 18, 19,  9],
       [18, 19,  9, 14],
       [19,  9, 14, 42],
       [ 9, 14, 42,  1],
       [14, 42,  1, 43],
       [42,  1, 43,  3]])

In [8]:
# Separate every 4-token row into its model input and its target:
# X holds the first three tokens of each row, y holds the fourth token.
X = np.array([row[:3] for row in sequences])
y = np.array([row[3] for row in sequences])

In [9]:
# One-hot encode the target word indices (one column per vocabulary entry).
# The ndim check keeps the cell safe to re-run: y is overwritten in place, so
# without it a second run would one-hot encode the already encoded matrix.
if y.ndim == 1:
  y = to_categorical(y, num_classes = vocab_size)
y[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,

In [10]:
# creating the model to be trained:
# embedding -> two stacked LSTMs -> dense hidden layer -> softmax over the vocabulary
model = Sequential([
  Embedding(vocab_size, 10, input_length = 3),   # 3 input tokens, 10-dim vectors
  LSTM(1000, return_sequences = True),           # passes the full sequence to the next LSTM
  LSTM(1000),
  Dense(1000, activation = 'relu'),
  Dense(vocab_size, activation = 'softmax'),     # one probability per vocabulary index
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             1780      
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 178)               178178    
                                                                 
Total params: 13228958 (50.46 MB)
Trainable params: 13228958 (50.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# build the model (aka training)

# configure the loss and the optimizer
model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'categorical_crossentropy')

# write the model to disk whenever the training loss reaches a new minimum
checkpoint = ModelCheckpoint(
  filepath = 'next_words.h5',
  monitor = 'loss',
  save_best_only = True,
  verbose = 1,
)
model.fit(X, y, batch_size = 64, epochs = 70, callbacks = [checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 5.17946, saving model to next_words.h5


  saving_api.save_model(


Epoch 2/70
Epoch 2: loss improved from 5.17946 to 5.12754, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 5.12754 to 4.92904, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 4.92904 to 4.84512, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 4.84512 to 4.80004, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 4.80004 to 4.76339, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 4.76339 to 4.74136, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 4.74136 to 4.66504, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 4.66504 to 4.58428, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.58428 to 4.47477, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 4.47477 to 4.37260, saving model to next_words.h5
Epoch 12/70
Epoch 12: loss improved from 4.37260 to 4.22837, saving model to next_words.h5
Epoch 13/70
Epo

<keras.src.callbacks.History at 0x7992165f3c40>

In [12]:
# load the model and the tokenizer for prediction
model = load_model('next_words.h5')
# NOTE: unpickling can execute arbitrary code; only load a token.pkl that was
# written by this notebook. The with-block closes the file after loading.
with open('token.pkl', 'rb') as token_file:
  tokenizer = pickle.load(token_file)

def Predict_Next_Word(model, tokenizer, text): # "BEHOLD, MY PREDICTIONS"
  """Predict the word most likely to follow `text`, print it and return it.

  Args:
    model: trained model; the index of the highest value returned by
      `model.predict` is taken as the predicted word index.
    tokenizer: fitted tokenizer providing `texts_to_sequences` and `index_word`.
    text: the context words (the calling cell passes the last three words
      of the user's line as a list).

  Returns:
    The predicted word, or '' when the winning index has no word attached.

  Raises:
    ValueError: if none of the words in `text` are known to the tokenizer.
  """
  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)

  # Unknown words are dropped by the tokenizer; if nothing is left there is
  # no context to predict from, so fail with a clear message instead of
  # handing an empty array to the model.
  if sequence.size == 0:
    raise ValueError('None of the given words are in the vocabulary')

  preds = int(np.argmax(model.predict(sequence)))

  # index_word is the tokenizer's index -> word mapping, so a direct lookup
  # replaces a scan over every entry of word_index
  predicted_word = tokenizer.index_word.get(preds, '')

  print (predicted_word)
  return predicted_word

In [None]:
# take input from user
while True:
  text = input ('Enter your line: ')

  if text.strip() == '0': # Acts as program killswitch: enter 0 at the prompt to exit
    print ('Execution completed...')
    break

  # get user input and predict the next word from data
  try:
    # split() without an argument ignores leading, trailing and repeated
    # spaces; split(' ') would turn them into empty "words" that take up
    # slots among the last three words
    text = text.split()
    text = text[-3:]  # only the last three words are used as context
    print (text)

    Predict_Next_Word(model, tokenizer, text)

  except Exception as e:
    # report the problem and keep prompting instead of ending the session
    print ('Error occured: ', e)

['the', 'Namekian', 'warrior']
piccolo
['to', 'the', 'tyrannical']
reign
