# Sports Chatbot Model

## An LSTM model is trained on data from a MySQL database

In [7]:
from numpy import array
import numpy as np
import random
import sys
import io
import os
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.layers import LSTM
from keras.layers import Embedding
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.callbacks import LambdaCallback
import pickle

def save_obj(obj, name):
    """Pickle ``obj`` to ``objects/<name>.pkl`` relative to the working directory.

    Args:
        obj: any picklable Python object.
        name: file name without the ``.pkl`` extension.

    The target directory is created on demand, so the first call on a fresh
    checkout does not fail with ``FileNotFoundError``. An existing file with
    the same name is overwritten.
    """
    path = 'objects/' + name + '.pkl'
    # Create the directory (and any parents) if it is missing; no-op otherwise.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [3]:
# Read the whole corpus file into memory as a single string.
file_name = '../Data/baseball30_to_40.csv'
# The context manager closes the handle as soon as the read finishes,
# instead of leaving it open for the lifetime of the kernel.
with open(file_name) as file:
    data = file.read()

# Length is measured in characters of the decoded text.
print('original corpus length:',len(data))

original corpus length: 1206226


In [8]:
# Fit a tokenizer on the corpus, then turn the corpus into a flat list of
# integer word indices.
tokenizer = Tokenizer()
corpus = [data]  # the tokenizer API takes a list of texts; the corpus is one text
tokenizer.fit_on_texts(corpus)
encoded = tokenizer.texts_to_sequences(corpus)[0]

# Vocabulary size is the number of indexed words plus one.
# NOTE(review): the extra slot presumably covers index 0, which the Keras
# Tokenizer is documented not to assign to any word - confirm.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

# Persist both the word -> index mapping and the fitted tokenizer;
# they are needed again at deployment time.
save_obj(tokenizer.word_index, 'word_dictionary')
save_obj(tokenizer, 'word_tokenizer')


# Pair every token with its successor: [current word, next word].
sequences = [[current, following] for current, following in zip(encoded, encoded[1:])]
print('Total Sequences: %d' % len(sequences))

# Column 0 holds the input word, column 1 the word to predict.
sequences = array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

# One-hot encode the targets: one row per pair, vocab_size columns.
y = to_categorical(y, num_classes=vocab_size)

# Define the model: a 10-dimensional embedding of a single input word,
# a 50-unit LSTM, and a softmax over the whole vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

# Compile the network for multi-class classification over one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network (a single pass over the data).
model.fit(X, y, epochs=1, verbose=True)


# Save the trained model for deployment.
model.save('keras_text_model.h5')

Vocabulary Size: 18186
Total Sequences: 226672
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 10)             181860    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 18186)             927486    
Total params: 1,121,546
Trainable params: 1,121,546
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1


In [10]:
# Vocabulary ordered by word index, so that words[i - 1] is the word whose
# index is i. Sorting by index makes that ordering explicit instead of
# relying on the iteration order of the word_index dict.
# NOTE(review): assumes indices are contiguous and start at 1 - confirm.
words = sorted(tokenizer.word_index, key=tokenizer.word_index.get)

def decode_sequence(sequence):
    """Translate 1-based word indices into the matching entries of ``words``.

    Args:
        sequence: iterable of integer word indices; index ``i`` refers to
            ``words[i - 1]``.

    Returns:
        list with one entry of ``words`` per index, in input order.

    Raises:
        IndexError: if an index is below 1 or greater than ``len(words)``.
    """
    decoded = []
    for seq in sequence:
        # Reject 0 and negatives explicitly: ``words[seq - 1]`` would otherwise
        # wrap around to the end of the list and silently return a wrong word.
        if seq < 1:
            raise IndexError('word index out of range: %d' % seq)
        decoded.append(words[seq - 1])
    return decoded

in_text = input('Enter a word to translate')

# Encode the typed text. Each known word becomes its own single-token sample
# (shape (n_words, 1)), matching the one-word input the model was built for;
# a one-word input gives the same (1, 1) array as before.
encoded = array(tokenizer.texts_to_sequences([in_text])[0]).reshape(-1, 1)

# yhat == index of the predicted next word for each input word.
# argmax over predict() replaces predict_classes(), which newer Keras
# releases no longer provide; for a softmax output the result is the same.
if encoded.size:
    yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
else:
    # No word the tokenizer knows: nothing to feed the model.
    yhat = array([], dtype=int)
# yhat is a numpy array of word indices
# Look the indices up and print the words
print('output:',decode_sequence(yhat))

Enter a word to translateBaseball
output: ['is']


In [33]:

# evaluate the model on a fixed example word
in_text = "Jack"
print(in_text)
# Each known word becomes its own single-token sample (shape (n_words, 1)),
# matching the one-word input the model was built for.
encoded = array(tokenizer.texts_to_sequences([in_text])[0]).reshape(-1, 1)

# yhat == index of the predicted next word for each input word.
# argmax over predict() replaces predict_classes(), which newer Keras
# releases no longer provide; for a softmax output the result is the same.
if encoded.size:
    yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
else:
    # The word is not in the tokenizer's vocabulary: nothing to feed the model.
    yhat = array([], dtype=int)
# yhat is a numpy array of word indices
# Look the indices up and print the words
print('output:',decode_sequence(yhat))

Jack
<class 'numpy.ndarray'>


['is']