In [1]:
import io
import os
import pickle
import random
import sys

import numpy as np
import pandas as pd
from numpy import array

from keras.callbacks import LambdaCallback
from keras.layers import Dense,Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils.data_utils import get_file



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the whole corpus into memory as a single string.
# The context manager closes the handle as soon as the read is finished
# (the previous version left the file open for the life of the kernel).
file_name = '../Data/baseball30_to_40.csv'
with open(file_name) as file:
    data = file.read()

print('original corpus length:', len(data))

original corpus length: 1206226


In [None]:
# Debug aid: print the working directory (the corpus above is opened via a relative path).
!pwd


In [3]:

# Integer-encode the text: fit the tokenizer on the full corpus, then map
# that same corpus to its flat list of integer indices.
corpus_texts = [data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_texts)
encoded = tokenizer.texts_to_sequences(corpus_texts)[0]


In [13]:
# determine the vocabulary size
# (one more than the number of entries in the tokenizer's word index)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Persist the vocabulary and the fitted tokenizer as objects/<name>.pkl.
# The pickling is done inline because the save_obj helper is defined in a
# later cell, so calling it here fails with NameError on Restart & Run All.
# The target directory is created first so a fresh checkout works too.
os.makedirs('objects', exist_ok=True)
for _obj, _name in ((tokenizer.word_index, 'word_dictionary'),
                    (tokenizer, 'word_tokenizer')):
    with open('objects/' + _name + '.pkl', 'wb') as _fh:
        pickle.dump(_obj, _fh, pickle.HIGHEST_PROTOCOL)

Vocabulary Size: 18186


In [14]:
# Build word -> word training pairs: every encoded word is paired with the
# word that immediately follows it, giving [current_index, next_index].
sequences = [list(pair) for pair in zip(encoded, encoded[1:])]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 226672


In [15]:
# Split the pairs column-wise: column 0 is the input word index,
# column 1 is the index of the word that follows it.
sequences = array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]


In [None]:
# Sanity check: show the container types of the encoded corpus and of the
# pair collection, plus the shape of the pair array.
#df = pd.DataFrame(encoded)
print(type(encoded))
print(type(sequences))
print(sequences.shape)

In [16]:
# one hot encode outputs
# The targets are taken from column 1 of `sequences` instead of from `y`
# itself, so re-running this cell does not one-hot encode an array that is
# already one-hot encoded.
# NOTE(review): the result has one column per vocabulary entry for every
# training pair; integer targets with a sparse loss would need far less
# memory, but that also requires changing the compile cell.
y = to_categorical(sequences[:, 1], num_classes=vocab_size)

In [27]:
# define model: embedding -> LSTM -> softmax over the vocabulary.
# input_length=1 because each sample is a single word index.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             181860    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 18186)             927486    
Total params: 1,121,546
Trainable params: 1,121,546
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# NOTE(review): the string literal below is disabled code, not documentation.
# It references `maxlen` and `chars`, which no visible cell defines, so it
# cannot be re-enabled as-is. Consider deleting this cell.
"""# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

"""

In [28]:
# Configure training (categorical cross-entropy against the one-hot targets,
# Adam optimizer, accuracy reported per epoch) ...
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# ... and train for a single epoch, logging one line per epoch.
model.fit(X, y, epochs=1, verbose=2)

Epoch 1/1
 - 446s - loss: 7.3596 - acc: 0.0362


<keras.callbacks.History at 0x11a3a91d0>

In [None]:
# Persist the trained model; chat_response() reloads it from this same filename.
model.save('keras_text_model.h5')

In [36]:
# Vocabulary words in the tokenizer's word_index iteration order.
words = [k for k, v in tokenizer.word_index.items()]

# Explicit index -> word map, so decoding does not depend on list position
# happening to equal (index - 1).
_index_to_word = {index: word for word, index in tokenizer.word_index.items()}

def decode_sequence(sequence):
    """Translate an iterable of word indices back into words.

    Parameters
    ----------
    sequence : iterable of int
        Word indices, e.g. the array returned by ``model.predict_classes``.

    Returns
    -------
    list of str
        One entry per index. An index with no vocabulary entry (such as 0)
        decodes to ``''``; previously ``words[seq - 1]`` silently wrapped
        around and returned the last vocabulary word for index 0.
    """
    return [_index_to_word.get(int(seq), '') for seq in sequence]

In [31]:
# Debug aid: show the type returned by word_index.items().
print(type(tokenizer.word_index.items()))

<class 'dict_items'>


In [33]:
# evaluate
in_text = "Jack"
print(in_text)
# Use a dedicated name for the encoded seed: reusing `encoded` here would
# overwrite the encoded corpus that the sequence-building cell reads.
seed_encoded = array(tokenizer.texts_to_sequences([in_text]))

# yhat is a numpy array holding the index of the predicted next word
yhat = model.predict_classes(seed_encoded, verbose=0)
# Look the predicted indices up and show them as words
decode_sequence(yhat)

Jack
<class 'numpy.ndarray'>


['is']

In [56]:
def chat_response(in_message):
    """Predict a follow-on word for each encoded word of ``in_message``.

    Works only from the persisted artifacts: the pickled vocabulary and
    tokenizer, and the model stored in 'keras_text_model.h5'.

    Parameters
    ----------
    in_message : str
        Free-text message.

    Returns
    -------
    list of str
        One predicted word per word index the tokenizer produced for the
        message; an empty list when the tokenizer produced none.
    """
    words = load_words()
    tnizer = load_tokenizer()
    #tnizer.fit_on_texts([in_message])
    encoded_message = tnizer.texts_to_sequences([in_message])
    print('Encoded Message',encoded_message)
    # One text was passed in, so take its (only) index list. Every index in
    # it becomes a separate single-word sample for the model.
    encoded_message = array(encoded_message)[0]
    if encoded_message.size == 0:
        # Nothing to predict from; skip loading the model altogether.
        return []

    #load model
    saved_model = load_model('keras_text_model.h5')

    #call a model.predict
    pred = saved_model.predict_classes(encoded_message,verbose=False)

    # Decode with the vocabulary loaded above instead of relying on the
    # notebook-global `words`/`decode_sequence`. Indices outside 1..len(words)
    # map to '' rather than wrapping around to the end of the list.
    return [words[index - 1] if 0 < index <= len(words) else ''
            for index in pred]

In [55]:
# Demo call. Requires the helper cells further down (load_words,
# load_tokenizer, load_obj) to have been run first.
chat_response('The first game of the season')


Encoded Message [[1, 134, 44, 11, 1, 168]]


['game', 'the', 'the', 'the', 'game', 'the']

In [39]:
def decode_outgoing(word_dictionary,in_sequence):
    """Return everything yielded by iterating ``word_dictionary`` as a list.

    For a dict this is the list of its keys in iteration order.

    NOTE(review): ``in_sequence`` is accepted but never read, so the result
    does not depend on it. Confirm whether decoding ``in_sequence`` was the
    intent.
    """
    return list(word_dictionary)

In [8]:
def load_words():
    """Load the pickled 'word_dictionary' object and return its keys as a list."""
    return list(load_obj('word_dictionary').keys())

In [9]:
def load_tokenizer():
    """Load and return the object pickled under the name 'word_tokenizer'."""
    tokenizer_obj = load_obj('word_tokenizer')
    return tokenizer_obj

In [10]:
def save_obj(obj, name ):
    """Pickle ``obj`` to ``objects/<name>.pkl``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        File stem; the file is written as 'objects/' + name + '.pkl'.
    """
    # Create the target directory on first use; without it open() raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs('objects', exist_ok=True)
    with open('objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [11]:
def load_obj(name ):
    """Unpickle and return the object stored in ``objects/<name>.pkl``.

    SECURITY: unpickling can execute arbitrary code, so only load files
    that this notebook wrote itself.
    """
    path = 'objects/' + name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
    """Extend ``seed_text`` by repeatedly predicting the next word.

    Parameters
    ----------
    model :
        Object exposing ``predict_classes(x, verbose=...)`` that returns the
        predicted word index for each input index.
    tokenizer :
        Object exposing ``texts_to_sequences`` and a ``word_index`` mapping
        of word -> index.
    seed_text : str
        Starting text. If it encodes to several words, only the last one is
        fed to the model (previously a multi-word seed produced several
        predictions and the index comparison failed on the resulting array).
    n_words : int
        Maximum number of words to append.

    Returns
    -------
    str
        ``seed_text`` followed by the generated words, space separated.
        Generation stops early when the current text has no known word or
        the predicted index has no vocabulary entry.
    """
    # Build the inverse lookup once instead of scanning the vocabulary on
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text, result = seed_text, seed_text

    # generate at most a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # no known word to predict from
            break

        # predict the word that follows the most recent known word
        yhat = model.predict_classes(array(encoded[-1:]), verbose=0)

        # map predicted word index to word
        out_word = index_to_word.get(int(yhat[0]), '')
        if not out_word:
            break

        # the predicted word becomes the next input
        in_text, result = out_word, result + ' ' + out_word
    return result

### Line-based sequences

In [None]:
# Build line-based sequences: each line of the corpus is encoded on its own
# and contributes every prefix holding at least two word indices.
sequences = []
for line in data.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:stop] for stop in range(2, len(encoded) + 1))
print('Total Sequences: %d' % len(sequences))

In [None]:
# Preview only the first few sequences; echoing the whole list floods the
# notebook output.
sequences[:5]

In [None]:
# pad input sequences
# Every sequence is left-padded ('pre') to the length of the longest one so
# that they form a rectangular array.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

In [None]:
# Separate inputs from targets: every column but the last feeds the model,
# and the last column is one-hot encoded as the word to predict.
sequences = array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [None]:
# define model: embedding -> LSTM -> softmax over the vocabulary.
# The input length is max_length-1 because the last column of each padded
# sequence was split off as the target.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(X, y, epochs=500, verbose=2)

In [None]:
# Vocabulary words in the tokenizer's word_index iteration order.
words = [k for k, v in tokenizer.word_index.items()]

# Explicit index -> word map, so decoding does not depend on list position
# happening to equal (index - 1).
_index_to_word = {index: word for word, index in tokenizer.word_index.items()}

def decode_sequence(sequence):
    """Translate an iterable of word indices back into words.

    Parameters
    ----------
    sequence : iterable of int
        Word indices, e.g. the array returned by ``model.predict_classes``.

    Returns
    -------
    list of str
        One entry per index. An index with no vocabulary entry (such as the
        padding value 0) decodes to ``''``; previously ``words[seq - 1]``
        silently wrapped around and returned the last vocabulary word.
    """
    return [_index_to_word.get(int(seq), '') for seq in sequence]

In [None]:
# evaluate
in_text = "Tom Brady"
print(in_text)
# Use a dedicated name for the encoded seed so the `encoded` variable used
# by the sequence-building cells is not overwritten.
seed_encoded = tokenizer.texts_to_sequences([in_text])
# The model was built with input_length=max_length-1, so the seed must be
# padded the same way as the training inputs; feeding the raw two-word
# sequence does not match the expected input shape.
seed_padded = pad_sequences(seed_encoded, maxlen=max_length-1, padding='pre')

# yhat holds the index of the predicted next word
yhat = model.predict_classes(seed_padded, verbose=0)

# Look the predicted indices up and show them as words
decode_sequence(yhat)