In [1]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential, load_model, Model
from keras.layers import LSTM, Dense, Dropout, Activation, Embedding, TimeDistributed, concatenate, Input
from keras.layers.wrappers import Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
import word2vec_utils as w2v
import data
import numpy as np
import re
import random
from data_utils import split_dataset 
from chat_constants import *
from keras.utils.np_utils import to_categorical



Using Theano backend.


In [2]:
w2v_model = w2v.initialize()

In [3]:
def null_punisher(y_true, y_pred ):
    '''Provides stronger incentive to avoid Null'''
    L = 3
    error = keras.losses.cosine_proximity( y_true,y_pred)
    if y_true[-1] != BLANK[-1] or y_pred[-1] == BLANK[-1]:
        error *= L
    return error

def get_sets_of_data(size=5000):
    A1, B, A2 = data.load_data()
    All_data = np.array([A1,B,A2])
    num_sections = len(A1)//size
    sets = []
    for i in range(num_sections):
        sets.append(All_data[:,i*size:(i+1)*size])
    return(sets)

In [4]:
LSTM_DROPOUT = 0.15
A1, B, A2 = data.load_data()


In [54]:
A1_input = Input(shape=(MAX_SENT_LENGTH,EMBED_DIM))

A1_layer1 = Bidirectional( LSTM(EMBED_DIM,name = "A1_layer1", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT) )
A1_layer2 = LSTM(EMBED_DIM,name = "A1_layer2", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)
A1_net = A1_layer2(A1_layer1(A1_input))

B_input = Input(shape=(MAX_SENT_LENGTH,EMBED_DIM))

B_layer1 = Bidirectional( LSTM(EMBED_DIM,name = "B_layer1", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT) )
B_layer2 = LSTM(EMBED_DIM,name = "B_layer2", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)

B_net = B_layer2(B_layer1(B_input))

combined = concatenate([A1_net,B_net])
A2_pred1 = LSTM(EMBED_DIM, name = "A2_layer1", return_sequences = True, dropout = LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)

A2_net = A2_pred1(combined)

chat_model = Model( inputs = [ A1_input,B_input ], outputs = [ A2_net ] )
chat_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 30, 101)       0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 30, 101)       0                                            
____________________________________________________________________________________________________
bidirectional_11 (Bidirectional) (None, 30, 202)       164024                                       
____________________________________________________________________________________________________
bidirectional_12 (Bidirectional) (None, 30, 202)       164024                                       
___________________________________________________________________________________________

In [56]:
checkpointer = ModelCheckpoint( 'chat_net.h5',verbose = 1,monitor = 'val_acc',save_best_only = True)
early_stopping = EarlyStopping( monitor = 'val_acc',patience = 2)

In [61]:
sets = get_sets_of_data(size = 8000)

In [62]:

adam = keras.optimizers.Adam(lr = 0.025)
chat_model.compile( optimizer=adam,loss = null_punisher,metrics = ['accuracy'])
chat_model.load_weights('chat_net.h5',by_name = True)
for a1,b,a2 in sets:
    A1_train,B_train,A2_train = w2v.get_training_data(a1,b,a2)
    chat_model.fit([A1_train,B_train], A2_train,
          batch_size=200, epochs=5,
          validation_split = 0.05,
          callbacks = [checkpointer,early_stopping] )

Train on 7600 samples, validate on 400 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 7600 samples, validate on 400 samples
Epoch 1/5

KeyboardInterrupt: 

'1'

In [134]:
word_freqs = np.load('words_in_order_of_freq.npy')[:VOCAB_SIZE]
embeded_words = np.array(map( lambda x: np.append(w2v_model[x],0), word_freqs ) )
categorical_words = np.array( [ to_categorical(x,num_classes = VOCAB_SIZE + 3)[0] for x in range(len(word_freqs)) ] )
w2v_to_onehot = Sequential()
w2v_to_onehot.add(Dense(VOCAB_SIZE+3, input_shape=(EMBED_DIM,),activation = 'softmax',name='w2v_to_onehot_mapper'))


In [139]:
adam = keras.optimizers.Adam(lr = 0.025)#default 0.001
w2v_to_onehot.compile( optimizer=adam,loss = 'categorical_crossentropy',metrics = ['categorical_accuracy'])

In [141]:
w2v_to_onehot.load_weights('w2v_to_onehot.h5',by_name=True)
w2v_to_onehot.fit([embeded_words],categorical_words,batch_size = 200, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7efe5a5d63d0>

In [144]:
i = 78
word=w2v_model[word_freqs[i]]
word = np.append(word,0)

word_vec = w2v_to_onehot.predict(np.array([word]))
print( word_freqs[i])
print(word_freqs[np.where(word_vec==max(word_vec))[0][0]])

going
.


In [138]:
w2v_to_onehot.save('w2v_to_onehot.h5')

In [None]:
def just_model_un_vectorize( predicted ):
    ret =[]
    for word_vec in predicted:
        if abs(word_vec[-1] - 1) < 0.001:
            ret.append("_")
            continue
        word_vec = word_vec[:-1]
        x = w2v_model.similar_by_vector( word_vec, topn = 3 )
        print( x )
        w2v_word,w2v_similarity = x[0]
        print( w2v_word, " ", w2v_similarity )
        ret.append(w2v_word)
    return( " ".join(ret) )
         

2 8 1 @ I'll slip in , talk them into to come out , and you'll be free to blow holy high heaven the whole lot of them .


In [None]:
w2v.unvectorize_initialize()

In [None]:
predicted = chat_model.predict([A1_test,B_test])

In [None]:
predicted_words = just_model_un_vectorize( predicted[0] )
print( predicted_words )

In [None]:
w2v_model.most_similar('')

In [13]:
BLANK.shape

(301,)

In [12]:
a = {}
a['__BLANK__'] = BLANK
a['__BLANK__']

import pickle
with open('unknown_words_stored.pkl', 'w+') as f:
    pickle.dump(a, f)

In [None]:
print( A1[30000], B[30000],A2[30000] )

In [None]:
w2v_model.similar_by_word("man")

In [None]:
len(A1)

In [None]:
A2.shape

In [None]:
AB_Input = Input(shape =(MAX_SENT_LENGTH,EMBED_DIM))

AB_layer1 = LSTM(EMBED_DIM,return_sequences=True, name = "AB_layer1", dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)
AB_layer2 = LSTM(EMBED_DIM,return_sequences=True, name = "AB_layer2", dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)
AB_layer3 = LSTM(EMBED_DIM,return_sequences=True, name = "AB_layer3", dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)

AB_output = AB_layer3(AB_layer2(AB_layer1(AB_Input)))

chat_model = Model(inputs = [ AB_Input], outputs = [AB_output])
chat_model.compile(loss=null_punisher,
              optimizer='Adam',
              metrics=['accuracy']
              )
early_stopping = EarlyStopping( monitor = 'val_acc',patience = 2)
chat_model.load_weights('chat_net.h5',by_name=True)
chat_model.fit([A1B_train], A2_train,
          batch_size=200, epochs=20,
          validation_split = 0.025 )
chat_model.save('chat_net.h5')

In [None]:
my_input = Input(shape=(MAX_SENT_LENGTH,EMBED_DIM))

layer1 = Bidirectional(LSTM(EMBED_DIM,name = "layer1", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT))
layer2 = Bidirectional(LSTM(EMBED_DIM,name = "layer2", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT))
layer3 = LSTM(EMBED_DIM,name = "layer3", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)

my_output = layer3(layer2(layer1(my_input)))

simpl_chat_model = Model(inputs = [ my_input], outputs = [my_output])
simpl_chat_model.summary()

simpl_checkpointer = ModelCheckpoint( 'simpl_chat_net.h5',verbose = 1,monitor = 'val_acc',save_best_only = True)
simpl_early_stopping = EarlyStopping( monitor = 'val_acc',patience = 2)


In [None]:
adam = keras.optimizers.Adam(lr = 0.02)
simpl_chat_model.compile( optimizer=adam,loss = null_punisher,metrics = ['accuracy'])
#simpl_chat_model.load_weights('simpl_chat_net.h5',by_name = True)

In [None]:
for a1,b,a2 in sets:
    A1_train,B_train,A2_train = w2v.get_training_data(a1,b,a2)
    simpl_chat_model.fit([A1_train], B_train,
          batch_size=200, epochs=1,
          validation_split = 0.05,
          callbacks = [simpl_checkpointer,simpl_early_stopping] )
    simpl_chat_model.fit([B_train], A2_train,
          batch_size=200, epochs=1,
          validation_split = 0.05,
          callbacks = [simpl_checkpointer,simpl_early_stopping] )
    #chat_model.save('chat_net.h5')

In [None]:
predit = simpl_chat_model.predict([B_test])

In [None]:
predicted_words = just_model_un_vectorize( predit[0] )
print( predicted_words )

In [5]:
############### One Hot ####################
A1_input = Input(shape=(MAX_SENT_LENGTH,EMBED_DIM))

A1_layer1 = Bidirectional( LSTM(EMBED_DIM,name = "A1_layer1", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT) )
A1_layer2 = LSTM(EMBED_DIM*2,name = "A1_layer2", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)
A1_net = A1_layer2(A1_layer1(A1_input))

B_input = Input(shape=(MAX_SENT_LENGTH,EMBED_DIM))

B_layer1 = Bidirectional( LSTM(EMBED_DIM,name = "B_layer1", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT) )
B_layer2 = LSTM(EMBED_DIM*2,name = "B_layer2", return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)

B_net = B_layer2(B_layer1(B_input))

combined = concatenate([A1_net,B_net])
A2_pred1 = LSTM(EMBED_DIM*4, name = "A2_layer1", return_sequences = True, dropout = LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT)
A2_pred2 = TimeDistributed(Dense(VOCAB_SIZE + 3, name = "A2_layer2", activation = 'softmax' ) )

A2_net =A2_pred2(A2_pred1(combined))

one_hot_chat_model = Model(inputs = [ A1_input,B_input ], outputs = [ A2_net ])
one_hot_chat_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 30, 301)       0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 30, 301)       0                                            
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 30, 602)       1452024                                      
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 30, 602)       1452024                                      
___________________________________________________________________________________________

In [6]:
checkpointer = ModelCheckpoint( 'one_hot_chat_net300dim.h5',verbose = 1,monitor = 'val_categorical_accuracy',save_best_only = True)
early_stopping = EarlyStopping( monitor = 'val_categorical_accuracy',patience = 2)


In [8]:
sets = get_sets_of_data(size = 1000)

In [6]:
x = w2v.one_hot_vectorize("fish")

Loaded word_frequencies data from disk


In [7]:
x.shape


(1,)

In [10]:

adam = keras.optimizers.Adam(lr = 0.025)#default 0.001
one_hot_chat_model.compile( optimizer=adam,loss = 'categorical_crossentropy',metrics = ['categorical_accuracy'])

In [None]:

#one_hot_chat_model.load_weights('one_hot_chat_net_300dim.h5',by_name = True)
i = 0
for a1,b,a2 in sets:
    A1_train,B_train,A2_train = w2v.get_training_data_one_hot_out(a1,b,a2)
   
    cat_a2 = []
    for sent in A2_train:
        cat_a2.append(to_categorical(sent,num_classes = VOCAB_SIZE + 3).astype('int8'))
    cat_a2 = np.array(cat_a2)
    one_hot_chat_model.fit([A1_train,B_train], cat_a2,
          batch_size=50, epochs=5,
          validation_split = 0.05,
          callbacks = [] )#checkpointer,early_stopping] )
    one_hot_chat_model.save('one_hot_chat_net300dim.h5')

Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 950 samples, validate on 50 samples
Epoch

KeyboardInterrupt: 

In [11]:
def one_hot_vectorize( sentence, pad_length = -1, word_freqs = None ):
    if word_freqs is None:
        word_freqs = np.load('words_in_order_of_freq.npy')
    
    sentence = split_sentence( sentence )
    words = sentence.split(" ")
    vectorized_sentence = []
    
    for word in words:
        lower_word = word.lower()
        number = word_freqs.index(lower_word)
        if number > VOCAB_SIZE:
            number = UNK
        vectorized_sentence.append( number )

    if( pad_length != -1 ):
        while( len(vectorized_sentence) < pad_length ):
            vectorized_sentence.append(NULL)
    
    return np.array(vectorized_sentence)

In [40]:
to_categorical(np.array([UNK]),num_classes = VOCAB_SIZE + 2 )[

array([ 0.,  0.,  0., ...,  0.,  0.,  1.])

In [20]:
to_categorical(A2_train[0]).shape

(30, 10003)

In [17]:
A2_train_cat.shape

(3000, 10003)

In [21]:
A2_train.shape

(100, 30)

In [9]:
word_freqs = np.load('words_in_order_of_freq.npy') 

def get_random_test_sentence():
    index = random.randint(0,len(A1)-1)
    print A1[index]
    print B[index]
    print A2[index]
    return( w2v.get_training_data_one_hot_out(A1[index:index+1],B[index:index+1],A2[index:index+1]))

In [44]:
A1_test,B_test,A2_test = get_random_test_sentence()

2 8 1 @ ME ? ! No , uh , I'm just a transvestite .
1 9 1 @ Isn't that the same thing ?
2 8 1 @ No , no ! I like girls . So how 'bout Friday ?


In [52]:
predict = one_hot_chat_model.predict([A1_test,B_test])

In [53]:

pred_words = []
for word_vec in predict[0]:
    #print(word_vec )
    pred_words.append( word_freqs[np.where(word_vec==max(word_vec))[0][0]] )
print( " ".join(pred_words))

prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes prizes poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom poolroom
