In [1]:
import numpy as np
import pandas as pd
import matplotlib
import os
from keras.layers import Input, BatchNormalization, Concatenate, Dense, GlobalAveragePooling2D, Dropout, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from keras import regularizers
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, History, TensorBoard
from keras.layers import Embedding, Flatten, Dense
from keras.utils import plot_model

Using TensorFlow backend.


In [2]:
# Shared network hyper-parameters.
# D: embedding dimension, H: number of hidden-layer nodes.
D, H = 100, 300

In [42]:
# Load the pre-computed arrays from .npy files in the working directory.
# Text inputs: train pair first, then the test pair.
product_desc, product_query = np.load('train_desc.npy'), np.load('train_query.npy')
product_desc_test, product_query_test = np.load('test_desc.npy'), np.load('test_query.npy')
# Arrays stored under the "*_embedding_image_preds" file names (train, then test).
train_image_embeddings, test_image_embeddings = (
    np.load('train_embedding_image_preds.npy'),
    np.load('test_embedding_image_preds.npy'),
)
# Identifiers written next to each test prediction in the final dump cell.
product_id_final_test = np.load('product_id_final_test.npy')

In [4]:
# Sanity check: display the dimensions of the loaded train image embeddings.
np.shape(train_image_embeddings)

(5659, 100)

In [5]:
# Stack train and test text along axis 0 so the tokenizers are fitted on the combined corpus.
# NOTE(review): final_text is not referenced by any later visible cell.
final_text = np.concatenate(
    [product_desc, product_query, product_desc_test, product_query_test],
    axis=0,
)
# Queries only (train followed by test).
final_text_query = np.concatenate([product_query, product_query_test], axis=0)
# Descriptions only (train followed by test).
final_text_description = np.concatenate([product_desc, product_desc_test], axis=0)


In [6]:
import numpy as np
# Parse the GloVe text file into a {token: vector} lookup.
# Every non-empty line is treated as "<token> <v1> <v2> ...", split on whitespace.
modelGlove = {}
with open("glove.6B.100d.txt", "r", encoding='utf-8') as lines:
    # Iterate the file object directly instead of driving readline() by hand.
    for line in lines:
        fields = line.split()  # split once and reuse for both token and values
        if not fields:
            # Blank line: nothing to parse; indexing fields[0] would raise IndexError.
            continue
        modelGlove[fields[0]] = np.array([float(value) for value in fields[1:]])

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
MAX_NB_WORDS = 20000  # vocabulary cap handed to the tokenizer
# `num_words` is the current argument name; the legacy `nb_words` spelling is deprecated.
tokenizer_desc = Tokenizer(num_words=MAX_NB_WORDS)
# Fit on train + test descriptions so both splits share one word index.
tokenizer_desc.fit_on_texts(final_text_description)
# Convert each description into a list of integer word indices.
sequence_desc = tokenizer_desc.texts_to_sequences(product_desc)
sequences_desc_test = tokenizer_desc.texts_to_sequences(product_desc_test)



In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
MAX_NB_WORDS = 20000  # vocabulary cap handed to the tokenizer
# `num_words` is the current argument name; the legacy `nb_words` spelling is deprecated.
tokenizer_query = Tokenizer(num_words=MAX_NB_WORDS)
# Fit on train + test queries so both splits share one word index.
tokenizer_query.fit_on_texts(final_text_query)
# Convert each query into a list of integer word indices.
sequences_query = tokenizer_query.texts_to_sequences(product_query)
sequences_query_test = tokenizer_query.texts_to_sequences(product_query_test)

In [9]:
# Word -> integer index mapping learned by the description tokenizer.
word_index_desc = tokenizer_desc.word_index
# Report the size of the full description vocabulary.
print('Found %s unique tokens.' % (len(word_index_desc),))

Found 34222 unique tokens.


In [10]:
# Word -> integer index mapping learned by the query tokenizer.
word_index_query = tokenizer_query.word_index
# Report the size of the full query vocabulary.
print('Found %s unique tokens.' % (len(word_index_query),))

Found 476 unique tokens.


In [11]:
# Bring every train description sequence to length 300 (matches the description Input shape).
data_d = pad_sequences(sequences=sequence_desc, maxlen=300)

In [12]:
# Bring every test description sequence to length 300 (matches the description Input shape).
data_d_test = pad_sequences(sequences=sequences_desc_test, maxlen=300)

In [13]:
# Bring every train query sequence to length 20 (matches the query Input shape).
data_q = pad_sequences(sequences=sequences_query, maxlen=20)

In [14]:
# Bring every test query sequence to length 20 (matches the query Input shape).
data_q_test = pad_sequences(sequences=sequences_query_test, maxlen=20)

In [15]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root-mean-squared error between predictions and targets.

    Args:
        y_true: backend tensor of target values.
        y_pred: backend tensor of predicted values.

    Returns:
        Backend tensor: the square root of the mean of the squared
        differences, with the mean taken over all elements.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_error)
    return K.sqrt(mean_squared)

In [16]:
# Build the description embedding matrix: row i holds the GloVe vector of the
# word whose tokenizer index is i. The matrix has one row more than the
# vocabulary size and D columns (the embedding dimension declared earlier,
# instead of a repeated literal 100).
embedding_matrix_desc = np.zeros((len(word_index_desc) + 1, D))
for word, i in word_index_desc.items():
    embedding_vector = modelGlove.get(word)
    if embedding_vector is not None:
        # Words missing from the GloVe lookup keep their all-zero row.
        embedding_matrix_desc[i] = embedding_vector

In [17]:
# Build the query embedding matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i. The matrix has one row more than the vocabulary
# size and D columns (the embedding dimension declared earlier, instead of a
# repeated literal 100).
embedding_matrix_query = np.zeros((len(word_index_query) + 1, D))
for word, i in word_index_query.items():
    embedding_vector = modelGlove.get(word)
    if embedding_vector is not None:
        # Words missing from the GloVe lookup keep their all-zero row.
        embedding_matrix_query[i] = embedding_vector

In [18]:
from keras.layers import Embedding

# Frozen (trainable=False) embedding lookups initialised from the GloVe-based matrices.
# Query branch: sequences of length 20.
embedding_layer_q = Embedding(
    input_dim=len(word_index_query) + 1,
    output_dim=100,
    weights=[embedding_matrix_query],
    input_length=20,
    trainable=False,
)
# Description branch: sequences of length 300.
embedding_layer_d = Embedding(
    input_dim=len(word_index_desc) + 1,
    output_dim=100,
    weights=[embedding_matrix_desc],
    input_length=300,
    trainable=False,
)

In [19]:
# Network hyper-parameters (same values as assigned in the second cell of the notebook).
# D: embedding dimension, H: number of hidden-layer nodes.
D, H = 100, 300

In [20]:
# Two-branch text-only regressor. Layers are instantiated in the same order as
# before so that Keras' auto-generated layer names stay unchanged.

# Description branch input: 300 word indices -> frozen embedding lookup.
sequence_input_d = Input(shape=(300,), dtype='int32')
embeddings_d = embedding_layer_d(sequence_input_d)
# Query branch input: 20 word indices -> frozen embedding lookup.
sequence_input_q = Input(shape=(20,), dtype='int32')
embeddings_q = embedding_layer_q(sequence_input_q)

# Flatten each (sequence, embedding) output into a single vector per sample.
embedding_layer_d_flatten = Flatten()(embeddings_d)
embedding_layer_q_flatten = Flatten()(embeddings_q)

# One H-unit ReLU projection per branch, then join the two branches.
# (An image-embedding branch and a Dropout layer were left disabled in the original cell.)
x1 = Dense(H, activation='relu')(embedding_layer_d_flatten)
x2 = Dense(H, activation='relu')(embedding_layer_q_flatten)
merge_x = concatenate([x1, x2])

# Shrinking ReLU tower: H -> H//4 -> H//16 units.
x4 = merge_x
for units in (H, H // 4, H // 16):
    x4 = Dense(units, activation='relu')(x4)

# Single linear output unit: the predicted relevance score.
final_x = Dense(1)(x4)

In [21]:
# Wrap the graph: inputs are [description indices, query indices], output is the relevance score.
model = Model(
    inputs=[sequence_input_d, sequence_input_q],
    outputs=final_x,
)

In [22]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 100)     3422300     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 100)      47700       input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1 

In [23]:
# Render the model graph to an image file in the working directory.
plot_model(model=model, to_file='model.png')

In [24]:
# Train against the custom RMSE loss with Adam; additionally track MSE and MAE.
model.compile(
    optimizer=Adam(lr=0.0001),
    loss=root_mean_squared_error,
    metrics=['mse', 'mae'],
)

In [25]:
# Regression targets for the training rows.
train_relevance = np.load(file='final_train_Y.npy')

In [26]:
# Regression targets for the test rows.
test_relevance = np.load(file='final_test_Y.npy')

In [27]:
# Sanity check: number of test targets.
np.shape(test_relevance)

(1544,)

In [28]:
# Sanity check: number of train targets.
np.shape(train_relevance)

(5659,)

In [29]:
from sklearn.utils import shuffle
# Shuffle all training arrays with one shared permutation so rows stay aligned.
# A fixed random_state makes the later train/validation split reproducible
# across kernel restarts.
train_descriptor, train_query, train_image_embedding_model, train_relevance = shuffle(
    data_d, data_q, train_image_embeddings, train_relevance,
    random_state=42,
)

In [30]:
# Sanity check: all shuffled arrays should share the same first dimension.
tuple(
    np.shape(array)
    for array in (train_descriptor, train_query, train_relevance, train_image_embedding_model)
)

((5659, 300), (5659, 20), (5659,), (5659, 100))

In [31]:
# Row index separating training rows ([:split]) from validation rows ([split:])
# in the shuffled arrays.
train_valid_split = 5000

In [32]:
# Hold out every row from train_valid_split onward as the validation set.
# This must run before the training arrays are truncated in the following cell.
valid_rows = slice(train_valid_split, None)
valid_descriptor = train_descriptor[valid_rows]
valid_query = train_query[valid_rows]
valid_relevance = train_relevance[valid_rows]
valid_image = train_image_embedding_model[valid_rows]

In [33]:
# Sanity check: size of the validation hold-out.
np.shape(valid_image)

(659, 100)

In [34]:
# Keep only the first train_valid_split rows for training.
# NOTE: this overwrites the shuffled arrays in place, so the validation slices
# must already have been taken and this cell should not be re-run on its own.
train_rows = slice(None, train_valid_split)
train_descriptor = train_descriptor[train_rows]
train_query = train_query[train_rows]
train_relevance = train_relevance[train_rows]
train_image_embedding_model = train_image_embedding_model[train_rows]

In [35]:
# Save weights only, and only when the monitored validation loss improves.
checkpointer = ModelCheckpoint(
    filepath='NN_best_only_text_1.hdf5',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)
# NOTE(review): reduceLR is created here but is not in the callbacks list passed
# to model.fit below, and its patience equals the early-stopping patience.
reduceLR = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=20,
    min_lr=0,
)
# Stop training after 20 epochs without val_loss improvement.
earlystopping = EarlyStopping(monitor='val_loss', patience=20)
# NOTE(review): this is set after the model has already been built; it may not
# affect device selection at this point — confirm, or move it before the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [36]:
# Train on the description/query pair, validating on the held-out rows.
# Only the checkpoint and early-stopping callbacks are active.
model.fit(
    x=[train_descriptor, train_query],
    y=train_relevance,
    batch_size=32,
    epochs=100,
    verbose=2,
    callbacks=[checkpointer, earlystopping],
    validation_data=([valid_descriptor, valid_query], valid_relevance),
)

Train on 5000 samples, validate on 659 samples
Epoch 1/100
 - 6s - loss: 1.0521 - mean_squared_error: 1.2511 - mean_absolute_error: 0.8700 - val_loss: 0.8639 - val_mean_squared_error: 0.7558 - val_mean_absolute_error: 0.7231

Epoch 00001: val_loss improved from inf to 0.86390, saving model to NN_best_only_text_1.hdf5
Epoch 2/100
 - 4s - loss: 0.6922 - mean_squared_error: 0.4931 - mean_absolute_error: 0.5480 - val_loss: 0.7795 - val_mean_squared_error: 0.6176 - val_mean_absolute_error: 0.6271

Epoch 00002: val_loss improved from 0.86390 to 0.77951, saving model to NN_best_only_text_1.hdf5
Epoch 3/100
 - 4s - loss: 0.5278 - mean_squared_error: 0.2853 - mean_absolute_error: 0.4122 - val_loss: 0.7883 - val_mean_squared_error: 0.6296 - val_mean_absolute_error: 0.6440

Epoch 00003: val_loss did not improve from 0.77951
Epoch 4/100
 - 4s - loss: 0.4257 - mean_squared_error: 0.1884 - mean_absolute_error: 0.3232 - val_loss: 0.7813 - val_mean_squared_error: 0.6208 - val_mean_absolute_error: 0.62


Epoch 00035: val_loss did not improve from 0.76637
Epoch 36/100
 - 4s - loss: 0.1609 - mean_squared_error: 0.0293 - mean_absolute_error: 0.1068 - val_loss: 0.7721 - val_mean_squared_error: 0.6051 - val_mean_absolute_error: 0.6085

Epoch 00036: val_loss did not improve from 0.76637
Epoch 37/100
 - 4s - loss: 0.1626 - mean_squared_error: 0.0301 - mean_absolute_error: 0.1085 - val_loss: 0.7789 - val_mean_squared_error: 0.6155 - val_mean_absolute_error: 0.6129

Epoch 00037: val_loss did not improve from 0.76637
Epoch 38/100
 - 4s - loss: 0.1618 - mean_squared_error: 0.0297 - mean_absolute_error: 0.1080 - val_loss: 0.7709 - val_mean_squared_error: 0.6034 - val_mean_absolute_error: 0.6033

Epoch 00038: val_loss did not improve from 0.76637
Epoch 39/100
 - 4s - loss: 0.1585 - mean_squared_error: 0.0282 - mean_absolute_error: 0.1067 - val_loss: 0.7649 - val_mean_squared_error: 0.5946 - val_mean_absolute_error: 0.5946

Epoch 00039: val_loss improved from 0.76637 to 0.76487, saving model to NN_


Epoch 00070: val_loss did not improve from 0.75824
Epoch 71/100
 - 4s - loss: 0.1405 - mean_squared_error: 0.0229 - mean_absolute_error: 0.0876 - val_loss: 0.7653 - val_mean_squared_error: 0.5952 - val_mean_absolute_error: 0.5953

Epoch 00071: val_loss did not improve from 0.75824
Epoch 72/100
 - 4s - loss: 0.1369 - mean_squared_error: 0.0219 - mean_absolute_error: 0.0854 - val_loss: 0.7688 - val_mean_squared_error: 0.6005 - val_mean_absolute_error: 0.6012

Epoch 00072: val_loss did not improve from 0.75824
Epoch 73/100
 - 4s - loss: 0.1385 - mean_squared_error: 0.0224 - mean_absolute_error: 0.0867 - val_loss: 0.7671 - val_mean_squared_error: 0.5975 - val_mean_absolute_error: 0.6047

Epoch 00073: val_loss did not improve from 0.75824
Epoch 74/100
 - 4s - loss: 0.1378 - mean_squared_error: 0.0221 - mean_absolute_error: 0.0884 - val_loss: 0.7662 - val_mean_squared_error: 0.5970 - val_mean_absolute_error: 0.5982

Epoch 00074: val_loss did not improve from 0.75824
Epoch 75/100
 - 4s - los

<keras.callbacks.History at 0x7fdc806870b8>

In [37]:
# Restore the weights written by the best-only checkpoint callback.
model.load_weights(filepath='NN_best_only_text_1.hdf5')

In [38]:
# Score the restored model on the test split; returns [loss (RMSE), mse, mae].
model.evaluate(x=[data_d_test, data_q_test], y=test_relevance)



[0.766140341758728, 0.6071317177362393, 0.6018003716369985]

In [43]:
# Predict relevance for the test split and dump one record per test row:
# (query text, product id, true relevance, predicted relevance).
predictedResilts = model.predict([data_d_test,data_q_test])
resDumpRawImage = [
    # Each prediction row is a one-element array; .item() extracts the Python
    # scalar (replacement for the deprecated np.asscalar).
    (query, product_id, relevance, prediction.item())
    for query, product_id, relevance, prediction in zip(
        product_query_test, product_id_final_test, test_relevance, predictedResilts
    )
]
np.save('resDumpTextOnly', resDumpRawImage)