In [1]:
import numpy as np
import pandas as pd
import matplotlib
import os
from keras.layers import Input, BatchNormalization, Concatenate, Dense, GlobalAveragePooling2D, Dropout, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from keras import regularizers
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, History, TensorBoard
from keras.layers import Embedding, Flatten, Dense
from keras.utils import plot_model

Using TensorFlow backend.


In [2]:
# Network hyper-parameters.
# D: embedding dimension, H: number of hidden-layer nodes.
D, H = 100, 300

In [3]:
# Load every pre-computed array from the working directory in one pass.
# The description/query arrays hold raw text (they are fed to a Tokenizer
# later); the remaining arrays are presumably what their file names say
# (image-embedding predictions and test product ids) — verify against the
# notebook that produced them.
(
    product_desc,
    product_query,
    product_desc_test,
    product_query_test,
    train_image_embeddings,
    test_image_embeddings,
    product_id_final_test,
) = [
    np.load(npy_file)
    for npy_file in (
        'train_desc.npy',
        'train_query.npy',
        'test_desc.npy',
        'test_query.npy',
        'train_embedding_image_preds.npy',
        'test_embedding_image_preds.npy',
        'product_id_final_test.npy',
    )
]

In [4]:
# Sanity check: display the shape of the training image-embedding array.
train_image_embeddings.shape

(5659, 100)

In [5]:
# Pool the train and test text so that vocabularies can be fitted on both splits.
final_text_description = np.concatenate([product_desc, product_desc_test], axis=0)
final_text_query = np.concatenate([product_query, product_query_test], axis=0)
# All four text arrays stacked in the order: train desc, train query, test desc, test query.
final_text = np.concatenate(
    [product_desc, product_query, product_desc_test, product_query_test],
    axis=0,
)

In [6]:
import numpy as np

# Parse the GloVe text file into a dict mapping each token to its vector.
# Each non-empty line is expected to be "<token> <float> <float> ...".
modelGlove = {}
with open("glove.6B.100d.txt", "r", encoding='utf-8') as lines:
    # Iterate the file object directly instead of a manual readline() loop.
    for line in lines:
        fields = line.split()  # split once and reuse for both token and values
        if not fields:
            # A blank / whitespace-only line has no token; skip it rather than
            # crashing with IndexError on fields[0].
            continue
        modelGlove[fields[0]] = np.array([float(value) for value in fields[1:]])

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Upper bound on the vocabulary size kept by the tokenizer.
MAX_NB_WORDS = 20000
# `num_words` is the Keras 2 name of this argument; the former `nb_words`
# spelling is deprecated and only accepted with a warning.
tokenizer_desc = Tokenizer(num_words=MAX_NB_WORDS)
# The vocabulary is fitted on the train + test descriptions pooled together.
tokenizer_desc.fit_on_texts(final_text_description)
# Convert each description into a list of integer token ids.
sequence_desc = tokenizer_desc.texts_to_sequences(product_desc)
sequences_desc_test = tokenizer_desc.texts_to_sequences(product_desc_test)



In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Upper bound on the vocabulary size kept by the tokenizer.
MAX_NB_WORDS = 20000
# `num_words` is the Keras 2 name of this argument; the former `nb_words`
# spelling is deprecated and only accepted with a warning.
tokenizer_query = Tokenizer(num_words=MAX_NB_WORDS)
# The vocabulary is fitted on the train + test queries pooled together.
tokenizer_query.fit_on_texts(final_text_query)
# Convert each query into a list of integer token ids.
sequences_query = tokenizer_query.texts_to_sequences(product_query)
sequences_query_test = tokenizer_query.texts_to_sequences(product_query_test)

In [9]:
# Token -> integer id mapping learned from the description texts.
word_index_desc = tokenizer_desc.word_index
n_desc_tokens = len(word_index_desc)
print('Found %s unique tokens.' % n_desc_tokens)

Found 34222 unique tokens.


In [10]:
# Token -> integer id mapping learned from the query texts.
word_index_query = tokenizer_query.word_index
n_query_tokens = len(word_index_query)
print('Found %s unique tokens.' % n_query_tokens)

Found 476 unique tokens.


In [11]:
# Bring every training description sequence to a fixed length of 300 token ids.
data_d = pad_sequences(sequence_desc, maxlen=300)

In [12]:
# Bring every test description sequence to a fixed length of 300 token ids.
data_d_test = pad_sequences(sequences_desc_test, maxlen=300)

In [13]:
# Bring every training query sequence to a fixed length of 20 token ids.
data_q = pad_sequences(sequences_query, maxlen=20)

In [14]:
# Bring every test query sequence to a fixed length of 20 token ids.
data_q_test = pad_sequences(sequences_query_test, maxlen=20)

In [15]:
def root_mean_squared_error(y_true, y_pred):
    """Root-mean-squared-error loss built from Keras backend ops.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        The square root of the mean (taken with no axis argument) of the
        element-wise squared difference between ``y_pred`` and ``y_true``.
    """
    squared_residual = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_residual)
    return K.sqrt(mean_squared)

In [16]:
# Row `token_id` holds the GloVe vector of the description token with that id.
# Rows for tokens that GloVe does not know are left all-zero.
embedding_matrix_desc = np.zeros((len(word_index_desc) + 1, 100))
for token, token_id in word_index_desc.items():
    glove_vector = modelGlove.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_desc[token_id] = glove_vector

In [17]:
# Row `token_id` holds the GloVe vector of the query token with that id.
# Rows for tokens that GloVe does not know are left all-zero.
embedding_matrix_query = np.zeros((len(word_index_query) + 1, 100))
for token, token_id in word_index_query.items():
    glove_vector = modelGlove.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_query[token_id] = glove_vector

In [18]:
from keras.layers import Embedding

# Frozen (non-trainable) embedding lookups initialised from the GloVe matrices.
# The query layer is created first, then the description layer, so the
# auto-generated Keras layer names keep their original order.
embedding_layer_q = Embedding(
    input_dim=len(word_index_query) + 1,
    output_dim=100,
    weights=[embedding_matrix_query],
    input_length=20,
    trainable=False,
)
embedding_layer_d = Embedding(
    input_dim=len(word_index_desc) + 1,
    output_dim=100,
    weights=[embedding_matrix_desc],
    input_length=300,
    trainable=False,
)

In [19]:
# Network hyper-parameters (same values as declared near the top of the notebook).
# D: embedding dimension, H: number of hidden-layer nodes.
D, H = 100, 300

In [20]:
# Three-branch regression network: description tokens, query tokens and an
# image embedding are each projected to H units, concatenated, and reduced
# to a single output value.

# Description branch: 300 token ids -> frozen GloVe embedding lookup.
sequence_input_d = Input(shape=(300,), dtype='int32')
embeddings_d = embedding_layer_d(sequence_input_d)
# Query branch: 20 token ids -> frozen GloVe embedding lookup.
sequence_input_q = Input(shape=(20,), dtype='int32')
embeddings_q = embedding_layer_q(sequence_input_q)
# Image branch: a 100-dimensional vector fed in directly.
imageEmbeddingInput = Input(shape=(100,))
# Flatten each embedded sequence into one vector per sample.
embedding_layer_d_flatten = Flatten()(embeddings_d) 
embedding_layer_q_flatten = Flatten()(embeddings_q)
# Project every branch to H units (x1: description, x2: query, x3: image).
x3 = Dense(H,activation='relu')(imageEmbeddingInput)
x1 = Dense(H,activation='relu')(embedding_layer_d_flatten)
x2 = Dense(H,activation='relu')(embedding_layer_q_flatten)
# Fuse the branches, then narrow through H -> H//4 -> H//16 units.
merge_x = concatenate([x1,x2,x3])
x4 = Dense(H,activation='relu')(merge_x)
x4 = Dense(H//4,activation='relu')(x4)
x4 = Dense(H//16,activation='relu')(x4)
# Dropout is currently disabled.
#x4 = Dropout(0.5)(x4)
# Single output unit with no activation (the regression target).
final_x = Dense(1)(x4)

In [21]:
# Assemble the model; input order is [description ids, query ids, image embedding].
model = Model(inputs=[sequence_input_d,sequence_input_q,imageEmbeddingInput ], outputs=final_x)

In [22]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 100)     3422300     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 100)      47700       input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1 

In [23]:
# Write a diagram of the model graph to model.png.
plot_model(model, to_file='model.png')

In [None]:
# Optimise RMSE directly with Adam; MSE and MAE are reported as extra metrics.
adam_optimizer = Adam(lr=0.0001)
model.compile(
    optimizer=adam_optimizer,
    loss=root_mean_squared_error,
    metrics=['mse', 'mae'],
)

In [None]:
# Regression targets for the training rows.
train_relevance = np.load('final_train_Y.npy')

In [None]:
# Regression targets for the test rows.
test_relevance = np.load('final_test_Y.npy')

In [None]:
# Sanity check: display the shape of the test targets.
test_relevance.shape

(1544,)

In [None]:
# Sanity check: display the shape of the training targets.
train_relevance.shape

(5659,)

In [None]:
from sklearn.utils import shuffle
# Shuffle the three feature arrays and the targets with one shared permutation
# so that rows stay aligned. A fixed random_state makes the train/validation
# split that follows reproducible across kernel restarts.
# NOTE(review): this cell overwrites `train_relevance` while the feature inputs
# keep their original names, so re-running it without reloading the targets
# would pair already-shuffled targets with unshuffled features.
train_descriptor, train_query, train_image_embedding_model, train_relevance = shuffle(
    data_d, data_q, train_image_embeddings, train_relevance, random_state=42)

In [None]:
# Sanity check: all four shuffled arrays should share the same first dimension.
train_descriptor.shape, train_query.shape, train_relevance.shape, train_image_embedding_model.shape


((5659, 300), (5659, 20), (5659,), (5659, 100))

In [None]:
# Row index at which the shuffled data is cut: rows before it train, rows from it onward validate.
train_valid_split = 5000

In [None]:
# Hold out every row from index `train_valid_split` onward for validation.
held_out_rows = slice(train_valid_split, None)
valid_descriptor, valid_query, valid_relevance, valid_image = (
    train_descriptor[held_out_rows],
    train_query[held_out_rows],
    train_relevance[held_out_rows],
    train_image_embedding_model[held_out_rows],
)

In [None]:
# Sanity check: display the shape of the validation image embeddings.
valid_image.shape


(659, 100)

In [None]:
# Keep only the first `train_valid_split` rows for training. The names are
# rebound in place, so the validation slices must be taken before this runs.
training_rows = slice(None, train_valid_split)
train_descriptor, train_query, train_relevance, train_image_embedding_model = (
    train_descriptor[training_rows],
    train_query[training_rows],
    train_relevance[training_rows],
    train_image_embedding_model[training_rows],
)

In [None]:
# Training callbacks.
# Save weights only, and only when the monitored quantity improves.
checkpointer = ModelCheckpoint(filepath='NN_best_desc+image2desc_1.hdf5', verbose=1, save_best_only=True, save_weights_only=True)
# NOTE(review): reduceLR is not included in the callbacks list passed to
# model.fit in this notebook, and its patience (20) equals the EarlyStopping
# patience, so it would not reduce the rate before training stops — confirm intent.
reduceLR = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0)
# Stop once val_loss has failed to improve for 20 epochs.
earlystopping = EarlyStopping(monitor='val_loss', patience=20)
# NOTE(review): this is set after Keras/TensorFlow were imported and the model
# was built; it may have no effect at this point — consider moving it to the
# first cell, before the Keras imports.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
# Train for at most 100 epochs, checkpointing the best weights and stopping
# early on a val_loss plateau. Input order matches the model definition:
# [description ids, query ids, image embedding].
train_inputs = [train_descriptor, train_query, train_image_embedding_model]
valid_inputs = [valid_descriptor, valid_query, valid_image]
model.fit(
    train_inputs,
    train_relevance,
    validation_data=(valid_inputs, valid_relevance),
    epochs=100,
    batch_size=32,
    verbose=2,
    callbacks=[checkpointer, earlystopping],
)

Train on 5000 samples, validate on 659 samples
Epoch 1/100
 - 5s - loss: 0.9719 - mean_squared_error: 1.0556 - mean_absolute_error: 0.8057 - val_loss: 0.8513 - val_mean_squared_error: 0.7311 - val_mean_absolute_error: 0.7094

Epoch 00001: val_loss improved from inf to 0.85132, saving model to NN_best_desc+image2desc_1.hdf5
Epoch 2/100
 - 4s - loss: 0.6644 - mean_squared_error: 0.4508 - mean_absolute_error: 0.5290 - val_loss: 0.8214 - val_mean_squared_error: 0.6846 - val_mean_absolute_error: 0.6571

Epoch 00002: val_loss improved from 0.85132 to 0.82141, saving model to NN_best_desc+image2desc_1.hdf5
Epoch 3/100
 - 4s - loss: 0.5112 - mean_squared_error: 0.2685 - mean_absolute_error: 0.3967 - val_loss: 0.7984 - val_mean_squared_error: 0.6471 - val_mean_absolute_error: 0.6317

Epoch 00003: val_loss improved from 0.82141 to 0.79835, saving model to NN_best_desc+image2desc_1.hdf5
Epoch 4/100
 - 4s - loss: 0.4014 - mean_squared_error: 0.1662 - mean_absolute_error: 0.3064 - val_loss: 0.8131 

 - 4s - loss: 0.1534 - mean_squared_error: 0.0265 - mean_absolute_error: 0.1030 - val_loss: 0.8023 - val_mean_squared_error: 0.6550 - val_mean_absolute_error: 0.6079

Epoch 00035: val_loss did not improve from 0.78482
Epoch 36/100
 - 4s - loss: 0.1530 - mean_squared_error: 0.0262 - mean_absolute_error: 0.1020 - val_loss: 0.8065 - val_mean_squared_error: 0.6598 - val_mean_absolute_error: 0.6260

Epoch 00036: val_loss did not improve from 0.78482
Epoch 37/100
 - 4s - loss: 0.1507 - mean_squared_error: 0.0252 - mean_absolute_error: 0.0989 - val_loss: 0.7945 - val_mean_squared_error: 0.6425 - val_mean_absolute_error: 0.6087

Epoch 00037: val_loss did not improve from 0.78482
Epoch 38/100
 - 4s - loss: 0.1520 - mean_squared_error: 0.0256 - mean_absolute_error: 0.1008 - val_loss: 0.7935 - val_mean_squared_error: 0.6410 - val_mean_absolute_error: 0.6097

Epoch 00038: val_loss did not improve from 0.78482
Epoch 39/100
 - 4s - loss: 0.1511 - mean_squared_error: 0.0255 - mean_absolute_error: 0.1

In [None]:
# Restore the weights written by the checkpoint callback during training.
model.load_weights('NN_best_desc+image2desc_1.hdf5')

In [None]:

# Evaluate on the test set; reports the loss followed by the compiled metrics.
model.evaluate([data_d_test,data_q_test,test_image_embeddings],test_relevance)

In [None]:
# Predict on the test set and dump one record per test row:
# (query text, product id, true relevance, predicted relevance).
predictedResilts = model.predict([data_d_test,data_q_test,test_image_embeddings])
# ndarray.item() replaces np.asscalar, which is deprecated and has been removed
# from recent NumPy releases; it returns the same Python scalar.
resDumpRawImage = [
    (
        product_query_test[i],
        product_id_final_test[i],
        test_relevance[i],
        predictedResilts[i].item(),
    )
    for i in range(len(predictedResilts))
]
np.save('resFinal', resDumpRawImage)