In [1]:
import numpy as np
import pandas as pd
import matplotlib
import os
from keras.layers import Input, BatchNormalization, Concatenate, Dense, GlobalAveragePooling2D, Dropout, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from keras import regularizers
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, History, TensorBoard
from numpy import array, argmax
from pickle import load, dump

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_descriptions(filename):
        # load all descriptions
    dataset = load(open(filename, 'rb'))
    if not dataset:
        print("Fialed to load descriptions")
        sys.exit(1)
    return dataset

# load photo features


def load_photo_features(filename, images):
        # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[str(k)] for k in images}
    return features

In [3]:
train_descriptions = load_descriptions('data/train_descriptions.pkl')
train_features = load_photo_features(
    'data/image_data.pkl', train_descriptions.keys())
print('Photos: train=%d' % len(train_features))

Photos: train=5659


In [4]:
test_descriptions = load_descriptions('data/test_descriptions.pkl')
test_features = load_photo_features(
    'data/image_data.pkl', test_descriptions.keys())
print('Photos: test=%d' % len(test_features))

Photos: test=1544


In [5]:
train_descriptor = np.load('train_descr_100.npy')
test_descriptor = np.load('test_descr_100.npy')

In [63]:
train_product_id = np.load('new_train_product_id.npy')
test_product_id = np.load('new_test_product_id.npy')

In [72]:
train_product_id[0], len(train_features.keys())

(711162629, 5659)

In [62]:
train_product_id.shape, test_product_id.shape, len(train_features), len(test_features)

((5659,), (1544,), 5659, 1544)

In [73]:
train_image_list = []
for i in train_product_id:
    if i in train_features.keys():
        train_image_list.append(train_features[i])
    elif i in test_features.keys():
        train_image_list.append(test_features[i])
    else:
        print('i')
train_image_list = np.array(train_image_list)

In [74]:
len(train_image_list)

5659

In [75]:
test_image_list = []
for i in test_product_id:
    if i in train_features.keys():
        test_image_list.append(train_features[i])
    elif i in test_features.keys():
        test_image_list.append(test_features[i])
    else:
        print('i')
test_image_list = np.array(test_image_list)  

In [77]:
test_image_list.shape

(1544, 1000)

In [7]:
train_product_map = np.load('product_id_final_train.npy')
test_product_map = np.load('product_id_final_test.npy')

In [8]:
train_image_data = []
for i in train_product_map:
    train_image_data.append(train_features[i])
train_image_data = np.array(train_image_data)

In [9]:
test_image_data = []
for i in test_product_map:
    test_image_data.append(test_features[i])
test_image_data = np.array(test_image_data)

In [10]:
train_image_data.shape, test_image_data.shape

((5659, 1000), (1544, 1000))

In [84]:
np.save('train_image_features',train_image_data)
np.save('test_image_features',test_image_data)

In [11]:
train_descriptor.shape, test_descriptor.shape

((5659, 100), (1544, 100))

In [12]:
D = 1000 # embedding dimension
H = 300 # no.of hidden layer nodes

In [13]:
inp1 = Input(shape=( D,))
x1 = Dense(H,activation='relu')(inp1)

x4 = Dense(200,activation='relu')(x1)
x4 = Dense(150,activation='relu')(x4)
x4 = Dense(125,activation='relu')(x4)
#x4 = Dropout(0.5)(x4)
final_x = Dense(100)(x4)

In [14]:
model = Model(inputs=inp1, outputs=final_x)

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               300300    
_________________________________________________________________
dense_2 (Dense)              (None, 200)               60200     
_________________________________________________________________
dense_3 (Dense)              (None, 150)               30150     
_________________________________________________________________
dense_4 (Dense)              (None, 125)               18875     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               12600     
Total params: 422,125
Trainable params: 422,125
Non-trainable params: 0
_________________________________________________________________


In [16]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [17]:
model.compile(loss=root_mean_squared_error,
              optimizer=Adam(lr=0.0001),
              metrics=['mse','mae'])

In [18]:
from sklearn.utils import shuffle
train_descriptor, train_image_data = shuffle(train_descriptor, train_image_data)

In [19]:
# validation

In [20]:
train_valid_split = 5000

In [21]:
valid_descriptor = train_descriptor[train_valid_split:]
valid_image_data = train_image_data[train_valid_split:]

In [22]:
train_descriptor = train_descriptor[:train_valid_split]
train_image_data = train_image_data[:train_valid_split]

In [23]:
checkpointer = ModelCheckpoint(filepath='NN_best_1.hdf5', verbose=1, save_best_only=True, save_weights_only=True)
reduceLR = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0)
earlystopping = EarlyStopping(monitor='val_loss', patience=30)


In [24]:
model.fit(train_image_data,train_descriptor, epochs=100, verbose=2,
          validation_data=(valid_image_data, valid_descriptor), batch_size=16,
                    callbacks=[checkpointer])

Train on 5000 samples, validate on 659 samples
Epoch 1/100
 - 2s - loss: 0.7731 - mean_squared_error: 0.6034 - mean_absolute_error: 0.5862 - val_loss: 0.7320 - val_mean_squared_error: 0.5410 - val_mean_absolute_error: 0.5498

Epoch 00001: val_loss improved from inf to 0.73199, saving model to NN_best_1.hdf5
Epoch 2/100
 - 1s - loss: 0.7498 - mean_squared_error: 0.5666 - mean_absolute_error: 0.5656 - val_loss: 0.7292 - val_mean_squared_error: 0.5369 - val_mean_absolute_error: 0.5475

Epoch 00002: val_loss improved from 0.73199 to 0.72920, saving model to NN_best_1.hdf5
Epoch 3/100
 - 1s - loss: 0.7440 - mean_squared_error: 0.5583 - mean_absolute_error: 0.5614 - val_loss: 0.7228 - val_mean_squared_error: 0.5275 - val_mean_absolute_error: 0.5429

Epoch 00003: val_loss improved from 0.72920 to 0.72276, saving model to NN_best_1.hdf5
Epoch 4/100
 - 1s - loss: 0.7362 - mean_squared_error: 0.5465 - mean_absolute_error: 0.5558 - val_loss: 0.7162 - val_mean_squared_error: 0.5179 - val_mean_abso


Epoch 00032: val_loss did not improve from 0.69482
Epoch 33/100
 - 1s - loss: 0.6939 - mean_squared_error: 0.4849 - mean_absolute_error: 0.5261 - val_loss: 0.6954 - val_mean_squared_error: 0.4879 - val_mean_absolute_error: 0.5263

Epoch 00033: val_loss did not improve from 0.69482
Epoch 34/100
 - 1s - loss: 0.6931 - mean_squared_error: 0.4842 - mean_absolute_error: 0.5258 - val_loss: 0.6954 - val_mean_squared_error: 0.4878 - val_mean_absolute_error: 0.5263

Epoch 00034: val_loss did not improve from 0.69482
Epoch 35/100
 - 1s - loss: 0.6929 - mean_squared_error: 0.4835 - mean_absolute_error: 0.5255 - val_loss: 0.6952 - val_mean_squared_error: 0.4876 - val_mean_absolute_error: 0.5261

Epoch 00035: val_loss did not improve from 0.69482
Epoch 36/100
 - 1s - loss: 0.6921 - mean_squared_error: 0.4828 - mean_absolute_error: 0.5251 - val_loss: 0.6954 - val_mean_squared_error: 0.4879 - val_mean_absolute_error: 0.5264

Epoch 00036: val_loss did not improve from 0.69482
Epoch 37/100
 - 1s - los


Epoch 00068: val_loss did not improve from 0.69482
Epoch 69/100
 - 1s - loss: 0.6788 - mean_squared_error: 0.4636 - mean_absolute_error: 0.5152 - val_loss: 0.6997 - val_mean_squared_error: 0.4939 - val_mean_absolute_error: 0.5306

Epoch 00069: val_loss did not improve from 0.69482
Epoch 70/100
 - 1s - loss: 0.6780 - mean_squared_error: 0.4632 - mean_absolute_error: 0.5149 - val_loss: 0.6988 - val_mean_squared_error: 0.4927 - val_mean_absolute_error: 0.5297

Epoch 00070: val_loss did not improve from 0.69482
Epoch 71/100
 - 1s - loss: 0.6773 - mean_squared_error: 0.4626 - mean_absolute_error: 0.5147 - val_loss: 0.6989 - val_mean_squared_error: 0.4928 - val_mean_absolute_error: 0.5295

Epoch 00071: val_loss did not improve from 0.69482
Epoch 72/100
 - 1s - loss: 0.6773 - mean_squared_error: 0.4621 - mean_absolute_error: 0.5143 - val_loss: 0.6985 - val_mean_squared_error: 0.4922 - val_mean_absolute_error: 0.5291

Epoch 00072: val_loss did not improve from 0.69482
Epoch 73/100
 - 1s - los

<keras.callbacks.History at 0x7f59a79ca1d0>

In [78]:
model.load_weights('NN_best_1.hdf5')

In [79]:
model.evaluate(train_image_data, train_descriptor)



[0.6927539868328676, 0.48734503585261324, 0.5270197433746204]

In [80]:
model.evaluate(test_image_data, test_descriptor)



[0.7122413528397911, 0.5105672197638398, 0.5394385626896675]

In [81]:
train_embedding_image_preds = model.predict(train_image_list)
test_embedding_image_preds = model.predict(test_image_list)

In [82]:
np.save('train_embedding_image_preds',train_embedding_image_preds)
np.save('test_embedding_image_preds',test_embedding_image_preds)

In [83]:
train_embedding_image_preds.shape

(5659, 100)