This notebook contains code to test base models, build ensembles of several models, and train a 2-layer NN on top of the base models' predictions.

## 0. Imports

In [None]:
import bson
import numpy as np
import pandas as pd
import os
import io
from scipy.misc import imread, imresize
from keras.models import Model, load_model
from keras.applications.xception import Xception, preprocess_input
from keras.layers.core import Lambda
from keras.models import Model, Sequential
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, Flatten, Dense, merge
import tensorflow as tf

# Module-level class count; reassigned to 5270 in section 4 of this notebook.
num_classes = 1054

## 1. Utils

In [None]:
def make_parallel(model, gpu_count):
    """Wrap `model` so each batch is split across `gpu_count` GPUs.

    A copy of `model` is called once per GPU inside a `/gpu:<i>` device scope,
    each call receiving one slice (along axis 0) of every model input. The
    per-GPU outputs are concatenated along axis 0 inside a `/cpu:0` scope.

    Args:
        model: Keras model to replicate.
        gpu_count: number of GPUs (and therefore slices) to use.

    Returns:
        A new Keras `Model` with the same inputs as `model`. Its `save` method
        is replaced so that it saves the wrapped `model` instead of the
        multi-GPU wrapper.
    """
    def get_slice(data, idx, parts):
        # Return piece `idx` of `data` cut along axis 0 into `parts` pieces.
        shape = tf.shape(data)
        batch = shape[:1]
        per_part = batch // parts
        first_row = per_part * idx
        if idx == parts - 1:
            # The last piece also takes the remainder; otherwise the trailing
            # `batch % parts` samples would never reach any GPU.
            rows = batch - first_row
        else:
            rows = per_part
        size = tf.concat([rows, shape[1:]], axis=0)
        # Only axis 0 is offset; every other axis starts at 0.
        start = tf.concat([first_row, shape[1:] * 0], axis=0)
        return tf.slice(data, start, size)

    # One list per model output, collecting that output from every GPU.
    outputs_all = [[] for _ in model.outputs]

    # Place a copy of the model on each GPU, each getting a slice of the batch
    for i in range(gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i) as scope:

                inputs = []
                # Slice each input into a piece for processing on this GPU
                for x in model.inputs:
                    # Per-sample shape (batch axis removed) for the Lambda layer.
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_n = Lambda(get_slice, output_shape=input_shape, arguments={'idx':i,'parts':gpu_count})(x)
                    inputs.append(slice_n)
                outputs = model(inputs)

                if not isinstance(outputs, list):
                    outputs = [outputs]

                # Save all the outputs for merging back together later
                for collected, tower_output in zip(outputs_all, outputs):
                    collected.append(tower_output)

    # merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = []
        for outputs in outputs_all:
            merged.append(merge(outputs, mode='concat', concat_axis=0))

        new_model = Model(input=model.inputs, output=merged)
        ## to save initial model
        funcType = type(model.save)
        # monkeypatch the save to save just the underlying model
        def new_save(self_,filepath, overwrite=True):
            model.save(filepath, overwrite)
        new_model.save=funcType(new_save, new_model)
        return new_model

In [None]:
def add_new_last_layer(base_model, nb_classes):
    """Put a global-average-pooling + softmax head on top of `base_model`.

    Args:
        base_model: Keras model whose `output` feeds the new head.
        nb_classes: number of units in the new softmax layer.

    Returns:
        A Keras `Model` from `base_model.input` to the new softmax output.
    """
    x = base_model.output
    x = GlobalAveragePooling2D()(x)  # new global pooling layer
    # Use the argument, not the module-level `num_classes`, so the caller
    # actually controls the size of the head.
    predictions = Dense(nb_classes, activation='softmax')(x)  # new softmax layer
    model = Model(input=base_model.input, output=predictions)
    return model

## 2. Load pretrained models 

In [None]:
# Load the first saved model inside a /gpu:0 device scope. `tf` is handed to
# load_model through custom_objects — presumably because the saved model
# contains Lambda layers that reference `tf` (see make_parallel); TODO confirm.
with tf.device('/gpu:0'):
    main_model = load_model('models/114.h5', custom_objects={'tf':tf})

In [None]:
# Load the second saved model inside a /gpu:1 device scope. `tf` is handed to
# load_model through custom_objects — presumably because the saved model
# contains Lambda layers that reference `tf`; TODO confirm.
with tf.device('/gpu:1'):
    additional_model = load_model('models/85.h5', custom_objects={'tf':tf})

## 3. Test single and additional models combined on train data

In [None]:
def get_image(pic):
    """Decode one picture record into a preprocessed batch holding one image.

    The bytes under `pic['picture']` are decoded with `imread`, resized to
    180x180 with `imresize`, converted to float32, given a leading axis of
    length 1 and passed through `preprocess_input`.
    """
    raw_bytes = io.BytesIO(pic['picture'])
    pixels = np.array(imresize(imread(raw_bytes), (180, 180)), np.float32)
    single_batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(single_batch)

In [None]:
from glob import glob
def indices(folder='./data/files/train'):
    """Map class indices to the names of the sub-directories of `folder`.

    Sub-directories are sorted by path and numbered from 0 in that order.

    Args:
        folder: directory containing one sub-directory per category.

    Returns:
        dict mapping position (int) to sub-directory name (str).
    """
    # The trailing separator in the pattern makes glob match directories only.
    subdirs = sorted(glob(os.path.join(folder, '*', '')))
    # Take the last path component instead of slicing at a fixed offset, so
    # the result does not depend on the length of `folder`.
    categories = [os.path.basename(os.path.normpath(path)) for path in subdirs]
    return dict(enumerate(categories))

In [None]:
import time  # not imported by any earlier cell of this notebook

from keras.preprocessing import image
from tqdm import tqdm

# For every product in train.bson: predict with model1 on the first picture,
# with model2 on the remaining pictures (averaged), and with the sum of both;
# then write all three predictions next to the true category to result.csv.
# NOTE(review): `model1` / `model2` are not defined in the visible part of the
# notebook (section 2 loads `main_model` / `additional_model`) — confirm which
# is which before running.
with open('./data/train.bson', 'rb') as fbson:
    data = bson.decode_file_iter(fbson)
    ids = []
    results = []      # prediction from model1 + model2 combined
    results_1 = []    # prediction from model1 (first picture only)
    results_2 = []    # prediction from model2 (additional pictures), 0 if none
    categories = []
    indices2class = indices()
    start = time.time()
    for d in data:
        ids.append(d['_id'])
        categories.append(d['category_id'])

        # Collect the additional pictures in a list and join them once;
        # np.append inside the loop would re-copy the whole batch per picture.
        additional_imgs = []
        for e, pic in enumerate(d['imgs']):
            if e == 0:
                img_0 = get_image(pic)
            else:
                additional_imgs.append(get_image(pic))

        # predicting on model 2 - additional images
        num_additional = len(additional_imgs)
        if num_additional != 0:
            batch = np.concatenate(additional_imgs, axis=0)
            probabilities_2 = model2.predict(batch)
            # mean probability over the additional pictures
            probabilities_sum_2 = np.sum(probabilities_2, axis=0) / num_additional
            prediction_2 = np.argmax(probabilities_sum_2)
            results_2.append(int(indices2class[prediction_2]))
        else:
            results_2.append(0)

        # predicting on model 1 - single image
        probabilities_1 = model1.predict(img_0)
        prediction_1 = np.argmax(probabilities_1)
        results_1.append(int(indices2class[prediction_1]))

        # predicting on combined model 1&2
        if num_additional != 0:
            probabilities = probabilities_1 + probabilities_sum_2
        else:
            probabilities = probabilities_1

        prediction = np.argmax(probabilities)
        results.append(int(indices2class[prediction]))

    result = pd.DataFrame(np.column_stack((ids, results_1, results_2, results, categories)), dtype = 'int32')
    result.columns = ['_id', 'predict_1', 'predict_2', 'predict', 'category_id']
    result.to_csv("result.csv", index = False)
    print("saved")
    print("done")
    # elapsed = now - start (the reverse order printed a negative number)
    print("{} seconds passed".format(time.time() - start))

## 4. Test models from bson file

In [None]:
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Activation
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing import image 
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import load_img, img_to_array
from keras.optimizers import RMSprop, SGD
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
from matplotlib import pyplot as plt
from PIL import Image
import random, csv, bson, io, tqdm, time
import pandas as pd
import numpy as np 
from scipy.misc import imread, imresize
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Class count for this section; replaces the value 1054 assigned in section 0.
num_classes = 5270

# Not referenced anywhere else in the visible part of this notebook; presumably
# the model input size in pixels (the code below hard-codes 180 directly).
img_width = 180
img_height = 180

def make_category_tables():
    """Build the two lookup tables between category ids and category indices.

    Reads the module-level `categories_df`. For every row, the row index
    (tuple position 0) is taken as the category id and tuple position 4
    (the fourth data column) as the category index.

    Returns:
        (cat2idx, idx2cat): dicts mapping id -> index and index -> id.
    """
    pairs = [(row[0], row[4]) for row in categories_df.itertuples()]
    cat2idx = {cat_id: cat_idx for cat_id, cat_idx in pairs}
    idx2cat = {cat_idx: cat_id for cat_id, cat_idx in pairs}
    return cat2idx, idx2cat

# Load the category table, using the first CSV column as the DataFrame index
# (make_category_tables reads that index as the category id).
categories_df = pd.read_csv("categories.csv", index_col=0)
# Module-level lookup tables used by getId and the idx2indices mapping below.
cat2idx, idx2cat = make_category_tables()

In [None]:
def preprocess_image(x):
    """Return a preprocessed float32 copy of image array `x`.

    The result of `preprocess_input` is returned explicitly. The previous
    version returned its argument and therefore only worked when
    `preprocess_input` happened to modify that array in place.

    Args:
        x: image array accepted by `preprocess_input`.

    Returns:
        The array returned by `preprocess_input`; `x` itself is left untouched.
    """
    # Copy first: `preprocess_input` may overwrite a float array in place, and
    # callers reuse `x` after this call.
    return preprocess_input(np.array(x, dtype=np.float32))

In [None]:
def crop_image(x, n = 4):
    """Return `n` random 180x180 crops of `x`, each preprocessed.

    `x` is resized to 224x224 once and every crop is cut from that resized
    image at an independently drawn random offset.

    Args:
        x: image array with (rows, columns, channels) layout.
        n: number of crops to produce.

    Returns:
        list of `n` arrays as returned by `preprocess_image`.
    """
    output_size = (180, 180)
    crop_rows, crop_cols = output_size

    # Resize once, outside the loop. The source image must not be overwritten
    # by a crop, otherwise each later crop is taken from the previous,
    # already-preprocessed crop.
    resized = imresize(x, (224, 224))
    rows, cols = resized.shape[:2]

    result = []
    for _ in range(n):
        # Column offset is drawn first, then row offset (same draw order as before).
        left = random.randint(0, cols - crop_cols)
        top = random.randint(0, rows - crop_rows)
        crop = resized[top:top + crop_rows, left:left + crop_cols, :]
        result.append(preprocess_image(np.array(crop, np.float32)))
    return result

In [None]:
def getId(probabilities):
    """Return the category id of the highest-probability entry.

    The arg-max position of `probabilities` is looked up in the module-level
    `idx2cat` table and the result is converted to `int`.
    """
    return int(idx2cat[np.argmax(probabilities)])

In [None]:
def idx2indices():
    """Placeholder with no effect; returns None.

    NOTE(review): the next cell rebinds the name `idx2indices` to a dict, so
    this function is unreachable afterwards — it looks like an abandoned stub.
    """
    # Bare expression statements: the two globals are looked up (NameError if
    # either is undefined) and the values are discarded.
    idx2cat
    indices2class

In [None]:
# Translation table: category index (from cat2idx) -> position of the same
# category in indices2class (the directory-order numbering from indices()).
idx2indices = {
    cat2idx[int(class_name)]: position
    for position, class_name in indices2class.items()
}

In [None]:
def indices2idxprob(probs):
    """Reorder `probs` from directory order into category-index order.

    Entry `t` of the result is `probs[idx2indices[t]]`, using the module-level
    `idx2indices` table. The result is a new array with the shape of `probs`.
    """
    reordered = np.empty(probs.shape)
    for target in range(len(probs)):
        source = idx2indices[target]
        reordered[target] = probs[source]
    return reordered

In [None]:
from keras.preprocessing import image
from tqdm import tqdm 
import time
import pdb

n = 4  # number of random crops requested from crop_image per picture

# Run main_model on every picture (plus its random crops) and additional_model
# on every picture except the first one, for a small window of products.
with open('./data/train.bson', 'rb') as fbson:
    data = bson.decode_file_iter(fbson)
    ids = []
    results = []
    indices2class = indices()
    for c, d in tqdm(enumerate(data)):
        # Only products with index 5100..5151 are processed.
        if c < 5100:
            continue
        category = d['category_id']
        main_batch = []
        additional_batch = []
        for e, pic in enumerate(d['imgs']):
            img = load_img(io.BytesIO(pic['picture']), target_size = (180, 180))
            x = img_to_array(img)
            # Crop from the raw pixels before preprocessing anything.
            croped_imgs = crop_image(x, n)
            # Preprocess exactly once and share the result. Calling
            # preprocess_image(x) a second time could scale the same array
            # twice if preprocessing happens in place.
            processed = preprocess_image(x)
            main_batch.append(processed)
            main_batch.extend(croped_imgs)
            if e != 0:
                additional_batch.append(processed)
        prob_main = main_model.predict(np.array(main_batch))
        if additional_batch:
            prob_additional = additional_model.predict(np.array(additional_batch))
        print("")
        print(str(category) + "actual")
        print("")
        # Average the predictions over all pictures/crops of the product.
        prob_main_mean = np.mean(prob_main, axis = 0)
        if additional_batch:
            prob_additional_mean = np.mean(prob_additional, axis = 0)
        if c > 5150:
            break
    # NOTE(review): nothing in this cell appends to `ids` or `results`, so the
    # table written here is empty and replaces any existing result.csv —
    # confirm whether the averaged predictions were meant to be recorded.
    result = pd.DataFrame(np.column_stack((ids, results)), dtype = 'int32')
    result.columns = ['_id', 'category_id']
    result.to_csv("result.csv", index = False)
    print("saved")

## 5. Test one model from batches with multi-cropping and averaging

In [None]:
def get_image_new(pic):
    """Decode one picture record into a preprocessed 180x180 image array.

    The bytes under `pic['picture']` are loaded with `load_img` at a target
    size of 180x180, converted with `img_to_array` and passed through
    `preprocess_input`.
    """
    stream = io.BytesIO(pic['picture'])
    pixels = img_to_array(load_img(stream, target_size = (180, 180)))
    return preprocess_input(pixels)

In [None]:
# Globals read by predict(): the two models and the generator batch size.
# NOTE(review): `model1` / `model2` are not defined in the visible part of the
# notebook (section 2 loads `main_model` / `additional_model`) — confirm.
model_additional = model2
model_single = model1
batch_size = 256

# Stream products from train.bson, buffer their images, and call predict()
# each time batch_size*20 products have been collected.
with open('./data/train.bson', 'rb') as fbson:
    data = bson.decode_file_iter(fbson)
    single = []          # first image of every buffered product
    additional = []      # all non-first images of the buffered products, in order
    # additional_images[p] is the number of entries in `additional` that belong
    # to buffered products before product p; product p therefore owns
    # additional[additional_images[p]:additional_images[p + 1]].
    additional_images = {}
    additional_images[0] = 0
    categories = []
    batch = []
    m = 0
    n = 0 # product index in a batch
    k = 0 # additional_images index in a batch
    start_time = time.time()
    for c, d in enumerate(data):  
        _id = d['_id']
        category = d['category_id']
        pics = d['imgs']
        categories.append(category)
        
        for e, pic in enumerate(d['imgs']):
            if e == 0:
                single.append(get_image_new(pic))
            else:
                additional.append(get_image_new(pic))
                k += 1
        #print("First --- %s seconds ---" % (time.time() - start_time))
        #start_time = time.time()
        additional_images[n + 1] = k
        n += 1
        #print("Second --- %s seconds ---" % (time.time() - start_time))
        if n == batch_size*20:
            # NOTE(review): predict() is defined in the next cell, so that cell
            # has to be executed before this one. predict() also reads the
            # globals `n` and `batch_size` set here.
            result = predict(np.array(single), np.array(additional), additional_images, categories)
            #encode Y
            # NOTE(review): `encoder` is not defined in the visible part of the
            # notebook — presumably a fitted LabelEncoder; confirm.
            encoded_Y = encoder.transform(categories)
            categories_one_hot = to_categorical(encoded_Y, num_classes = 5270)

            # Reset the buffers for the next chunk.
            # NOTE(review): `result` and `categories_one_hot` are not used
            # before being discarded/overwritten, and products left in the
            # buffers when the file ends are never passed to predict().
            k = 0
            n = 0
            additional_images = {}
            additional_images[0] = 0
            single = []
            additional = []
            result = []
            categories = []

        if c % 14000 == 0:
            #print("Generating batch  --- %s seconds ---" % (time.time() - start_time))
            #start_time = time.time()
            # `%` binds tighter than `+`, so only the last literal is formatted
            # ("%%" becomes "%"). 7000000 is presumably the approximate number
            # of products in the file — TODO confirm.
            print(str(c) + " products done. " + str(c/7000000*100) + "%%. --- %s seconds ---" % (time.time() - start_time))

In [None]:
def predict(single_data, additional_data, additional_images, categories):
    """Predict with both models and build one fixed-length row per product.

    Every row holds `4 * num_classes` values: the `model_single` probabilities
    for the product's first image, then the `model_additional` probabilities
    of its additional images one after another, then zeros as padding.

    Reads the module-level `batch_size`, `num_classes`, `model_single` and
    `model_additional`.

    Args:
        single_data: array with the first image of every product.
        additional_data: array with all additional images, ordered by product.
        additional_images: mapping where product `i` owns the rows
            `additional_images[i]:additional_images[i + 1]` of `additional_data`.
        categories: unused; kept so existing callers keep working.

    Returns:
        list with one list of `4 * num_classes` numbers per product.
    """
    max_images = 4  # images per product that fit into one row
    row_length = max_images * num_classes
    num_single = len(single_data)
    num_additional = len(additional_data)
    if num_single == 0:
        return []

    datagen = ImageDataGenerator()

    def run_model(model, images):
        # Round the step count up so the final partial batch is predicted too;
        # floor division would return fewer predictions than images.
        steps = (len(images) + batch_size - 1) // batch_size
        generator = datagen.flow(images, batch_size=batch_size, shuffle=False)
        return model.predict_generator(generator, steps=steps, workers=16)

    # predicting on additional model (all images after the first one);
    # the generator must be fed `additional_data`, not `single_data`.
    if num_additional:
        probabilities_additional = run_model(model_additional, additional_data)
    else:
        probabilities_additional = np.empty((0, num_classes))
    # predicting on single model (first image)
    probabilities_single = run_model(model_single, single_data)

    # compose resulting rows; iterate over the products actually passed in
    # rather than over the caller's global counter
    result = []
    for i in range(num_single):
        first, last = additional_images[i], additional_images[i + 1]
        probabilities = probabilities_single[i, :].tolist()
        probabilities += probabilities_additional[first:last, :].flatten().tolist()
        # adding zeros at the end of every row if we have less than 4 photos
        probabilities += [0] * (row_length - len(probabilities))

        assert len(probabilities) == row_length
        result.append(probabilities)
    return result