## TO DO

- Delete empty images and add them to clean_dataset folder [DONE]
- Figure out how to shuffle by user, rather than by photo, make comparisons within users [DONE]
- Figure out how to do proper testing (code to pair up images randomly, predict their score using neural network, and then compare whether the ordinal match was right using our score) [DONE]

- Change training code to pull only images from new training set

### TO DO LATER
- Figure out how to incoroporate regression output
- Rewrite of code

## Create Neural Network

In [16]:
from keras import applications, optimizers
from keras.layers import Input, Conv2D, MaxPooling2D, Dense, Dropout, Flatten
from keras.utils import np_utils

# If you want to specify input tensor
input_tensor = Input(shape=(160, 160, 3))
vgg_model = applications.VGG16(weights='imagenet',
                               include_top=False,
                               input_tensor=input_tensor)

# To see the models' architecture and layer names, run the following
vgg_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 160, 160, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 160, 160, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 160, 160, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 80, 80, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 80, 80, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 80, 80, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 40, 40, 128)       0         
__________

In [17]:
# Creating dictionary that maps layer names to the layers
layer_dict = dict([(layer.name, layer) for layer in vgg_model.layers])

# Make sure that the pre-trained bottom layers are not trainable
for layer in vgg_model.layers:
    layer.trainable = False

# Getting output tensor of the last VGG layer that we want to include
x = layer_dict['block5_pool'].output

# Stacking a new simple convolutional network on top of it    
x = Conv2D(filters=64, kernel_size=(3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(256, activation='relu')(x)
x = Dense(1)(x)

# Creating new model. Please note that this is NOT a Sequential() model.
from keras.models import Model
model = Model(inputs=vgg_model.input, outputs=x)

# Custom Optimizer    
opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=0.1, decay=1e-6)

# Do not forget to compile it
model.compile(loss='mse',
                     optimizer=opt,
                     metrics=['accuracy'])

In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 160, 160, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 160, 160, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 160, 160, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 80, 80, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 80, 80, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 80, 80, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 40, 40, 128)       0         
__________

## Train Neural Network

In [5]:
import cv2
import numpy as np
import csv, sqlite3
import math
import os
import random

In [6]:
"""
Create dictionary with:
Key = filename root
Value = score
"""

con = sqlite3.connect("imagion.db")
cur = con.cursor()

table_cols = [i[0] for i in cur.execute("SELECT * FROM slimscores").description]

def get_filenames(): 
    files_dict = {}
    
    cur.execute("SELECT filename, norm_score FROM slimscores")
    
    count = 0
    
    for file_, score in cur.fetchall():
        if count == 0:
            count += 1
            continue
        
        files_dict[file_] = score
            
        count += 1 
        
    return files_dict

files_dict = get_filenames()

In [7]:
"""
Create dictionary with:
Key = alias
Value = list of filenames for user
"""

DATASET_DIR = 'data_images'

def create_user_dict(dataset_dir):
    user_dict = {}
    
    for filename in os.listdir(dataset_dir):
        filename = filename.rsplit('.', 1)[0]
        alias = filename.rsplit('_', 1)[0]
        
        # do not include outliers
        if filename not in files_dict.keys():
            continue
            
        if alias not in user_dict:
            user_dict[alias] = [filename]
        else:
            user_dict[alias].append(filename)
            
    return user_dict

user_dict = create_user_dict(DATASET_DIR)        

In [8]:
"""Split training and test images"""

PERCENT_TRAINING = 0.75 

random.seed(10)
keys = user_dict.keys()

split = int(len(user_dict.keys()) * PERCENT_TRAINING)

random.shuffle(keys) # revisit this shuffle function

train_users = keys[:split]
test_users = keys[split:]

train_keys = []
test_keys = []

for user in train_users:
    for filename in user_dict[user]:
        train_keys.append(filename)
        
for user in test_users:
    for filename in user_dict[user]:
        test_keys.append(filename)
        
random.shuffle(train_keys)
random.shuffle(test_keys)

In [9]:
def chunks(l, n):
    """Yield successive n-sized chunks from l"""
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

def get_train_data(chunk, img_row, img_col):
    X_train = []
    Y_train = []
    
    for imgname in chunk:
        try:
            filename = 'data_images'+'/'+imgname+'.png'
            img = cv2.imread(filename)
            img = cv2.resize(img,(img_row,img_col))
            X_train.append(img)
            Y_train.append(files_dict[imgname])
        except: 
            continue
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    return X_train,Y_train
    
def get_test_data(chunk, img_row, img_col):
    X_test = []
    Y_test = []
    
    for imgname in chunk:
        try:
            filename = './data_images'+'/'+imgname+'.png'
            img = cv2.imread(filename)
            img = cv2.resize(img,(img_row,img_col))
            X_test.append(img)
            Y_test.append(files_dict[imgname])
        except:
            continue
    X_test = np.asarray(X_test)
    Y_test = np.asarray(Y_test)

    return X_test,Y_test

def getTrainData(chunk,img_rows,img_cols):
    X_train,Y_train = get_train_data(chunk,img_rows,img_cols)
    if (X_train is not None and Y_train is not None):
        X_train/=255
    return (X_train,Y_train)

def getTestData(chunk,img_rows,img_cols):
    X_test,Y_test = get_test_data(chunk,img_rows,img_cols)
    if (X_test is not None and Y_test is not None):
        X_test/=255
    return (X_test,Y_test)

def test(model, nb_epoch, spatial_test_data, img_rows, img_cols):
    X_test,Y_test = getTestData(test_keys,img_rows,img_cols)
    return (X_test, Y_test)


In [10]:
num_epochs = 1
nb_epoch = 100
batch_size = 2
chunk_size = 32
img_rows = 160
img_cols = 160

In [18]:
for e in range(nb_epoch):
    print('-'*40)
    print 'Epoch', e
    print('-'*40)
    print "Training..."
    instance_count=0


    for chunk in chunks(train_keys, chunk_size):
        X_chunk,Y_chunk=getTrainData(chunk,img_rows,img_cols)

        if (X_chunk is not None and Y_chunk is not None):
            loss = model.fit(X_chunk, Y_chunk, verbose=1, batch_size=batch_size, epochs=num_epochs)
            instance_count+=chunk_size
            
            print "Instance Count:", instance_count
            
            if instance_count%64==0:
                model.save_weights('vgg_model.h5',overwrite=True)

----------------------------------------
Epoch 0
----------------------------------------
Training...
Epoch 1/1
Instance Count: 32
Epoch 1/1
Instance Count: 64
Epoch 1/1
Instance Count: 96
Epoch 1/1
Instance Count: 128
Epoch 1/1
Instance Count: 160
Epoch 1/1

KeyboardInterrupt: 

## Load Weights

In [73]:
weights_path = 'vgg_model.h5'

if weights_path:
    model.load_weights(weights_path)

In [None]:
""" SUDO CODE FOR CUSTOM ACCURACY FUNCTION"""

DATASET_DIR = 'data_images'

def test_accuracy():
    count = 0
    accuracy = 0
    
    for user in test_users:
        user_imgs = user_dict[user]
        
        if len(user_imgs) > 1:
           # get two elements from shuffled dictionary
            element1 = user_imgs.pop(0)
            element2 = user_imgs.pop(1)
            
            # read images
            img1 = cv2.imread(os.path.join(DATASET_DIR, element1+'.png'))
            img2 = cv2.imread(os.path.join(DATASET_DIR, element2+'.png'))
            
            #resize images
            img1 = cv2.resize(img1,(img_rows,img_cols))
            img2 = cv2.resize(img2,(img_rows,img_cols))
            
            # expand dimension
            img1 = np.expand_dims(img1, axis=0)
            img2 = np.expand_dims(img2, axis=0)
        
            # predict score for each image
            predict1 = np.argmax(model.predict(img1))
            predict2 = np.argmax(model.predict(img2))
            
            # compare whichever predicted score is higher
            
            # comparison dict
            if predict1 > predict2:
                max_predict = 'a'
            elif predict1 < predict2:
                max_predict = 'b'
            else:
                max_predict = 'equal'
            
            # compare whichever "score" is higher
            if files_dict[element1] > files_dict[element2]:
                max_actual = 'a'
            elif files_dict[element1] < files_dict[element2]:
                max_actual = 'b'
            else:
                max_actual = 'equal'

            # check if comparisons match 
            if max_predict == max_actual:
                accuracy +=1

            count += 1
            
            print max_predict
            print max_actual
            print max_predict == max_actual
            print '\n'
            
        else:
            continue
    
    accuracy_per = accuracy / count
    
    print count, "comparisons made"
    print "Accuracy score is:", accuracy_per
    
    return

test_accuracy()

## Predict Test

In [31]:
img_name = 'xostylistxo_11'

img = cv2.imread('data_images/'+img_name+'.png')
img = cv2.resize(img,(160,160))
img = img/255
img = np.expand_dims(img, axis=0)

print model.predict(img, verbose=1)

[[ 0.41077155]]


In [20]:
img2 = cv2.imread('data_images/'+img_name+'.png')
img2 = cv2.resize(img2,(160,160))
img2 = img2/255
img2 = np.expand_dims(img2, axis=0)

print model.predict(img2, verbose=1)

[[ 0.47010535]]
