## TO DO

- Delete empty images and add them to clean_dataset folder
- Figure out how to shuffle by user, rather than by photo, make comparisons within users
- Figure out how to do proper testing (code to pair up images randomly, predict their score using neural network, and then compare whether the ordinal match was right using our score)

### TO DO LATER
- Figure out how to incorporate regression output
- Rewrite of code

## Create Neural Network

In [1]:
from keras import applications
from keras.layers import Input, Conv2D, MaxPooling2D, Dense, Dropout, Flatten
from keras.utils import np_utils

# Explicit input tensor: 160x160 images with 3 channels
input_tensor = Input(shape=(160, 160, 3))

# VGG16 with ImageNet weights and without its fully-connected top
vgg_model = applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_tensor=input_tensor,
)

# Print the architecture and layer names (used below to pick a layer by name)
vgg_model.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 160, 160, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 160, 160, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 160, 160, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 80, 80, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 80, 80, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 80, 80, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 40, 40, 128)       0         
__________

In [2]:
# Functional-API model class (this is NOT a Sequential() model)
from keras.models import Model

# Name -> layer lookup for the VGG base
layer_dict = {layer.name: layer for layer in vgg_model.layers}

# New small convolutional head, listed in the order it is applied
head_layers = [
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax'),
]

# Start from the output of the last VGG layer we keep and stack the head on it
x = layer_dict['block4_pool'].output
for head_layer in head_layers:
    x = head_layer(x)

custom_model = Model(inputs=vgg_model.input, outputs=x)

# Freeze the first 15 layers so the pre-trained bottom is not trained
frozen_layers = custom_model.layers[:15]
for layer in frozen_layers:
    layer.trainable = False

# Compile after setting trainable flags
custom_model.compile(optimizer='rmsprop',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])

## Train Neural Network

In [2]:
import cv2
import numpy as np
import csv, sqlite3
import math
import os
import random

In [4]:
"""
Create dictionary with:
Key = (filename, alias) tuple
Value = scale_qsc score minus 1
"""

# Open the image database; the cursor is shared by get_filenames() below
con = sqlite3.connect("imagion.db")
cur = con.cursor()

# Column names of the imagion table, read from the cursor description
cur.execute("SELECT * FROM imagion")
table_cols = [column[0] for column in cur.description]

def get_filenames():
    """Return a dict mapping (filename, alias) to scale_qsc minus 1.

    Reads every row of the imagion table through the module-level cursor
    `cur`. The first fetched row is dropped; every remaining row
    contributes one entry (a later duplicate key overwrites an earlier one).
    """
    cur.execute("SELECT filename, alias, scale_qsc FROM imagion")
    rows = cur.fetchall()

    # rows[1:] skips the first fetched row, exactly one row, as before
    return {
        (name, alias): score - 1
        for name, alias, score in rows[1:]
    }

# Build the (filename, alias) -> score lookup once; later cells read it as a global.
files_dict = get_filenames()

In [3]:
"""Split training and test images"""

PERCENT_TRAINING = 0.75
NUM_TEST_USERS = 250  # number of distinct users (aliases) held out for testing

random.seed(10)
# list() gives a real, shuffleable list whether keys() is a list or a view
keys = list(files_dict)
# NOTE: `split` is a per-photo split index; the per-user split below does not use it
split = int(len(keys) * PERCENT_TRAINING)

random.shuffle(keys)

# Each key is a (filename, alias) tuple; collect aliases in shuffled order
seen = [key[1] for key in keys]

# One pass: distinct aliases in first-seen order, plus how many photos each has
unique = []
uni_count = {}
for alias in seen:
    if alias not in uni_count:
        unique.append(alias)
        uni_count[alias] = 0
    uni_count[alias] += 1
print(len(unique))
print(len(seen))

# Number of users, and the smallest / largest photo count per user
print(len(uni_count))
print(min(uni_count.values()))
print(max(uni_count.values()))

# Hold out whole users so no user appears in both train and test
# (raises ValueError if there are fewer than NUM_TEST_USERS users)
test_samp = random.sample(unique, NUM_TEST_USERS)
test_users = set(test_samp)  # O(1) membership instead of scanning the list

test_keys = []
train_keys = []
for filename, alias in keys:
    if alias in test_users:
        test_keys.append(filename)
    else:
        train_keys.append(filename)

print(len(test_keys))
print(len(train_keys))

NameError: name 'files_dict' is not defined

In [17]:
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    The last chunk is shorter when len(l) is not a multiple of n.
    `range` is used instead of `xrange` so this runs on Python 2 and 3.
    """
    for start in range(0, len(l), n):
        yield l[start:start + n]

# def get_train_data(chunk, img_row, img_col):
#     X_train = []
#     Y_train = []
    
#     try:
#         for imgname in chunk:
#             Y_train.append(files_dict[imgname])
#             filename = 'data_images'+'/'+imgname+'.png'
#             img = cv2.imread(filename)
#             img = cv2.resize(img,(img_row,img_col))
#             X_train.append(img)

#         X_train = np.asarray(X_train)
#         Y_train = np.asarray(Y_train)
        
#         return X_train,Y_train

#     except:
#         X_train=None
#         Y_train=None
#         return X_train,Y_train

def get_train_data(chunk, img_row, img_col):
    """Load the images named in `chunk` and their labels from files_dict.

    Args:
        chunk: iterable of image keys. Each is either a bare filename (no
            extension) or a (filename, alias) tuple as used by files_dict.
        img_row: target image height in pixels.
        img_col: target image width in pixels.

    Returns:
        (X_train, Y_train) numpy arrays with matching first dimension.
        Images that have no label, cannot be read, or cannot be resized
        are skipped.
    """
    X_train = []
    Y_train = []
    # Filename -> label index, built lazily the first time a bare filename
    # is not itself a key of files_dict (files_dict may be tuple-keyed).
    labels_by_filename = None

    for imgname in chunk:
        basename = imgname[0] if isinstance(imgname, tuple) else imgname

        # Resolve the label BEFORE touching X_train, so a missing label can
        # never leave the image and label lists out of step.
        if imgname in files_dict:
            label = files_dict[imgname]
        else:
            if labels_by_filename is None:
                labels_by_filename = {}
                for key, score in files_dict.items():
                    if isinstance(key, tuple):
                        labels_by_filename[key[0]] = score
            if basename not in labels_by_filename:
                continue
            label = labels_by_filename[basename]

        img = cv2.imread('data_images' + '/' + basename + '.png')
        if img is None:
            # cv2.imread returns None for a missing or unreadable file
            continue
        try:
            # cv2.resize expects dsize as (width, height)
            img = cv2.resize(img, (img_col, img_row))
        except cv2.error:
            continue

        X_train.append(img)
        Y_train.append(label)

    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    return X_train, Y_train
    
    
    
    
def get_test_data(chunk, img_row, img_col):
    """Load the images named in `chunk` and their labels from files_dict.

    Args:
        chunk: iterable of image keys. Each is either a bare filename (no
            extension) or a (filename, alias) tuple as used by files_dict.
        img_row: target image height in pixels.
        img_col: target image width in pixels.

    Returns:
        (X_test, Y_test) numpy arrays with matching first dimension.
        Images that have no label, cannot be read, or cannot be resized
        are skipped.
    """
    X_test = []
    Y_test = []
    # Filename -> label index, built lazily the first time a bare filename
    # is not itself a key of files_dict (files_dict may be tuple-keyed).
    labels_by_filename = None

    for imgname in chunk:
        basename = imgname[0] if isinstance(imgname, tuple) else imgname

        # Resolve the label BEFORE touching X_test, so a missing label can
        # never leave the image and label lists out of step.
        if imgname in files_dict:
            label = files_dict[imgname]
        else:
            if labels_by_filename is None:
                labels_by_filename = {}
                for key, score in files_dict.items():
                    if isinstance(key, tuple):
                        labels_by_filename[key[0]] = score
            if basename not in labels_by_filename:
                continue
            label = labels_by_filename[basename]

        img = cv2.imread('./data_images' + '/' + basename + '.png')
        if img is None:
            # cv2.imread returns None for a missing or unreadable file
            continue
        try:
            # cv2.resize expects dsize as (width, height)
            img = cv2.resize(img, (img_col, img_row))
        except cv2.error:
            continue

        X_test.append(img)
        Y_test.append(label)

    X_test = np.asarray(X_test)
    Y_test = np.asarray(Y_test)

    return X_test, Y_test

def getTrainData(chunk,nb_classes,img_rows,img_cols):
    """Return a training chunk ready for fitting.

    Loads the chunk with get_train_data, scales pixel values by 1/255 as
    float32, and one-hot encodes the labels into `nb_classes` columns.

    Returns:
        (X_train, Y_train); both are empty arrays when nothing could be loaded.
    """
    X_train, Y_train = get_train_data(chunk, img_rows, img_cols)
    # `is not None` / len() instead of `!= None`, which is elementwise on arrays
    if X_train is not None and len(X_train) > 0:
        # Convert before dividing: in-place `/=` on a uint8 image array
        # cannot hold fractional values.
        X_train = X_train.astype('float32') / 255
    if Y_train is not None:
        Y_train = np_utils.to_categorical(Y_train, num_classes=nb_classes)
    return (X_train, Y_train)

def getTestData(chunk,nb_classes,img_rows,img_cols):
    """Return a test chunk ready for evaluation.

    Loads the chunk with get_test_data, scales pixel values by 1/255 as
    float32, and one-hot encodes the labels into `nb_classes` columns.

    Returns:
        (X_test, Y_test); both are empty arrays when nothing could be loaded.
    """
    X_test, Y_test = get_test_data(chunk, img_rows, img_cols)
    # `is not None` / len() instead of `!= None`, which is elementwise on arrays
    if X_test is not None and len(X_test) > 0:
        # Convert before dividing: in-place `/=` on a uint8 image array
        # cannot hold fractional values.
        X_test = X_test.astype('float32') / 255
    if Y_test is not None:
        Y_test = np_utils.to_categorical(Y_test, num_classes=nb_classes)
    return (X_test, Y_test)

def test(model, nb_epoch, spatial_test_data, nb_classes, img_rows, img_cols):
    """Load the whole held-out set (module-level test_keys) via getTestData.

    `model`, `nb_epoch` and `spatial_test_data` are accepted but not used.
    Returns the (images, labels) pair produced by getTestData.
    """
    images, labels = getTestData(test_keys, nb_classes, img_rows, img_cols)
    return images, labels


In [18]:
num_epochs = 1    # epochs passed to each fit() call on a single chunk
nb_epoch = 50     # number of full passes over train_keys
batch_size = 2
nb_classes = 10
chunk_size = 32   # images loaded from disk per fit() call
img_rows = 160
img_cols = 160
SAVE_EVERY = 100  # checkpoint weights roughly every this many trained images


for e in range(nb_epoch):
    print('-'*40)
    print('Epoch %d' % e)
    print('-'*40)
    print("Training...")
    instance_count = 0
    next_save = SAVE_EVERY

    for chunk in chunks(train_keys, chunk_size):
        X_chunk, Y_chunk = getTrainData(chunk, nb_classes, img_rows, img_cols)

        # Skip chunks where nothing could be loaded. `!= None` on an ndarray
        # is elementwise, so test identity and length instead.
        if X_chunk is None or Y_chunk is None or len(X_chunk) == 0:
            continue

        custom_model.fit(X_chunk, Y_chunk, verbose=1,
                         batch_size=batch_size, epochs=num_epochs)

        # Count the images actually trained on, not the nominal chunk size
        instance_count += len(X_chunk)
        print(instance_count)

        # Save whenever the running count crosses the next SAVE_EVERY mark
        # (an exact `% 100 == 0` test rarely fires with 32-image chunks)
        if instance_count >= next_save:
            custom_model.save_weights('basic_model.h5', overwrite=True)
            next_save = (instance_count // SAVE_EVERY + 1) * SAVE_EVERY
            
                    
            

----------------------------------------
('Epoch', 0)
----------------------------------------
Training...
----------------------------------------
('Epoch', 1)
----------------------------------------
Training...
----------------------------------------
('Epoch', 2)
----------------------------------------
Training...
----------------------------------------
('Epoch', 3)
----------------------------------------
Training...
----------------------------------------
('Epoch', 4)
----------------------------------------
Training...
----------------------------------------
('Epoch', 5)
----------------------------------------
Training...
----------------------------------------
('Epoch', 6)
----------------------------------------
Training...
----------------------------------------
('Epoch', 7)
----------------------------------------
Training...
----------------------------------------
('Epoch', 8)
----------------------------------------
Training...
-------------------------------------

In [19]:
""" SUDO CODE FOR CUSTOM ACCURACY FUNCTION"""

def _predict_image_score(key, img_rows=160, img_cols=160):
    """Return custom_model's predicted class index for one image.

    `key` is a bare filename or a (filename, alias) tuple. Returns None when
    the image file cannot be read.
    """
    imgname = key[0] if isinstance(key, tuple) else key
    img = cv2.imread('data_images' + '/' + imgname + '.png')
    if img is None:
        return None
    # Same preprocessing as the data loaders: resize, then scale by 1/255
    img = cv2.resize(img, (img_cols, img_rows)).astype('float32') / 255
    probabilities = custom_model.predict(np.expand_dims(img, axis=0))[0]
    return int(np.argmax(probabilities))


def test_accuracy(keys, predict_fn=None):
    """Return the fraction of random image pairs whose order is predicted correctly.

    The keys of `keys` are shuffled with a fixed seed and paired off. A pair
    counts as correct when the predicted scores compare the same way
    (higher / lower / equal) as the actual scores.

    Args:
        keys: dict mapping image key -> actual score.
        predict_fn: optional callable taking an image key and returning a
            predicted score, or None if no prediction is possible. Defaults
            to scoring the image with custom_model.

    Returns:
        Correct pairs divided by compared pairs, as a float; 0.0 when no
        pair could be compared.
    """
    if predict_fn is None:
        predict_fn = _predict_image_score

    # Shuffle a real list with a private, seeded generator: reproducible
    # pairing without reseeding the global random state.
    names = list(keys)
    random.Random(10).shuffle(names)

    count = 0
    accuracy = 0

    # Pair element 0 with 1, 2 with 3, ...; an odd leftover key is ignored
    for first, second in zip(names[0::2], names[1::2]):
        predict1 = predict_fn(first)
        predict2 = predict_fn(second)
        if predict1 is None or predict2 is None:
            continue

        # -1 / 0 / +1 ordering of the pair, predicted versus actual
        predicted_order = (predict1 > predict2) - (predict1 < predict2)
        actual_order = ((keys[first] > keys[second])
                        - (keys[first] < keys[second]))

        if predicted_order == actual_order:
            accuracy += 1

        count += 1

    print("%d comparisons made" % count)

    if count == 0:
        return 0.0
    return float(accuracy) / count


# Despite its name this is a metric helper, not a unit test; keep test
# runners that honour __test__ from collecting it.
test_accuracy.__test__ = False

In [30]:
import random

# Draw two keys at random; list() keeps this working where keys() is a
# view rather than a list (random.sample needs a sequence).
random.sample(list(files_dict), 2)


[u'galagonzalez_0', u'fluffypack_8']

In [34]:
# Display every key currently held in files_dict.
files_dict.keys()

[u'uolesporte_1',
 u'archilovers_16',
 u'hairgod_zito_15',
 u'archilovers_14',
 u'archilovers_15',
 u'archilovers_12',
 u'archilovers_13',
 u'archilovers_10',
 u'archilovers_11',
 u'uolesporte_7',
 u'kitkat_ch_0',
 u'vacationwolf_12',
 u'kochifaraj_10',
 u'thefashionguitar_1',
 u'nadiadamaso_ebnl_10',
 u'uolesporte_9',
 u'badasscassfit__13',
 u'badasscassfit__12',
 u'badasscassfit__11',
 u'badasscassfit__10',
 u'badasscassfit__15',
 u'badasscassfit__14',
 u'ashybines_14',
 u'ashybines_15',
 u'ashybines_10',
 u'ashybines_11',
 u'ashybines_12',
 u'ashybines_13',
 u'raffinagita1717_7',
 u'streetsfashions_2',
 u'marioncameleon_3',
 u'hairgod_zito_14',
 u'marioncameleon_1',
 u'marioncameleon_0',
 u'marioncameleon_7',
 u'marioncameleon_6',
 u'marioncameleon_5',
 u'marioncameleon_4',
 u'kochifaraj_16',
 u'marioncameleon_9',
 u'marioncameleon_8',
 u'sonchicc_15',
 u'lioninthewild_15',
 u'riverviiperi_0',
 u'samblacky_9',
 u'samblacky_8',
 u'samblacky_5',
 u'hairgod_zito_10',
 u'samblacky_7',
 