In [1]:
%matplotlib inline

import dill

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt
import random
from sklearn.neural_network import MLPClassifier

In [44]:
dill.load_session('nearest_neighbor.db')

In [45]:
# constants
n_train = 10000
n_test = 2000
top_k = 10
n_predict = 20

In [46]:
# load features
features_train_ff = pd.read_csv('data/features_train/features_resnet1000_train.csv', delimiter=',', index_col=0, header=None)
features_test_ff = pd.read_csv('data/features_test/features_resnet1000_test.csv', delimiter=',', index_col=0, header=None)

features_train_ff.index = features_train_ff.index.str.lstrip('images_train/').str.rstrip('.jpg')
features_train_ff.index = pd.to_numeric(features_train_ff.index, errors='coerce')
features_train_ff.sort_index(inplace=True)

features_test_ff.index = features_test_ff.index.str.lstrip('images_test/').str.rstrip('.jpg')
features_test_ff.index = pd.to_numeric(features_test_ff.index, errors='coerce')
features_test_ff.sort_index(inplace=True)

In [47]:
# word preprocessing
import re
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords

class Preprocess(object):
    def __init__(self):
        self.stemmer = EnglishStemmer()
        # self.stopList = set(stopwords.words("english"))
        self.stopList = set([word.encode('ascii', 'ignore') for word in stopwords.words('english')])
        
    def preprocess(self, string, n_gram=1):
        
        # replace special character with space
        string = re.sub(r'[^a-zA-Z0-9 ]', r' ', string).encode('ascii', 'ignore')
        
        # Lemmatization (handles capitalization), ignoring stop word
        # turn output to ASCII and ignore special character
        ans = [self.stemmer.stem(word).encode('ascii', 'ignore') for word in string.split()]
        ans = [word for word in ans if word not in self.stopList]
        
        return ans

In [48]:
# load descriptions
descriptions_train = [set()] * n_train
processor = Preprocess()
for i in range(n_train):
    with open('data/descriptions_train/' + str(i) + '.txt') as f:
        words = f.read() # readlines()
        descriptions_train[i] = processor.preprocess(words)

descriptions_test = [set()] * n_test
for i in range(n_test):
    with open('data/descriptions_test/' + str(i) + '.txt') as f:
        words = f.read() # readlines()
        descriptions_test[i] = processor.preprocess(words)

In [49]:
# get bag of words features
def BagofWords(train, test):
    bag = set()
    for words in train:
        bag |= set(words)
    bag = list(bag)
    bag_idx = {x:i for i, x in enumerate(bag)}
    print(len(bag))
    # print(bag)
    
    # create feature vectors
    train_features = np.zeros((len(train), len(bag)))
    test_features = np.zeros((len(test), len(bag)))

    data = [train, test]
    features = [train_features, test_features]
    # '''
    for k in [0,1]:
        print('train/test: ', k)
        for i in xrange(len(data[k])):
            # if i%500 == 0: print(k, i)
            for word in data[k][i]:
                try:
                    features[k][i, bag_idx[word] ] += 1
                except KeyError: pass
    # '''
    return train_features, test_features, bag, bag_idx

train_features, test_features, bag, bag_idx = BagofWords(descriptions_train, descriptions_test)

6409
('train/test: ', 0)
('train/test: ', 1)


In [50]:
# post-process: L2 normalization
from sklearn.preprocessing import normalize
train_features = normalize(train_features, norm='l2', axis=1)
test_features = normalize(test_features, norm='l2', axis=1)

In [51]:
#Positive Examples
pos = [0 for i in range(10000)]
for i in range(10000):
    pos[i] = np.concatenate((features_train_ff.values[i],train_features[i]))

In [52]:
#Negative Examples
neg = [0 for i in range(10000)]
for i in range(10000):
    j = random.randint(0,10000-1)
    while j == i:
        j = random.randint(0,10000-1)
    neg[i] = np.concatenate((features_train_ff.values[i],train_features[j]))

In [53]:
#Lables for Neural Network
y_1 = np.ones(10000)
y_0 = np.zeros(10000)
y = np.concatenate((y_1,y_0))
print(y.shape)

(20000,)


In [54]:
#X data for Neural Network
X = np.concatenate((pos,neg))

In [55]:
clf = MLPClassifier(solver='adam', hidden_layer_sizes=(1000,100), max_iter = 100, random_state=1)

In [None]:
clf.fit(X,y)
#3:55

In [None]:
# ans = clf.predict_proba([np.concatenate((features_train_ff.values[0], train_features[10])) ] )
# ans

In [None]:
# ans[0, 1]

In [None]:
# don't run me
#ans  = [[0 for i in range(2000)] for j in range(2000)]
#for i in range(2000):
    #for j in range(2000):
        # ans[i][j] = clf.predict_proba([np.concatenate((features_test_ff.values[i], test_features[j])) ] )[0, 1]
        

In [None]:
#test_vector = []
#for i in range(2000):
    #for j in range(2000):
        #test_vector.append(np.concatenate((features_test_ff.values[i], test_features[j])))

In [None]:
ans_1 = []
for i in range(2000):
    test_vector = []
    for j in range(2000):
        test_vector.append(np.concatenate((features_test_ff.values[i], test_features[j])))
    ans_1.append( clf.predict_proba(test_vector) )

In [None]:
ans = np.asarray(ans_1)
ans = ans[:,:,1]
ans = np.transpose(ans)
print(ans.shape)
ans_sort = np.sort(ans, axis=1)[:, ::-1]
predict_neural = np.argsort(ans, axis=1)[:, ::-1]
print(predict_neural.shape)
predict_neural = predict_neural[:, :n_predict]
predict_neural.shape

In [None]:
print(ans_sort)

In [None]:
def scoring(predict, label):
    print(len(predict), len(label))
    assert(len(predict) == len(label))
    score = 0
    for i in range(len(predict)):
        try:
            idx = predict[i].index(label[i])
            score += (21 - idx) / 20
        except ValueError:
            print(label[i], predict[i])
            pass
    score /= len(predict)
    print(score)
    return score

In [None]:
# convert prediction to '0.jpg'
test_predict_str = [None] * n_test
for i in range(n_test):
    res = ' '.join([str(int(x)) + '.jpg' for x in predict_neural[i]])
    test_predict_str[i] = res # ' '.join([str(int(x)) + '.jpg' for x in test_predict[i]])

In [None]:
# write to csv
df = pd.DataFrame(data=test_predict_str)
df.index = [str(x) + '.txt' for x in range(n_test)]
df.to_csv('./neural_net.csv', mode='w', index=True, index_label='Descritpion_ID', header=['Top_20_Image_IDs'])
