### Load

In [1]:
from __future__ import print_function
import os
import csv
import random
import gensim
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt

In [2]:
num_predict = 20

num_train = 8000
num_dev = 2000
num_test = 2000

split_idx = list(range(num_train + num_dev))
random.shuffle(split_idx)

In [20]:
# Description Parsing
ngram_range = (1,1)
min_df = 8
binary = True
norm = None

# ResNet Parsing
intermediate = False

# weighting
w_i = 13 # 23
w_t = 1


### Parse Descriptions and tags


In [4]:
from word_parser import word_parser
w_parser = word_parser(split_idx=split_idx, num_train=num_train, num_dev=num_dev, num_test=num_test, ngram_range=ngram_range, min_df=min_df, binary=binary, norm=norm)
d_train, d_dev, d_test = w_parser.parse_descriptions()
t_train, t_dev, t_test = w_parser.parse_tags()

Built all d matrices!
('d_train shape:', (8000, 2588))
('d_dev shape:', (2000, 2588))
('d_test shape:', (2000, 2588))
Built all t matrices!
('t_train shape:', (8000, 2588))
('t_dev shape:', (2000, 2588))
('t_test shape:', (2000, 2588))


### Parse ResNet Features

In [5]:
from parse_features import *
i_train, i_dev, i_test = parse_features(split_idx, num_train, num_dev, num_test, intermediate=intermediate)

Built all y matrices!
('i_train shape:', (8000, 1000))
('i_dev shape:', (2000, 1000))
('i_test shape:', (2000, 1000))


### Regression

In [17]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV

# Ridge
parameters = {"alpha": [10.0]}
reg = GridSearchCV(Ridge(), parameters, cv=10, verbose=1)

# Ridge
# reg = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0])

reg.fit(i_train, d_train)
# reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   21.1s finished


Trained linear regression model!
Summary of best model:
GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [10.0]}, pre_dispatch='2*n_jobs', refit=True,
       return_train_score='warn', scoring=None, verbose=1)


### Cross-Validation

In [18]:
d_dev_pred = reg.predict(i_dev)
dist_i = cdist(d_dev, d_dev_pred, metric='sqeuclidean')
dist_t = cdist(d_dev, t_dev, metric='sqeuclidean')

In [21]:
from scoring import *

dist_all = dist_i * w_i + dist_t * w_t

dist_idx = np.argsort(dist_all, axis=1)
scoring(dist_idx, num_dev)

('Development MAP@20:', array([0.213]))
('Mean index of true image', 17.899)
('Median index of true image', 5.0)


### Test and write to file

In [9]:
d_train_all = np.concatenate([d_train, d_dev])
i_train_all = np.concatenate([i_train, i_dev])
t_train_all = np.concatenate([t_train, t_dev])

reg.fit(i_train_all, d_train_all)

d_test_pred = reg.predict(i_test)
dist_i = cdist(d_test, d_test_pred, metric='sqeuclidean')
dist_t = cdist(d_test, t_test, metric='sqeuclidean')

dist_all = dist_i * w_i + dist_t * w_t

dist_idx = np.argsort(dist_all, axis=1)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   31.1s finished


In [12]:
from write_to_csv import *
write_to_csv(dist_idx, './regression.csv', num_predict=num_predict, num_test=num_test)

### Show Testing Result

In [None]:
# print images
# x = 8
img_list = [1272, 1854, 332, 1538, 111]

fig, axes = plt.subplots(nrows=1, ncols=5, constrained_layout=False)

# for i in range(5):
for x, i in enumerate(img_list):
    # img = dist_idx[x, i]
    img = plt.imread('data/images_test/' + str(i) + '.jpg')
    axes[x].imshow(img)
    axes[x].set_title(i)
# fig.suptitle('All Images', fontsize=16)
fig.savefig('figures/neural_network.jpg')