In [17]:
import gensim
import pandas as pd

# curl -o GoogleNews-vectors-negative300.bin.gz "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [18]:
import pandas as pd
import sklearn
import nltk, string

from nltk.corpus import stopwords

stop_words = set([word.translate(str.maketrans('', '', string.punctuation))
                  for word in stopwords.words('english')])

def get_sanitized_descriptions(folder, count):
    sanitized_descriptions = []
    for i in range(0,count):
        with open('%s/%s.txt' % (folder, i)) as f:
            descriptions = ' '.join(f.read().strip().split('\n'))
            descriptions = descriptions.translate(str.maketrans('', '', string.punctuation)).lower()
            all_words = descriptions.split(' ')
            all_words = ' '.join([a for a in all_words if a and a not in stop_words])
            sanitized_descriptions.append(all_words)
    return sanitized_descriptions
                  
train_descriptions = get_sanitized_descriptions('descriptions_train', 10000)
test_descriptions = get_sanitized_descriptions('descriptions_test', 2000)

In [19]:
all_dfs = []
for desc in train_descriptions:
    vecs = [word2vec.get_vector(w) for w in desc.split(' ') if w in word2vec.vocab]
    all_dfs.append(pd.DataFrame(vecs).mean().to_frame().T)
    
train_df = pd.concat(all_dfs, ignore_index=True)

In [20]:
all_dfs = []
for desc in test_descriptions:
    vecs = [word2vec.get_vector(w) for w in desc.split(' ') if w in word2vec.vocab]
    all_dfs.append(pd.DataFrame(vecs).mean().to_frame().T)
    
test_df = pd.concat(all_dfs, ignore_index=True)

In [248]:
# get the train resnet features and sort it by file_id 0 to n-1
train_features_df = pd.read_csv('features_train/features_resnet1000intermediate_train.csv', header=None)

train_features_df['file_id'] = train_features_df[0].apply(lambda x: int(x.split("/")[1].split(".")[0]))
train_features_df = train_features_df.sort_values(by=['file_id'])
train_features_df = pd.DataFrame(train_features_df.drop([0, 'file_id'], axis=1).values)

In [249]:
# get the test resnet features and sort it by file_id 0 to n-1
test_features_df = pd.read_csv('features_test/features_resnet1000intermediate_test.csv', header=None)

test_features_df['file_id'] = test_features_df[0].apply(lambda x: int(x.split("/")[1].split(".")[0]))
test_features_df = test_features_df.sort_values(by=['file_id'])
test_features_df = pd.DataFrame(test_features_df.drop([0, 'file_id'], axis=1).values)

In [250]:
from sklearn.decomposition import PCA
pca = PCA(n_components=40)
principalComponents = pca.fit_transform(train_features_df)
principalComponentsTest = pca.fit_transform(test_features_df)
principalDf = pd.DataFrame(data = principalComponents)
principalDfTest = pd.DataFrame(data = principalComponentsTest)

In [251]:
# project it randomly down to N features
# Increasing N will increase accuracy but will also increase time and resource usage
# import numpy as np
# N = 100
# rand_proj_df = pd.DataFrame(np.random.randn(1000, N))
# train_features_df = train_features_df.dot(rand_proj_df)
# test_features_df = test_features_df.dot(rand_proj_df)

train_features_df= principalDf
test_features_df = principalDfTest

In [252]:
# create datasets for cross validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df, train_features_df, test_size=0.2, random_state=42)

In [253]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 300)
(2000, 300)
(8000, 40)
(2000, 40)


In [254]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: need to play around with different alpha values
parameters = {"alpha": [1e-3, 1e-2, 1e-1, 1,1.5, 1.6, 1.7, 1.8, 1.9, 1.0, 2.0, 2.1, 5.0, 10.0]}

# alternate model
# reg = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X_train, y_train)
#reg = Lasso(alpha=1.0, fit_intercept=False, max_iter=1000)
reg = GridSearchCV(Ridge(), parameters, cv=10)
# reg = RandomForestRegressor(max_depth=21, n_estimators = 10 )

reg.fit(X_train, y_train)

#print(reg.best_estimator_)

















GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 1.5, 1.6, 1.7, 1.8, 1.9,
                                   1.0, 2.0, 2.1, 5.0, 10.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [255]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
9254,-0.134814,0.071277,0.070613,0.241917,-0.064312,-0.021846,0.112130,-0.120014,0.047124,0.168535,...,-0.132151,-0.070518,-0.178944,0.096851,0.223544,-0.029681,0.071548,0.011842,0.021107,-0.012435
1561,0.017210,0.050415,0.002243,0.039687,0.014022,-0.076203,-0.004041,-0.002799,0.100820,0.044364,...,-0.144972,0.042603,-0.067371,0.102861,-0.015256,-0.007908,0.037918,0.004917,-0.038465,-0.025766
1670,0.071973,0.046324,-0.059443,0.143150,-0.099685,-0.029737,-0.057013,-0.069565,-0.025018,0.042124,...,-0.138843,0.066073,-0.122126,0.075406,0.023651,0.023336,0.147047,-0.032822,0.050400,0.023963
6087,0.126829,0.057056,0.033933,0.036872,-0.007634,-0.065653,-0.014768,-0.071070,0.054724,0.106521,...,-0.090368,0.016135,-0.103212,-0.023135,-0.008181,-0.047793,0.046606,-0.075950,-0.013029,-0.056858
6669,0.084866,0.058632,-0.000573,0.061969,-0.025059,-0.138468,0.056932,-0.191198,0.099408,0.181222,...,-0.037149,0.125714,-0.123155,0.001261,0.091218,-0.007174,0.081376,-0.053279,-0.051698,-0.110275
5933,-0.012519,-0.016558,-0.011714,0.047901,-0.052373,0.013531,-0.027213,-0.009292,0.200529,0.065103,...,-0.014451,0.101887,-0.051450,-0.133451,0.051800,-0.063063,-0.020726,-0.040264,-0.034547,0.113080
8829,0.041037,0.042597,-0.001472,0.014137,0.018064,-0.033773,0.009221,-0.119247,0.212957,0.111987,...,-0.121839,-0.014946,-0.071243,0.004349,-0.040234,-0.063161,-0.046300,0.027016,-0.016311,0.036789
7945,0.035141,0.080948,0.076625,0.057253,0.005096,-0.132527,-0.071152,-0.149126,0.104248,0.059420,...,-0.056323,-0.022828,-0.077462,0.026508,-0.001063,-0.043917,-0.055017,0.088640,0.004071,-0.032998
3508,0.116845,0.077820,-0.039714,0.139904,-0.044317,-0.076826,0.058454,-0.118955,0.072085,0.071254,...,-0.141776,0.068856,-0.092884,0.060884,0.002385,0.043992,0.020043,0.098069,-0.098111,0.048676
2002,0.066383,0.080679,-0.004645,0.093209,0.031490,-0.070683,-0.105815,-0.094966,0.019358,0.069112,...,-0.111432,0.063770,-0.162550,0.027990,0.058197,0.066707,0.074374,-0.014873,0.064057,-0.056231


In [256]:
def get_distances(x1, x2):
    return ((np.expand_dims(x1, 1) - np.expand_dims(x2, 0)) ** 2).sum(2) ** 0.5

# Test using cross validation before submitting to Kaggle
predictions = reg.predict(X_test)
distances = get_distances(predictions, y_test)
MAP20_scores = []

for i in range(2000):
    nearest_indexes = list(np.argsort(distances[i]))
    pos = nearest_indexes.index(i)
    if pos < 20:
        MAP20_scores.append(1 / (pos + 1))
    else:
        MAP20_scores.append(0)

print("MAP@20 Score with Training Split:", np.mean(MAP20_scores))

MAP@20 Score with Training Split: 0.21049127899104678


In [257]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
6252,0.017921,0.106725,-0.030634,0.054845,0.044957,0.031072,-0.039654,-0.052540,0.072115,0.151714,...,-0.065237,0.035957,-0.177579,0.010632,-0.024804,-0.044227,0.019983,-0.057988,0.041311,0.038201
4684,0.005262,-0.084412,0.004868,0.082245,0.013415,-0.043631,-0.051100,0.018188,0.219352,0.099112,...,-0.164934,0.049472,0.052911,-0.046089,0.046351,-0.004745,-0.023220,-0.013192,-0.120950,0.013088
1731,0.001451,0.124614,-0.007332,0.061941,0.046660,-0.181398,0.043363,-0.072598,-0.044183,0.096499,...,-0.086753,0.111326,-0.120572,0.038808,0.096347,0.101750,0.002883,-0.020051,0.000284,0.021395
4742,0.068988,0.091426,-0.045505,0.124221,0.022984,-0.012526,0.037808,-0.083720,0.169088,0.073104,...,-0.206760,0.098432,-0.130950,0.004034,0.058842,-0.106982,0.090600,0.152027,-0.061700,0.025135
4521,-0.023677,0.128823,-0.009570,-0.061572,-0.023501,0.089072,0.007411,-0.104023,0.215183,0.090688,...,-0.110981,0.008601,-0.053300,-0.021880,-0.030649,-0.137002,0.002617,0.029348,0.059407,0.087295
6340,-0.075993,-0.003251,0.031575,0.283020,-0.036281,0.028497,0.045065,-0.102402,0.014968,0.161064,...,-0.053509,-0.042946,-0.103023,0.131069,0.074286,-0.020410,0.056423,-0.024595,0.058555,0.015312
576,0.125615,0.084816,0.028951,-0.012896,0.003062,-0.139257,-0.007726,-0.066257,0.248413,0.073706,...,-0.186747,-0.020815,-0.173774,0.007718,-0.001320,-0.013799,-0.059204,-0.106583,0.105428,-0.125376
5202,0.047503,0.128965,0.017497,-0.008397,0.023505,-0.061257,-0.036525,-0.132634,0.071341,0.121878,...,-0.140721,-0.015309,-0.177005,0.033127,-0.026363,-0.032248,-0.066809,-0.056916,0.115196,-0.036652
6363,-0.131516,0.098780,0.058466,0.220253,-0.052562,-0.050125,0.173286,-0.109013,-0.099146,0.189651,...,-0.111126,-0.058901,-0.104783,0.075684,0.138213,0.033024,0.010443,-0.052295,0.099639,0.043130
439,0.038722,0.101050,0.069667,-0.012544,-0.103788,0.014507,-0.000009,-0.056526,0.142404,0.138466,...,-0.088926,0.110352,-0.141654,0.059169,0.028274,-0.049009,-0.010534,0.064610,-0.054256,0.021882


In [258]:
# Now we train on all of our training data and test with the test data
reg = GridSearchCV(Ridge(), parameters, cv=10)
reg.fit(train_df, train_features_df)

predictions = reg.predict(test_df)
distances = get_distances(predictions, test_features_df)

results = []
for i in range(2000):
    nearest_indexes = list(np.argsort(distances[i]))[:20]
    file_names = ' '.join(["%d.jpg" % i for i in nearest_indexes])
    results.append(file_names)

















In [259]:
with open("my_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, r in enumerate(results):
        f.write("%d.txt,%s\n" % (i, r))

In [246]:
predictions

array([[ 6.43531480e+00, -3.80438252e+00, -1.20977464e-01, ...,
        -4.43574203e-01,  1.30105357e+00, -4.18195585e-01],
       [ 5.94428445e+00, -3.51024906e+00, -2.41488733e+00, ...,
         1.99267875e+00, -4.08730731e-01, -1.56655555e+00],
       [-3.42723884e+00,  5.78462196e+00, -5.77696891e-01, ...,
         1.52129380e-01,  6.01735497e-01,  1.01406514e+00],
       ...,
       [ 8.87225509e+00, -6.21543770e+00, -2.92660940e+00, ...,
        -7.68334190e-01,  1.04823425e+00,  7.04712335e-01],
       [-4.88289622e+00, -4.43720914e+00,  1.01681060e+01, ...,
         7.91451741e-02, -1.87544432e-01, -1.82602902e-01],
       [-5.44032987e+00,  3.08751302e+00, -1.01859312e+00, ...,
         3.53526298e-01, -6.41968229e-03, -4.56882855e-01]])