In [1308]:
import gensim
import pandas as pd

# curl -o GoogleNews-vectors-negative300.bin.gz "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [143]:
import nltk, string
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

# Reference https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords

stop_words = set([word.translate(str.maketrans('', '', string.punctuation))
                  for word in stopwords.words('english')])

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

[nltk_data] Downloading package stopwords to /Users/varun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/varun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/varun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [225]:
import pandas as pd
import sklearn
import nltk, string

from nltk.corpus import stopwords

stop_words = set([word.translate(str.maketrans('', '', string.punctuation))
                  for word in stopwords.words('english')])

def get_sanitized_descriptions(folder, count):
    sanitized_descriptions = []
    for i in range(0,count):
        with open('kaggle_data/%s/%s.txt' % (folder, i)) as f:
            descriptions = ' '.join(f.read().strip().split('\n'))
            descriptions = descriptions.translate(str.maketrans('', '', string.punctuation))
            all_words = descriptions.split(' ')
            sanitized_words = []
            for word in all_words:
                word = word.lower().strip()
                if not word or word in stop_words:
                    continue
                word = lemmatizer.lemmatize(word, get_wordnet_pos(word))
                sanitized_words.append(word)
            sanitized_descriptions.append(' '.join(sanitized_words))
    return sanitized_descriptions
                  
train_descriptions = get_sanitized_descriptions('descriptions_train', 10000)
test_descriptions = get_sanitized_descriptions('descriptions_test', 2000)

In [226]:
dictionary = {}
for sd in train_descriptions:
    words = sd.split(' ')
    for w in words:
        dictionary[w] = dictionary.get(w, 0) + 1
        
dictionary = set(dictionary.keys())
len(dictionary)

7357

In [227]:
# word2vec representation
#all_dfs = []
#for desc in train_descriptions:
#    vecs = [word2vec.get_vector(w) for w in desc.split(' ') if w in word2vec.vocab]
#    all_dfs.append(pd.DataFrame(vecs).mean().to_frame().T)
#    
#train_df = pd.concat(all_dfs, ignore_index=True)
#all_dfs = []
#for desc in test_descriptions:
#    vecs = [word2vec.get_vector(w) for w in desc.split(' ') if w in word2vec.vocab]
#    all_dfs.append(pd.DataFrame(vecs).mean().to_frame().T)
#    
#test_df = pd.concat(all_dfs, ignore_index=True)

In [228]:
# BoW representation
def vectorize(row):
    words = row['sentence'].split(' ')
    for w in words:
        if w not in row:
            continue
        row[w] = row[w] + 1
    return row

df = pd.DataFrame(train_descriptions, columns=['sentence'])
features = pd.DataFrame(columns=dictionary)
train_df = pd.concat([df, features], axis=1).fillna(0)
train_df = train_df.apply(vectorize, axis=1).drop(['sentence'], axis=1)
train_df

df = pd.DataFrame(test_descriptions, columns=['sentence'])
features = pd.DataFrame(columns=dictionary)
test_df = pd.concat([df, features], axis=1).fillna(0)
test_df = test_df.apply(vectorize, axis=1).drop(['sentence'], axis=1)
test_df

Unnamed: 0,graffiti,euclid,temporary,number,pasty,sum,pagoda,every,auburnbrown,splatter,...,playful,baker,movement,snowwy,oxford,pastry,sprayed,petal,bomber,firetrucks
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [246]:
all_tags = []
for i in range(0,10000):
    with open('kaggle_data/tags_train/%s.txt' %  i) as f:
        all_tags.append(', '.join([a.split(':')[1] for a in f.read().strip().split('\n') if a]))
train_tags_df = pd.DataFrame(all_tags, columns=['tags'])
#train_tags_df = train_tags_df[tags_df['tags'] != '']
train_tags_df

all_tags = []
for i in range(0,2000):
    with open('kaggle_data/tags_test/%s.txt' %  i) as f:
        all_tags.append(', '.join([a.split(':')[1] for a in f.read().strip().split('\n') if a]))
test_tags_df = pd.DataFrame(all_tags, columns=['tags'])
test_tags_df

Unnamed: 0,tags
0,"bed, backpack, suitcase, tie"
1,cow
2,"frisbee, person"
3,"car, traffic light, truck"
4,"cat, bed"
...,...
1995,"sheep, person, backpack"
1996,"boat, person"
1997,"boat, bench"
1998,airplane


In [247]:
tags_set = set([t for tags in all_tags for t in tags.split(', ') if t])

def vectorize_tags(row):
    words = row['tags'].split(', ')
    for w in words:
        if w not in row:
            continue
        row[w] = row[w] + 1
    return row

features = pd.DataFrame(columns=tags_set)
train_tags_df = pd.concat([train_tags_df, features], axis=1).fillna(0)
train_tags_df = train_tags_df.apply(vectorize_tags, axis=1).drop(['tags'], axis=1)
train_tags_df

features = pd.DataFrame(columns=tags_set)
test_tags_df = pd.concat([test_tags_df, features], axis=1).fillna(0)
test_tags_df = test_tags_df.apply(vectorize_tags, axis=1).drop(['tags'], axis=1)
test_tags_df

Unnamed: 0,hot dog,cow,skis,elephant,sandwich,teddy bear,bird,traffic light,skateboard,refrigerator,...,motorcycle,cup,wine glass,knife,orange,sink,sports ball,snowboard,keyboard,fork
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1274]:
#train_test_df = pd.concat([train_df, test_df])
#
#from sklearn.decomposition import PCA
#pca = PCA(n_components=1000)
#principalComponents = pca.fit_transform(train_test_df)
#principalDf = pd.DataFrame(data = principalComponents)
#
#train_pca_df = principalDf.iloc[:10000, :]
#test_pca_df = principalDf.iloc[10000:, :]

In [None]:
#reduces accuracy
#from sklearn.preprocessing import scale, normalize, MinMaxScaler
#train_df_norm = normalize(scale(train_df))
#min_max_scaler = MinMaxScaler()
#train_df_norm = min_max_scaler.fit_transform(train_df)

In [231]:
# get the train resnet features and sort it by file_id 0 to n-1
train_features_df = pd.read_csv('kaggle_data/features_train/features_resnet1000intermediate_train.csv', header=None)
#train_features_df = pd.read_csv('kaggle_data/features_train/features_resnet1000_train.csv', header=None)
train_features_df['file_id'] = train_features_df[0].apply(lambda x: int(x.split("/")[1].split(".")[0]))
train_features_df = train_features_df.sort_values(by=['file_id'])
train_features_df = pd.DataFrame(train_features_df.drop([0, 'file_id'], axis=1).values)

In [232]:
# get the test resnet features and sort it by file_id 0 to n-1
test_features_df = pd.read_csv('kaggle_data/features_test/features_resnet1000intermediate_test.csv', header=None)
#test_features_df = pd.read_csv('kaggle_data/features_test/features_resnet1000_test.csv', header=None)
test_features_df['file_id'] = test_features_df[0].apply(lambda x: int(x.split("/")[1].split(".")[0]))
test_features_df = test_features_df.sort_values(by=['file_id'])
test_features_df = pd.DataFrame(test_features_df.drop([0, 'file_id'], axis=1).values)

In [233]:
from sklearn.decomposition import PCA
# train_test_features_df = pd.concat([train_features_df, test_features_df], axis=1, sort=False)
train_test_features_df = pd.concat([train_features_df, test_features_df])
# scaler = StandardScaler()
# scaler.fit(train_test_features_df)
# train_test_features_df = scaler.transform(train_test_features_df)
pca = PCA(n_components=75)
principalComponents = pca.fit_transform(train_test_features_df)
principalDf = pd.DataFrame(data = principalComponents)

In [234]:
train_features_df = principalDf.iloc[:10000, :]
test_features_df = principalDf.iloc[10000:, :]

In [184]:
# picking top 100 resnet features with most variance
#max_var_indices = pd.DataFrame(train_features_df.var()).sort_values(by=0).index[600:]
#train_features_df = train_features_df.loc[:, max_var_indices]
#test_features_df = test_features_df.loc[:, max_var_indices]

In [185]:
# project it randomly down to N features
# Increasing N will increase accuracy but will also increase time and resource usage
#import numpy as np
#N = 100
#rand_proj_df = pd.DataFrame(np.random.randn(1000, N))
#train_features_df = train_features_df.dot(rand_proj_df)
#test_features_df = test_features_df.dot(rand_proj_df)

In [235]:
# create datasets for cross validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df, train_features_df, test_size=0.2, random_state=42)

#x_tags_df = train_df.loc[X_train.index]

In [236]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 7357)
(2000, 7357)
(8000, 75)
(2000, 75)


In [355]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.cross_decomposition import PLSRegression

# NOTE: need to play around with different alpha values
parameters = {"alpha": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 15.0]}

# alternate model
# clf = RidgeCV(alphas=[10.0]).fit(X_train, y_train)

#reg = GridSearchCV(Ridge(), parameters,  cv=10, n_jobs=8)
#reg.fit(X_train, y_train)
#print(reg.best_estimator_)

clf = KernelRidge(alpha=10.5)
clf.fit(X_train, y_train)

x_tags_df = train_df.loc[train_tags_df.index]
y_tags_df = train_tags_df

clf2 = KernelRidge(alpha=0.001)
clf2.fit(x_tags_df, y_tags_df)

#clf = PLSRegression(n_components=400, scale=True, max_iter=100, tol=1e-07, copy=True)
#clf.fit(y_train, X_train)

KernelRidge(alpha=0.001, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)

In [362]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
def get_distances(x1, x2):
    return cosine_distances(x1, x2)

def foo(x, desc):
    return len(set(x.split(', ')) & set(desc.split(' '))) != 0
    
def get_distance(prediction, x2, description):
    filter_df = tags_df['tags'].apply(foo, desc=description)
    filter_df = filter_df[filter_df == True]
    return euclidean_distances([prediction], x2.loc[filter_df.index])

def get_tags(x, cols):
    tags = []
    for p in x:
        tags.append(set([cols[i] for (i,v) in enumerate(p) if v > 0.35]))
    return tags

def jaccard_similarity(s1, s2):
    if not s1 and not s2:
        return 0
    intersection = len(s1.intersection(s2))
    union = (len(s1) + len(s2)) - intersection
    return float(intersection) / union

# Test using cross validation before submitting to Kaggle
predictions = clf.predict(X_test)
predicted_tags = clf2.predict(X_test)

distances = get_distances(predictions, y_test)
tag_lists = get_tags(predicted_tags, cols=y_tags_df.columns)
true_tag_lists = get_tags(train_tags_df.loc[y_test.index].values, train_tags_df.columns)

MAP20_scores = []

for i in range(2000):
    dist = distances[i]
    tags = tag_lists[i]
    
    dist = [d - 3*jaccard_similarity(tags, true_tag_lists[j]) for (j,d) in enumerate(dist)]
    nearest_indexes = list(np.argsort(dist))
    pos = nearest_indexes.index(i)
    if pos < 20:
        MAP20_scores.append(1 / (pos + 1))
    else:
        MAP20_scores.append(0)

print("MAP@20 Score with Training Split:", np.mean(MAP20_scores))

MAP@20 Score with Training Split: 0.6966186882304142


In [359]:
pd.DataFrame(distances).mean()

0       0.981859
1       0.987569
2       1.012273
3       0.988551
4       0.999738
          ...   
1995    0.998291
1996    0.993589
1997    1.004077
1998    1.010571
1999    1.008412
Length: 2000, dtype: float64

In [363]:
# Now we train on all of our training data and test with the test data
#reg = GridSearchCV(Ridge(), parameters, cv=10)
#reg.fit(train_df, train_features_df)

clf = KernelRidge(alpha=10.51)
clf.fit(train_df, train_features_df)
#clf = RidgeCV(alphas=[10.0]).fit(train_df, train_features_df)

predictions = clf.predict(test_df)
predicted_tags = clf2.predict(test_df)

distances = get_distances(predictions, test_features_df)
tag_lists = get_tags(predicted_tags, cols=train_tags_df.columns)
true_tag_lists = get_tags(test_tags_df.values, train_tags_df.columns)

results = []
for i in range(2000):
    dist = distances[i]
    tags = tag_lists[i]
    
    dist = [d - 2.5*jaccard_similarity(tags, true_tag_lists[j]) for (j,d) in enumerate(dist)]
    nearest_indexes = list(np.argsort(dist))[:20]
    file_names = ' '.join(["%d.jpg" % i for i in nearest_indexes])
    results.append(file_names)

In [308]:
with open("my_submission_tags_bow_pca_75.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, r in enumerate(results):
        f.write("%d.txt,%s\n" % (i, r))

In [366]:
true_tag_lists

[{'backpack', 'bed', 'suitcase', 'tie'},
 {'cow'},
 {'frisbee', 'person'},
 {'car', 'traffic light', 'truck'},
 {'bed', 'cat'},
 {'bowl', 'broccoli', 'cup'},
 {'fork', 'pizza'},
 {'donut', 'person'},
 {'bench', 'potted plant'},
 {'book', 'cat', 'dining table'},
 {'giraffe'},
 {'book', 'cell phone', 'keyboard', 'laptop', 'mouse'},
 {'hot dog', 'person'},
 {'bench', 'car', 'laptop', 'person', 'truck'},
 {'banana', 'sandwich'},
 {'stop sign'},
 {'person', 'snowboard'},
 {'bus'},
 {'bed', 'book', 'cat'},
 {'bench'},
 {'bench', 'person', 'skateboard'},
 {'cell phone', 'chair', 'cup', 'handbag', 'laptop', 'person'},
 {'book', 'cat', 'handbag', 'person'},
 {'carrot', 'cell phone', 'cup', 'dining table', 'person'},
 {'bus', 'person', 'traffic light'},
 {'cup', 'person', 'tie'},
 {'kite', 'person'},
 {'book',
  'chair',
  'keyboard',
  'laptop',
  'mouse',
  'potted plant',
  'tv',
  'vase'},
 {'baseball bat', 'person'},
 {'boat', 'car', 'dog', 'person', 'truck'},
 {'bowl', 'cell phone'},
 {'fr

In [371]:
tag_lists

[{'car', 'cell phone', 'chair', 'handbag', 'person', 'traffic light'},
 {'bowl',
  'cake',
  'dining table',
  'handbag',
  'knife',
  'potted plant',
  'wine glass'},
 {'airplane', 'car', 'traffic light'},
 {'backpack', 'horse', 'person'},
 {'bench',
  'bowl',
  'cake',
  'car',
  'cell phone',
  'couch',
  'parking meter',
  'person'},
 {'person', 'skateboard'},
 {'person', 'skis'},
 {'person', 'zebra'},
 {'clock', 'person', 'truck'},
 {'person', 'skateboard'},
 {'person', 'traffic light', 'truck'},
 {'person', 'skateboard', 'spoon'},
 {'backpack', 'bottle', 'car', 'person'},
 {'bicycle', 'car', 'person', 'skis', 'traffic light'},
 {'person', 'skis'},
 {'chair', 'fork', 'pizza', 'truck'},
 {'frisbee', 'person'},
 {'person', 'traffic light'},
 {'elephant', 'person'},
 {'person', 'surfboard'},
 {'backpack',
  'bird',
  'car',
  'cell phone',
  'chair',
  'motorcycle',
  'person',
  'sheep'},
 {'frisbee', 'orange'},
 {'chair', 'dining table', 'orange'},
 {'boat',
  'bottle',
  'car',
  

In [321]:
test_descriptions

['woman walk street past doorway woman walk past doorway sidewalk woman talk cell phone check watch woman talk cell phone check watch walk sidewalk woman wear blue phone walk along sidewalk front building black planter side entrance',
 'large slice angel food cake sit top plate small plate contains large slice cake quarter cake plate large piece yellow cake sits plate large slab sponge cake sits upon flowery plate',
 'group traffic light sit intersection sign shin street light picture stoplight window sun surround airplane traffic light several street light airplane fly overhead',
 'two mean uniform rid horse side side sandy beach two police officer horse rid beach two people rid horse middle lot pair police officer ride horse beach two people neon vest rid horse beach',
 'man woman use pay phone adjacent booth adult use pay telephone outdoors stand next soda machine couple people phone stand together man woman talk public pay phone couple people talk phone',
 'man trick skateboard ska

In [365]:
train_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,-3.198189,-2.676107,4.103335,2.540348,-1.029768,0.079642,-0.816746,0.814384,-2.853737,2.325495,...,-0.626606,0.440102,0.835721,-0.041621,0.466458,-0.830872,1.311705,-0.067461,-1.004143,0.616629
1,6.928084,7.894668,0.445075,-3.084869,-3.431151,-2.474099,0.933023,0.894070,0.166969,-0.810289,...,-0.001998,1.019802,0.077593,-0.278619,-1.169324,-1.059128,-1.538426,-1.702732,0.329622,-0.218697
2,-2.830661,-2.707739,-3.806887,-4.214975,0.645904,1.669074,1.719747,-0.439539,0.394070,-2.093149,...,0.015659,0.187464,0.098616,-0.963655,1.958805,-0.047130,-0.391190,1.383112,0.926818,-1.459679
3,-4.859947,-0.929824,5.209822,1.359452,-1.522406,-5.114194,-0.432148,2.600395,1.906433,-0.896578,...,0.819500,0.750927,-0.915196,-0.234832,-1.515708,0.543909,-0.349653,-0.141075,-0.189578,-0.971446
4,-7.303117,-3.460256,4.205902,2.146481,-0.953888,-5.985312,3.071328,5.180649,1.782162,0.611810,...,0.317010,0.744810,-0.302980,0.372183,-0.193249,-0.862293,0.540945,-0.837940,0.280511,-0.776836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,11.607532,1.014447,0.683950,-1.746782,0.563267,-1.710933,-2.322818,4.522272,-4.280686,-1.111000,...,-1.602647,-1.870668,1.708291,0.914361,-1.432622,-0.976770,0.700162,1.601581,0.595365,-0.850901
9996,0.283505,3.625208,-0.673811,-1.679203,4.057946,-2.478829,-0.241606,2.483857,-2.898591,3.963364,...,-2.316351,1.666207,-0.511757,0.752088,-1.683298,0.437244,-1.197830,-1.881061,-0.134015,0.207739
9997,5.872889,-5.494159,-2.830367,8.356628,0.062231,1.755609,-2.136484,2.345146,0.606867,0.292911,...,-1.497790,0.749443,-0.353588,-1.032197,0.776374,-0.039927,1.669001,0.254555,1.112985,1.084994
9998,-3.907660,-4.912915,-8.197300,-6.618184,0.820778,2.468251,-0.127062,-2.984720,5.475470,-5.203071,...,0.773007,-0.220566,1.774135,-0.885791,-2.717078,-0.524367,-0.478297,-0.353951,-1.042634,0.811683
