In [1]:
import gensim
import pandas as pd

# curl -o GoogleNews-vectors-negative300.bin.gz "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [2]:
import nltk, string
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

# Reference https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords

stop_words = set([word.translate(str.maketrans('', '', string.punctuation))
                  for word in stopwords.words('english')])

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yogeshverma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yogeshverma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yogeshverma/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
import pandas as pd
import sklearn
import nltk, string

from nltk.corpus import stopwords

stop_words = set([word.translate(str.maketrans('', '', string.punctuation))
                  for word in stopwords.words('english')])

def get_sanitized_descriptions(folder, count):
    sanitized_descriptions = []
    for i in range(0,count):
        with open('%s/%s.txt' % (folder, i)) as f:
            descriptions = ' '.join(f.read().strip().split('\n'))
            descriptions = descriptions.translate(str.maketrans('', '', string.punctuation))
            all_words = descriptions.split(' ')
            sanitized_words = []
            for word in all_words:
                word = word.lower().strip()
                if not word or word in stop_words:
                    continue
                word = lemmatizer.lemmatize(word, get_wordnet_pos(word))
                sanitized_words.append(word)
            sanitized_descriptions.append(' '.join(sanitized_words))
    return sanitized_descriptions
                  
train_descriptions = get_sanitized_descriptions('descriptions_train', 10000)
test_descriptions = get_sanitized_descriptions('descriptions_test', 2000)

In [4]:
dictionary = {}
for sd in train_descriptions:
    words = sd.split(' ')
    for w in words:
        dictionary[w] = dictionary.get(w, 0) + 1
        
dictionary = set(dictionary.keys())
len(dictionary)

7357

In [1310]:
# word2vec representation
#all_dfs = []
#for desc in train_descriptions:
#    vecs = [word2vec.get_vector(w) for w in desc.split(' ') if w in word2vec.vocab]
#    all_dfs.append(pd.DataFrame(vecs).mean().to_frame().T)
#    
#train_df = pd.concat(all_dfs, ignore_index=True)
#all_dfs = []
#for desc in test_descriptions:
#    vecs = [word2vec.get_vector(w) for w in desc.split(' ') if w in word2vec.vocab]
#    all_dfs.append(pd.DataFrame(vecs).mean().to_frame().T)
#    
#test_df = pd.concat(all_dfs, ignore_index=True)

In [5]:
# BoW representation
def vectorize(row):
    words = row['sentence'].split(' ')
    for w in words:
        if w not in row:
            continue
        row[w] = row[w] + 1
    return row

df = pd.DataFrame(train_descriptions, columns=['sentence'])
features = pd.DataFrame(columns=dictionary)
train_df = pd.concat([df, features], axis=1).fillna(0)
train_df = train_df.apply(vectorize, axis=1).drop(['sentence'], axis=1)
train_df

df = pd.DataFrame(test_descriptions, columns=['sentence'])
features = pd.DataFrame(columns=dictionary)
test_df = pd.concat([df, features], axis=1).fillna(0)
test_df = test_df.apply(vectorize, axis=1).drop(['sentence'], axis=1)
test_df

Unnamed: 0,signpost,waistdeep,fecal,cocker,vacant,large,always,bagel,schoolbusboathouse,among,...,men,slop,sled,bus,probe,coast,piss,allwar,oldstyle,attire
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
all_tags = []
for i in range(0,10000):
    with open('tags_train/%s.txt' %  i) as f:
        all_tags.append(', '.join([a.split(':')[1] for a in f.read().strip().split('\n') if a]))
train_tags_df = pd.DataFrame(all_tags, columns=['tags'])
#train_tags_df = train_tags_df[tags_df['tags'] != '']
train_tags_df

all_tags = []
for i in range(0,2000):
    with open('tags_test/%s.txt' %  i) as f:
        all_tags.append(', '.join([a.split(':')[1] for a in f.read().strip().split('\n') if a]))
test_tags_df = pd.DataFrame(all_tags, columns=['tags'])
test_tags_df

Unnamed: 0,tags
0,"bed, backpack, suitcase, tie"
1,cow
2,"frisbee, person"
3,"car, traffic light, truck"
4,"cat, bed"
5,"bowl, broccoli, cup"
6,"fork, pizza"
7,"donut, person"
8,"potted plant, bench"
9,"cat, dining table, book"


In [7]:
tags_set = set([t for tags in all_tags for t in tags.split(', ') if t])

def vectorize_tags(row):
    words = row['tags'].split(', ')
    for w in words:
        if w not in row:
            continue
        row[w] = row[w] + 1
    return row

features = pd.DataFrame(columns=tags_set)
train_tags_df = pd.concat([train_tags_df, features], axis=1).fillna(0)
train_tags_df = train_tags_df.apply(vectorize_tags, axis=1).drop(['tags'], axis=1)
train_tags_df

features = pd.DataFrame(columns=tags_set)
test_tags_df = pd.concat([test_tags_df, features], axis=1).fillna(0)
test_tags_df = test_tags_df.apply(vectorize_tags, axis=1).drop(['tags'], axis=1)
test_tags_df

Unnamed: 0,cake,cow,bench,broccoli,potted plant,sink,keyboard,umbrella,cat,motorcycle,...,fork,boat,donut,carrot,couch,tennis racket,bus,truck,traffic light,spoon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# train_test_df.shape
#train_test_df = pd.concat([train_df, test_df])
#
#from sklearn.decomposition import PCA
#pca = PCA(n_components=1000)
#principalComponents = pca.fit_transform(train_test_df)
#principalDf = pd.DataFrame(data = principalComponents)
#
#train_pca_df = principalDf.iloc[:10000, :]
#test_pca_df = principalDf.iloc[10000:, :]

NameError: name 'train_test_df' is not defined

In [None]:
#reduces accuracy
#from sklearn.preprocessing import scale, normalize, MinMaxScaler
#train_df_norm = normalize(scale(train_df))
#min_max_scaler = MinMaxScaler()
#train_df_norm = min_max_scaler.fit_transform(train_df)

In [11]:
# get the train resnet features and sort it by file_id 0 to n-1
train_features_df = pd.read_csv('features_train/features_resnet1000intermediate_train.csv', header=None)
#train_features_df = pd.read_csv('kaggle_data/features_train/features_resnet1000_train.csv', header=None)
train_features_df['file_id'] = train_features_df[0].apply(lambda x: int(x.split("/")[1].split(".")[0]))
train_features_df = train_features_df.sort_values(by=['file_id'])
train_features_df = pd.DataFrame(train_features_df.drop([0, 'file_id'], axis=1).values)

In [12]:
# get the test resnet features and sort it by file_id 0 to n-1
test_features_df = pd.read_csv('features_test/features_resnet1000intermediate_test.csv', header=None)
#test_features_df = pd.read_csv('kaggle_data/features_test/features_resnet1000_test.csv', header=None)
test_features_df['file_id'] = test_features_df[0].apply(lambda x: int(x.split("/")[1].split(".")[0]))
test_features_df = test_features_df.sort_values(by=['file_id'])
test_features_df = pd.DataFrame(test_features_df.drop([0, 'file_id'], axis=1).values)

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# train_test_features_df = pd.concat([train_features_df, test_features_df], axis=1, sort=False)
train_test_features_df = pd.concat([train_features_df, test_features_df])
# scaler = StandardScaler()
# scaler.fit(train_test_features_df)
# train_test_features_df = scaler.transform(train_test_features_df)
pca = PCA(n_components=80)
principalComponents = pca.fit_transform(train_test_features_df)
principalDf = pd.DataFrame(data = principalComponents)

In [14]:
train_features_df = principalDf.iloc[:10000, :]
test_features_df = principalDf.iloc[10000:, :]

In [184]:
# picking top 100 resnet features with most variance
#max_var_indices = pd.DataFrame(train_features_df.var()).sort_values(by=0).index[600:]
#train_features_df = train_features_df.loc[:, max_var_indices]
#test_features_df = test_features_df.loc[:, max_var_indices]

In [185]:
# project it randomly down to N features
# Increasing N will increase accuracy but will also increase time and resource usage
#import numpy as np
#N = 100
#rand_proj_df = pd.DataFrame(np.random.randn(1000, N))
#train_features_df = train_features_df.dot(rand_proj_df)
#test_features_df = test_features_df.dot(rand_proj_df)

In [16]:
# create datasets for cross validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df, train_features_df, test_size=0.2, random_state=42)


In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 7357)
(2000, 7357)
(8000, 80)
(2000, 80)


In [76]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
# NOTE: need to play around with different alpha values
parameters = {"alpha": [0.001, 0.01, 0.1, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 15.0]}
distributions = dict(C=uniform(loc=0, scale=4),penalty=['l2', 'l1'])
# alternate model
# clf = RidgeCV(alphas=[10.0]).fit(X_train, y_train)

# clf = RandomizedSearchCV(KernelRidge(), distributions)
# clf2 = RandomizedSearchCV(KernelRidge(), distributions)

#reg.fit(X_train, y_train)
#print(reg.best_estimator_)

clf = KernelRidge(alpha=10)
clf.fit(X_train, y_train)

y_tags_df = train_tags_df.loc[X_train.index]
x_tags_df = train_df.loc[X_train.index]

clf2 = KernelRidge(alpha=20)
clf2.fit(x_tags_df, y_tags_df)



ValueError: Invalid parameter C for estimator KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None). Check the list of available parameters with `estimator.get_params().keys()`.

In [67]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
def get_distances(x1, x2):
    return cosine_distances(x1, x2)

def foo(x, desc):
    return len(set(x.split(', ')) & set(desc.split(' '))) != 0
    
def get_distance(prediction, x2, description):
    filter_df = tags_df['tags'].apply(foo, desc=description)
    filter_df = filter_df[filter_df == True]
    return euclidean_distances([prediction], x2.loc[filter_df.index])

def get_tags(x, cols):
    tags = []
    for p in x:
        tags.append(set([cols[i] for (i,v) in enumerate(p) if v > 0.25]))
    return tags

# Test using cross validation before submitting to Kaggle
predicted_desc = clf.predict(X_test)
predicted_tags = clf2.predict(X_test)
distances_desc = get_distances(predicted_desc, y_test)
distances_tags = get_distances(predicted_tags, train_tags_df.loc[X_test.index])

tag_lists = get_tags(predicted_tags, cols=y_tags_df.columns)
true_tag_lists = get_tags(train_tags_df.loc[y_test.index].values, train_tags_df.columns)

In [68]:
def jaccard_similarity(s1, s2):
    if not s1 and not s2:
        return 0
    intersection = len(s1.intersection(s2))
    union = (len(s1) + len(s2)) - intersection
    return float(intersection) / union

def normalize(matrix):
    return matrix / np.sqrt((matrix * matrix).sum(axis=1))
distances_tags_norm = normalize(distances_tags)
distances_desc_norm = normalize(distances_desc)

# distances = distances_tags + distances_desc
distances = np.absolute(distances_desc_norm) * np.absolute(distances_tags_norm)

In [71]:
#tags = get_tags(predicted_tags, cols=y_tags_df.columns)
MAP20_scores = []

for i in range(2000):
    dist = distances[i]
    tags = tag_lists[i]
    #nearest_indexes = list(np.argsort(get_distance(predictions[i], y_test, train_descriptions[y_test.iloc[i].name])))
    dist = [d - 3*jaccard_similarity(tags, true_tag_lists[j]) for (j,d) in enumerate(dist)]
    nearest_indexes = list(np.argsort(distances[i]))
    pos = nearest_indexes.index(i)
    if pos < 20:
        MAP20_scores.append(1 / (pos + 1))
    else:
        MAP20_scores.append(0)

print("MAP@20 Score with Training Split:", np.mean(MAP20_scores))

MAP@20 Score with Training Split: 0.3874195992324669


In [72]:
# Now we train on all of our training data and test with the test data
#reg = GridSearchCV(Ridge(), parameters, cv=10)
#reg.fit(train_df, train_features_df)

#clf = KernelRidge(alpha=10.0)
clf.fit(train_df, train_features_df)
#clf = RidgeCV(alphas=[10.0]).fit(train_df, train_features_df)

predictions_tags = clf2.predict(test_df)
predictions_desc = clf.predict(test_df)

# predictions_tags = clf_pls.predict(test_df)
# predictions_desc = clf_pls_2.predict(test_df)

distances_tags = get_distances(predictions_tags, test_tags_df)
distances_desc = get_distances(predictions_desc, test_features_df)
distances_tags_norm = normalize(distances_tags)
distances_desc_norm = normalize(distances_desc)

distances = distances_desc_norm * distances_tags_norm
results = []
for i in range(2000):
    nearest_indexes = list(np.argsort(distances[i]))[:20]
    file_names = ' '.join(["%d.jpg" % i for i in nearest_indexes])
    results.append(file_names)

In [73]:
with open("my_submission_tags_desc.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, r in enumerate(results):
        f.write("%d.txt,%s\n" % (i, r))