In [108]:
import matplotlib.pyplot as plt
import numpy as np
import imageio
import csv

descriptions_train_dir = 'data/descriptions_train/'
features_train_dir = 'data/features_train/'
images_train_dir = 'data/images_train/'
tags_train_dir = 'data/tags_train/'

descriptions_test_dir = 'data/descriptions_test/'
features_test_dir = 'data/features_test/'
images_test_dir = 'data/images_test/'
tags_test_dir = 'data/tags_test/'

In [109]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk
import string

table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def parse_description(i, nouns_only):
    descriptions_train_file = open(descriptions_train_dir + str(i) + '.txt', 'r') 
    lines = descriptions_train_file.read()
    # print(lines)
    
    if nouns_only:
        nouns = []
        for word,pos in nltk.pos_tag(nltk.word_tokenize(lines)):
            if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
                nouns.append(word)
    else:
        nouns = nltk.word_tokenize(lines)
    
    words = [w.lower() for w in nouns]
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    words = [stemmer.stem(w) for w in words]
    # words = [lemmatizer.lemmatize(w) for w in words]
    
    return ' '.join(words)

def parse_tag(i):
    tag_train_file = open(tags_train_dir + str(i) + '.txt', 'r') 
    lines = tag_train_file.read()
    if lines == '':
        return []
    tags_pre = lines.rstrip().split('\n')
    tags_pre = [w.split(':')[1] for w in tags_pre]
    
#     tags = []
#     for w in tags_pre:
#         for ww in w.split(' '):
#             tags.append(ww)
#     tags = [stemmer.stem(w) for w in tags]
    return tags_pre
    
print(parse_description(1, True))
print(parse_description(1, False))

bowl soup carrot shrimp noodl food bowl soup carrot shrimp chopstick bowl ramen someon bowl asian noodl soup shrimp carrot
bowl soup carrot shrimp noodl healthi food bowl readi eat soup carrot shrimp sit next chopstick tasti bowl ramen serv someon enjoy bowl asian noodl soup shrimp carrot


# Bag of words

In [110]:
from sklearn.feature_extraction.text import CountVectorizer

training_size = 10000

training_descriptions = []
training_tags = []
tags_set = set()

for i in range(training_size):
    training_descriptions.append(parse_description(i, True))
    training_tags = parse_tag(i)
    tags_set |= set(training_tags)
    
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(training_descriptions)
X = X.toarray()
print( len(vectorizer.get_feature_names()) )

4923


In [111]:
training_tags = []
tags_set = set()

for i in range(training_size):
    tags = parse_tag(i)
    training_tags.append(tags)
    tags_set |= set(tags)
print(len(tags_set))
print(tags_set)

tag2index = {}
idx = 0
for tag in tags_set:
    tag2index[tag] = idx
    idx += 1
    
y = np.zeros((training_size, len(tags_set)))
for i in range(training_size):
    for tag in training_tags[i]:
        y[i][tag2index[tag]] = 1

print(y[0])

80
{'clock', 'refrigerator', 'suitcase', 'hot dog', 'frisbee', 'chair', 'cell phone', 'oven', 'couch', 'giraffe', 'bear', 'bowl', 'cat', 'stop sign', 'parking meter', 'person', 'train', 'bench', 'bottle', 'bed', 'dog', 'microwave', 'book', 'tennis racket', 'banana', 'toaster', 'teddy bear', 'wine glass', 'bird', 'kite', 'potted plant', 'motorcycle', 'pizza', 'hair drier', 'baseball bat', 'keyboard', 'skis', 'sandwich', 'airplane', 'baseball glove', 'carrot', 'broccoli', 'truck', 'toilet', 'fire hydrant', 'bicycle', 'laptop', 'sheep', 'snowboard', 'car', 'cake', 'umbrella', 'cup', 'scissors', 'tie', 'cow', 'tv', 'knife', 'spoon', 'dining table', 'sink', 'traffic light', 'surfboard', 'sports ball', 'bus', 'apple', 'remote', 'elephant', 'horse', 'mouse', 'handbag', 'vase', 'skateboard', 'toothbrush', 'boat', 'donut', 'fork', 'zebra', 'orange', 'backpack'}
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0

# Get top 20

In [112]:
def closest_20_neighbors(target, candid):
    index2dist = {}
    idx = 0
    for c in candid:
        d = np.linalg.norm(target - c)
        index2dist[idx] = d
        idx += 1

    sorted_by_value = sorted(index2dist.items(), key=lambda kv: kv[1])
    keys = []
    for s in sorted_by_value:
        keys.append(s[0])
    return keys[:20]

# Cross validation

In [113]:
from random import shuffle
label = [i for i in range(10000)]
shuffle(label)

X_train = np.zeros((8000, X.shape[1]))
y_train = np.zeros((8000, 80))
X_test = np.zeros((2000, X.shape[1]))
y_test = np.zeros((2000, 80))

y_label = []

for i in range(8000):
    X_train[i] = X[label[i]]
    y_train[i] = y[label[i]]
for i in range(2000):
    X_test[i] = X[label[i + 8000]]
    y_test[i] = y[label[i + 8000]]
    y_label.append(label[i])

# Reduce dimension

In [378]:
from sklearn.decomposition import PCA

pca_components = 1024
pca = PCA(n_components=pca_components)
X_pca = pca.fit_transform(X)
print(X_pca[0])

[ 0.60180886  0.14567559 -1.49503461 ...  0.00944589 -0.02258241
 -0.00586142]


# Training with full set

In [125]:
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(solver='sgd', alpha=1e-5, max_iter=2000,
                hidden_layer_sizes=(2048, 2048, 1024), random_state=1)
clf.fit(X, y)
print("finished.")

finished.


In [126]:
print(X.shape)
print(y.shape)

(10000, 4923)
(10000, 80)


In [127]:
y_predict = clf.predict(X_test)

idx = 0
score_sum = 0
tops = []
for y_p in y_predict:
    top_20 = closest_20_neighbors(y_p, y_test)
    tops.append(top_20)
    j = 1
    for i in top_20:
        if i == idx:
            # MAP@20
            score_sum += float(21 - j) / 20.0
            break
        j += 1
    idx += 1
    if idx % 100 == 0:
        print(idx)
    
score = score_sum / 2000.0
print(score)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
0.2808999999999999


In [128]:
for i in range(20):
    print(tops[i][:15])

[879, 1568, 9, 104, 198, 329, 497, 634, 828, 1028, 1219, 1244, 1300, 1363, 1537]
[1889, 1239, 1451, 1220, 1790, 945, 786, 523, 1543, 1, 359, 1818, 1912, 1986, 820]
[950, 1240, 2, 582, 1843, 9, 104, 198, 329, 497, 634, 828, 1028, 1219, 1244]
[3, 52, 66, 95, 108, 123, 165, 251, 289, 361, 396, 422, 447, 449, 492]
[9, 104, 198, 329, 497, 634, 828, 1028, 1219, 1244, 1300, 1363, 1537, 1538, 1570]
[5, 8, 84, 115, 373, 386, 444, 453, 461, 518, 559, 658, 690, 693, 827]
[789, 890, 1596, 302, 366, 507, 849, 1416, 782, 1031, 163, 315, 778, 9, 104]
[1139, 302, 366, 507, 849, 1416, 1886, 33, 107, 899, 686, 1007, 1761, 144, 1301]
[5, 8, 84, 115, 373, 386, 444, 453, 461, 518, 559, 658, 690, 693, 827]
[9, 104, 198, 329, 497, 634, 828, 1028, 1219, 1244, 1300, 1363, 1537, 1538, 1570]
[1100, 1488, 1858, 189, 440, 882, 1192, 1383, 1454, 1734, 1830, 1839, 1940, 958, 1204]
[11, 612, 871, 1849, 214, 9, 104, 198, 329, 497, 634, 828, 1028, 1219, 1244]
[1692, 445, 574, 740, 1330, 1837, 1991, 1523, 1842, 1877, 57

In [75]:
word2index = {}

idx = 0
for w in vectorizer.get_feature_names():
    word2index[w] = idx
    idx += 1

print(len(word2index))

4923


In [104]:
descriptions_test_dir = 'data/descriptions_test/'
features_test_dir = 'data/features_test/'
images_test_dir = 'data/images_test/'
tags_test_dir = 'data/tags_test/'

table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def parse_test_description(i):
    descriptions_test_file = open(descriptions_test_dir + str(i) + '.txt', 'r') 
    lines = descriptions_test_file.read()
    # print(lines)
    
    nouns = []
    for word,pos in nltk.pos_tag(nltk.word_tokenize(lines)):
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
            nouns.append(word)
    
    words = [w.lower() for w in nouns]
    words = [w for w in words if not w in stop_words] # strip stopword
    words = [stemmer.stem(w) for w in words] # stem
    
    return words
    
def parse_test_tag(i):
    tag_test_file = open(tags_test_dir + str(i) + '.txt', 'r') 
    lines = tag_test_file.read()
    if lines == '':
        return []
    tags_pre = lines.rstrip().split('\n')
    tags_pre = [w.split(':')[1] for w in tags_pre]

    return tags_pre

test_size = 2000
X_real_test = np.zeros((test_size, len(vectorizer.get_feature_names())))

for i in range(test_size):
    words = parse_test_description(i)
    for w in words:
        if w in word2index:
            X_real_test[i][word2index[w]] += 1    

# X_real_test_pca = pca.transform(X_real_test)


test_tags = []
for i in range(test_size):
    tags = parse_test_tag(i)
    test_tags.append(tags)

y_real_test = np.zeros((test_size, len(tags_set)))
for i in range(test_size):
    for tag in test_tags[i]:
        y_real_test[i][tag2index[tag]] = 1

y_real_test_predict = clf.predict(X_real_test)

In [105]:
idx = 0
res = []
for y_p in y_real_test_predict:
    top_20 = closest_20_neighbors(y_p, y_real_test)
    res.append(top_20)
    idx += 1
    if idx % 100 == 0:
        print(idx)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000


# Output submission

In [107]:
output_filename = 'submission_tags_mlp.csv'
with open(output_filename, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["Descritpion_ID","Top_20_Image_IDs"])
    for i in range(test_size):
        row = []
        row.append(str(i) + '.txt')
        candids = []
        for candid in res[i]:
            candids.append(str(candid) + '.jpg')
        row.append(' '.join(candids))
        writer.writerow(row)