In [1]:
import os, csv, random, re, nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from gensim.models import KeyedVectors
# Fetch the NLTK data packages "punkt" and "wordnet" (no-op if already present).
# NOTE(review): presumably these back word_tokenize and WordNetLemmatizer
# respectively, both imported above — confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("wordnet")

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Load the pre-trained word vectors from the gzipped binary word2vec file
# located in the working directory.
word2vec = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)
print("Loaded word vectors successfully!")

In [None]:
# Stop-word list: one word per line in stops.txt; blank lines are ignored.
with open('stops.txt', 'r') as f:
    stop_words = {line.strip() for line in f if line.strip()}

# Dataset sizes on disk.
total_train = 10000
total_test = 2000

# How many examples of each split are actually used.
num_train = 1000
num_dev = 0
num_test = 2000

# Seed the RNG so the shuffled train/dev split (and any later use of the
# `random` module) is the same on every Restart & Run All.
SEED = 0
random.seed(SEED)

# Shuffle the training indices, then carve off the train and dev subsets.
split_idx = list(range(total_train))
random.shuffle(split_idx)
train_idx = split_idx[:num_train]
dev_idx = split_idx[num_train:(num_train+num_dev)]

# The test set is used in its natural order.
test_idx = list(range(total_test))

In [None]:
def prefixReplace(match):
    """Regex substitution callback for a hyphenated prefix/stem pair.

    Args:
        match: a match object whose group 1 is the prefix and group 2 is the stem.

    Returns:
        The prefix and stem glued together without the hyphen; when the stem
        is not in the module-level ``stop_words`` set, the stem is appended
        again as a separate space-delimited word.
    """
    prefix, stem = match.group(1, 2)
    joined = prefix + stem
    return joined if stem in stop_words else joined + ' ' + stem

def hyphenReplace(match):
    """Regex substitution callback that expands a hyphenated word.

    Args:
        match: a match object whose full match is the hyphenated word.

    Returns:
        The word with its hyphens removed, followed by each hyphen-separated
        component that is not in the module-level ``stop_words`` set, all
        joined by single spaces.
    """
    matched = match.group()
    kept_parts = [part for part in matched.split('-') if part not in stop_words]
    return ' '.join([matched.replace('-', '')] + kept_parts)

In [None]:
def parse_descriptions(data_dir, num_doc):
    """Read, normalise and tokenise the text files ``0.txt`` .. ``<num_doc-1>.txt``.

    Each line is lower-cased, hyphenated words are expanded through
    ``prefixReplace`` / ``hyphenReplace``, the characters ``:``, ``,`` and
    ``.`` become spaces, ``'s`` is dropped, and the result is tokenised with
    ``word_tokenize`` and lemmatised with ``WordNetLemmatizer``.

    Args:
        data_dir: directory containing the numbered ``.txt`` files.
        num_doc: number of documents to read, starting at ``0.txt``.

    Returns:
        A list with one entry per document; each entry is the list of
        lemmatised tokens of that document, in reading order.
    """
    wnl = WordNetLemmatizer()
    # The stem group is `([a-z]+)`, not `([a-z])+`: a repeated capture group
    # only retains its LAST repetition, which would hand prefixReplace a
    # single trailing letter instead of the whole stem.
    p_prefix = re.compile(r'\b(a|an|ante|anti|auto|circum|co|com|con|contra|contro|de|dis|en|em|ex|extra|fore|hetero|homo|homeo|hyper|il|im|in|ir|inter|intra|intro|macro|micro|mid|mis|mono|non|omni|over|post|pre|pro|re|semi|sub|super|sym|syn|trans|tri|un|under|uni)-([a-z]+)\b', re.I)
    p_hyphen = re.compile(r'\b(\w+-)+\w+\b')

    docs = []
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        tokens = []
        with open(path) as f:
            for line in f:
                line = line.strip().lower()
                # expand stem with hyphen prefix
                line = p_prefix.sub(prefixReplace, line)
                # expand any remaining hyphenated word
                line = p_hyphen.sub(hyphenReplace, line)
                line = line.replace(':', ' ')
                line = line.replace('\'s', '')
                line = line.replace(',', ' ')
                line = line.replace('.', ' ')
                # Lemmatise only the tokens of this line, so every token is
                # lemmatised exactly once instead of once per following line.
                tokens.extend(wnl.lemmatize(tok) for tok in word_tokenize(line))
        docs.append(tokens)
    return docs

def doc_to_vec(li, word2vec):
    """Embed a tokenised document as the mean of its usable word vectors.

    A token is usable when the embedding model contains it and it is not in
    the module-level ``stop_words`` set.

    Args:
        li: list of string tokens.
        word2vec: embedding model supporting ``token in model`` and
            ``model.get_vector(token)`` (e.g. a gensim ``KeyedVectors``).

    Returns:
        A 1-D numpy array: the element-wise mean of the usable word vectors,
        or an all-zero vector when the document has no usable token. The zero
        vector has ``word2vec.vector_size`` entries (300 if the model does
        not expose that attribute).
    """
    # Membership is tested on the model itself rather than on `.vocab`,
    # which newer gensim releases no longer provide.
    word_vecs = [
        word2vec.get_vector(w)
        for w in li
        if w in word2vec and w not in stop_words
    ]
    # Guard on the collected vectors, not on the raw token list: a non-empty
    # document made only of unknown/stop words would otherwise reach
    # np.stack([]) and raise ValueError.
    if not word_vecs:
        return np.zeros(getattr(word2vec, "vector_size", 300))
    # return average
    return np.stack(word_vecs).mean(0)


In [None]:
# Tokenise every description, then embed the selected documents of each split
# into one matrix per split (one row per document).
d_train_dev = parse_descriptions("data/descriptions_train", num_doc=total_train)
test_desc = parse_descriptions("data/descriptions_test", num_doc=total_test)
d_train, d_dev, d_test = (
    np.array([doc_to_vec(docs[doc_id], word2vec) for doc_id in doc_ids])
    for docs, doc_ids in (
        (d_train_dev, train_idx),
        (d_train_dev, dev_idx),
        (test_desc, test_idx),
    )
)
# The token lists are no longer needed; empty them and drop the names.
d_train_dev.clear()
test_desc.clear()
del d_train_dev, test_desc

In [None]:
# Sanity check: report the shape of each description matrix.
print("Built all description matrices!")
print("d_train shape: {}".format(d_train.shape))
print("d_dev shape: {}".format(d_dev.shape))
print("d_test shape: {}".format(d_test.shape))

In [None]:
# Build the tag matrices: tokenise every tag file with the same pipeline as
# the descriptions, then embed the selected documents of each split.
t_train_dev = parse_descriptions("data/tags_train", num_doc=total_train)
test_tag = parse_descriptions("data/tags_test", num_doc=total_test)
t_train, t_dev, t_test = (
    np.array([doc_to_vec(docs[doc_id], word2vec) for doc_id in doc_ids])
    for docs, doc_ids in (
        (t_train_dev, train_idx),
        (t_train_dev, dev_idx),
        (test_tag, test_idx),
    )
)

# The token lists are no longer needed; empty them and drop the names.
t_train_dev.clear()
test_tag.clear()
del t_train_dev, test_tag

In [None]:
# Sanity check: report the shape of each tag matrix.
print("Built all tag matrices!")
print("t_train shape: {}".format(t_train.shape))
print("t_dev shape: {}".format(t_dev.shape))
print("t_test shape: {}".format(t_test.shape))

In [None]:
def parse_features(features_path):
    """Load an image-feature CSV into a matrix ordered by numeric image id.

    Each non-empty row holds an image path in its first column (the file
    name, minus extension, is the integer image id) followed by the feature
    values.

    Args:
        features_path: path of the CSV file.

    Returns:
        A numpy array with one row per image, sorted by ascending image id.
        If an id appears more than once, the last row for that id wins.
    """
    vec_map = {}
    with open(features_path) as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; skip it instead of
            # failing on row[0].
            if not row:
                continue
            # Take the last path component so "dir/12.jpg" and "12.jpg"
            # both resolve to id 12.
            file_name = row[0].rsplit("/", 1)[-1]
            img_id = int(file_name.split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    return np.array([vec_map[img_id] for img_id in sorted(vec_map)])

# Build the image-feature matrices for each split.
# (Disabled experiment: a random projection `p = np.random.randn(1000, 200)`
# applied to both feature matrices via `@ p`.)
f_train_dev = parse_features("data/features_train/features_resnet1000_train.csv")
f_train, f_dev = f_train_dev[train_idx], f_train_dev[dev_idx]
f_test = parse_features("data/features_test/features_resnet1000_test.csv")[test_idx]

# Only the per-split slices are needed from here on.
del f_train_dev

In [None]:
# Sanity check: report the shape of each image-feature matrix.
print("Built all feature vec matrices!")
print("f_train shape: {}".format(f_train.shape))
print("f_dev shape: {}".format(f_dev.shape))
print("f_test shape: {}".format(f_test.shape))

In [None]:
import heapq

def l2_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``."""
    difference = x1 - x2
    return np.linalg.norm(difference)

def l1_distance(x1, x2):
    """Return the Manhattan (L1) distance between ``x1`` and ``x2``.

    The absolute differences are accumulated one by one, starting from 0.
    """
    total = 0
    for component in abs(x1 - x2):
        total = total + component
    return total

def build_y(vecs, closest_num, other_num):
    """Build regression targets: distances from every vector to selected others.

    For each row ``v1`` of ``vecs`` the function emits ``closest_num`` entries
    for the rows nearest to ``v1`` under ``l2_distance`` (``v1`` itself is a
    candidate), followed by ``other_num`` entries for rows drawn at random
    from the remaining ones.

    Args:
        vecs: sequence of vectors (indexable by integer position).
        closest_num: number of nearest rows to keep per vector.
        other_num: number of additional random rows per vector.

    Returns:
        A pair ``(distances, indices)`` of equal-length lists. Entries are
        grouped per source row, ``closest_num + other_num`` per row, in the
        order the rows appear in ``vecs``; ``indices[k]`` is the row of
        ``vecs`` that ``distances[k]`` was measured against.

    Raises:
        ValueError: if fewer than ``other_num`` rows remain to sample from.
    """
    y = []
    for v1 in vecs:
        # Bounded heap keyed on the negated distance: the root is always the
        # farthest kept candidate, so heappushpop evicts it whenever a
        # closer row arrives.
        nearest = []
        for j, v2 in enumerate(vecs):
            entry = (-l2_distance(v1, v2), j)
            if j < closest_num:
                heapq.heappush(nearest, entry)
            else:
                heapq.heappushpop(nearest, entry)
        y += [(-neg_dist, j) for neg_dist, j in nearest]

        # random.sample needs a sequence (sets are rejected on Python 3.11+),
        # so the left-over indices are collected in a list.
        taken = {j for _, j in nearest}
        remaining = [j for j in range(len(vecs)) if j not in taken]
        for x in random.sample(remaining, other_num):
            y.append((l2_distance(v1, vecs[x]), x))
    return [a[0] for a in y], [a[1] for a in y]

def build_x(v1, v2, num, idx):
    """Pair each source row of ``v1`` with its selected rows of ``v2``.

    Args:
        v1: rows providing the first half of every output vector.
        v2: rows providing the second half of every output vector.
        num: number of consecutive ``idx`` entries that belong to one row of
            ``v1`` (entry ``k`` of ``idx`` uses ``v1[k // num]``).
        idx: row indices into ``v2``.

    Returns:
        A list, parallel to ``idx``, of concatenated ``v1``/``v2`` rows.
    """
    return [
        np.concatenate([v1[position // num], v2[target]])
        for position, target in enumerate(idx)
    ]

# Number of nearest and of random partner images per training image.
closest_num, other_num = 10, 10

# Targets: image-to-image feature distances and the partner index of each.
y_train, idx = build_y(f_train, closest_num, other_num)
y_train = np.array(y_train)

# Inputs: the description vector of the source image concatenated with the
# image-feature and tag vectors of the partner image.
x_train = np.array(
    build_x(
        d_train,
        np.concatenate((f_train, t_train), axis=1),
        closest_num + other_num,
        idx,
    )
)

# The partner indices are no longer needed; empty the list and drop the name.
idx.clear()
del idx

In [None]:
# Sanity check: shapes of the regression inputs and targets.
for training_array in (x_train, y_train):
    print(training_array.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 50-tree random forest (n_jobs=-1) that predicts the target distance
# from a concatenated description / image / tag vector.
rf = RandomForestRegressor(
    n_jobs=-1,
    n_estimators=50,
)
rf.fit(x_train, y_train)

In [None]:
# # test performance on development set
# dev_scores = []
# dev_pos_list = []
# for i in range(num_dev):
#     x_dev = []
#     for j in range(num_dev):
#         x_dev.append(np.concatenate((d_dev[i],f_dev[j], t_dev[j])))
#     y_dev_pred = rf.predict(x_dev)
#     pred_dist_idx = list(np.argsort(y_dev_pred))
#     dev_pos = pred_dist_idx.index(i)
#     dev_pos_list.append(dev_pos)
#     if dev_pos < 20:
#         dev_scores.append(1 / (dev_pos + 1))
#     else:
#         dev_scores.append(0.0)

# print("Development MAP@20:", np.mean(dev_scores))
# print("Mean index of true image", np.mean(dev_pos_list))
# print("Median index of true image", np.median(dev_pos_list))

In [None]:
# Rank every test image for each test description and write the 20 images
# with the smallest predicted distance to the submission file.

# The image-side half of each input row (image features + tag vector) is the
# same for every description, so it is built once instead of once per pair.
image_side_test = np.concatenate((f_test[:num_test], t_test[:num_test]), axis=1)

# The context manager guarantees the file is flushed and closed before the
# success message is printed, even if prediction fails part-way.
with open("solution1_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i in range(num_test):
        x_test = [np.concatenate((d_test[i], image_row)) for image_row in image_side_test]
        y_test_pred = rf.predict(x_test)
        # Smallest predicted distance first.
        top_20 = np.argsort(y_test_pred)[:20]
        row = ["%d.jpg" % img_id for img_id in top_20]
        f.write("%d.txt,%s\n" % (i, " ".join(row)))

print("Output written!")