In [1]:
import os, csv, random, re, nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from gensim.models import KeyedVectors
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
word2vec = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
print("Loaded word vectors successfully!")

Loaded word vectors successfully!


In [3]:
stop_words = set()
with open('stops.txt', 'r') as f:
    for line in f:
        stop_words.add(line.strip())
total_train = 10000
total_test = 2000
num_train = 10000
num_dev = 0
num_test = 2000
split_idx = list(range(total_train))
random.shuffle(split_idx)
train_idx = split_idx[:num_train]
dev_idx = split_idx[num_train:(num_train+num_dev)]
test_idx = list(range(num_test))

In [4]:
def prefixReplace(match):
    prefix, stem = match.group(1), match.group(2)
    temp = prefix + stem
    if not stem in stop_words:
        temp += ' ' + stem
    return temp

def hyphenReplace(match):
    temp = match.group()
    li = temp.split('-')
    temp = temp.replace('-', '')
    for item in li:
        if not item in stop_words:
            temp += ' ' + item
    return temp

In [5]:
def parse_descriptions(data_dir, num_doc):
    docs = []
    wnl = WordNetLemmatizer()
    p_prefix = re.compile(r'\b(a|an|ante|anti|auto|circum|co|com|con|contra|contro|de|dis|en|em|ex|extra|fore|hetero|homo|homeo|hyper|il|im|in|ir|inter|intra|intro|macro|micro|mid|mis|mono|non|omni|over|post|pre|pro|re|semi|sub|super|sym|syn|trans|tri|un|under|uni)-([a-z])+\b', re.I)
    p_hyphen = re.compile(r'\b(\w+-)+\w+\b')
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        li = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                line = line.lower()
                # expand stem with hyphen prefix
                line = p_prefix.sub(prefixReplace, line)
                # expand hyphenated word
                line = p_hyphen.sub(hyphenReplace, line)
                line = line.replace(':', ' ')
                line = line.replace('\'s', '')
                line = line.replace(',', ' ')
                line = line.replace('.', ' ')
                li += word_tokenize(line)
                li = [wnl.lemmatize(x) for x in li]
        docs.append(li)
    return docs

def doc_to_vec(li, word2vec):
    # get list of word vectors in sentence
    word_vecs = []
    for w in li:
        if w not in word2vec.vocab:
#             print('not found:',w)
            pass
        elif w in stop_words:
#             print('stop word:', w)
            pass
        else:
            word_vecs.append(word2vec.get_vector(w))
    if not li:
        return np.zeros(300)
    # return average
    return np.stack(word_vecs).mean(0)


In [6]:
# build description matrices
d_train_dev = parse_descriptions("data/descriptions_train", num_doc=total_train)
test_desc = parse_descriptions("data/descriptions_test", num_doc=total_test)
d_train = np.array([doc_to_vec(d_train_dev[i], word2vec) for i in train_idx])
d_dev = np.array([doc_to_vec(d_train_dev[i], word2vec) for i in dev_idx])
d_test = np.array([doc_to_vec(test_desc[i], word2vec) for i in test_idx])
d_train_dev.clear()
test_desc.clear()
del d_train_dev, test_desc

In [7]:
print("Built all description matrices!")
print("d_train shape:", d_train.shape)
print("d_dev shape:", d_dev.shape)
print("d_test shape:", d_test.shape)

Built all description matrices!
d_train shape: (10000, 300)
d_dev shape: (0,)
d_test shape: (2000, 300)


In [8]:
# build test matrices
t_train_dev = parse_descriptions("data/tags_train", num_doc=total_train)
test_tag = parse_descriptions("data/tags_test", num_doc=total_test)
t_train = np.array([doc_to_vec(t_train_dev[i], word2vec) for i in train_idx])
t_dev = np.array([doc_to_vec(t_train_dev[i], word2vec) for i in dev_idx])
t_test = np.array([doc_to_vec(test_tag[i], word2vec) for i in test_idx])

t_train_dev.clear()
test_tag.clear()
del t_train_dev, test_tag

In [9]:
print("Built all tag matrices!")
print("t_train shape:", t_train.shape)
print("t_dev shape:", t_dev.shape)
print("t_test shape:", t_test.shape)

Built all tag matrices!
t_train shape: (10000, 300)
t_dev shape: (0,)
t_test shape: (2000, 300)


In [10]:
def parse_features(features_path):
    vec_map = {}
    with open(features_path) as f:
        for row in csv.reader(f):
            img_id = int(row[0].split("/")[1].split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    return np.array([v for k, v in sorted(vec_map.items())])

# build feature matrices
# p = np.random.randn(1000, 200)
f_train_dev = parse_features("data/features_train/features_resnet1000_train.csv")# @ p
f_train = f_train_dev[train_idx]
f_dev = f_train_dev[dev_idx]
f_test = parse_features("data/features_test/features_resnet1000_test.csv")# @ p
f_test = f_test[test_idx]

del f_train_dev

In [11]:
print("Built all feature vec matrices!")
print("f_train shape:", f_train.shape)
print("f_dev shape:", f_dev.shape)
print("f_test shape:", f_test.shape)

Built all feature vec matrices!
f_train shape: (10000, 1000)
f_dev shape: (0, 1000)
f_test shape: (2000, 1000)


In [12]:
fi_train_dev = parse_features("data/features_train/features_resnet1000intermediate_train.csv")
fi_train = fi_train_dev[train_idx]
fi_dev = fi_train_dev[dev_idx]
fi_test = parse_features("data/features_test/features_resnet1000intermediate_test.csv")# @ p
fi_test = fi_test[test_idx]

del fi_train_dev

In [13]:
print("Built all intermediate feature vec matrices!")
print("fi_train shape:", fi_train.shape)
print("fi_dev shape:", fi_dev.shape)
print("fi_test shape:", fi_test.shape)

Built all intermediate feature vec matrices!
fi_train shape: (10000, 2048)
fi_dev shape: (0, 2048)
fi_test shape: (2000, 2048)


In [14]:
x_train = np.concatenate((d_train, f_train, t_train, fi_train), axis=1)
y_train = d_train

In [15]:
print(x_train.shape, y_train.shape)

(10000, 3648) (10000, 300)


In [16]:
# from sklearn.linear_model import LinearRegression
# lin = LinearRegression(n_jobs=-1)
# lin.fit(x_train, y_train)

In [17]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=20, n_jobs=-1)
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
x_test = np.concatenate((d_test, f_test, t_test, fi_test), axis=1)
y_test_pred = rf.predict(x_test)

In [19]:
from scipy.spatial.distance import cdist
distance = cdist(d_test, y_test_pred, 'euclidean')

In [20]:
f = open("solution2_submission.csv", "w")
f.write("Descritpion_ID,Top_20_Image_IDs\n")
for i in range(num_test):
    test_dist_idx = list(np.argsort(distance[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % i for i in top_20]
    f.write("%d.txt,%s\n" % (i, " ".join(row)))
f.close()
print("Output written!")

Output written!
