In [5]:
%matplotlib inline

import dill

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [6]:
dill.load_session('tag_matching.db')

In [95]:
# constants
n_train = 10000
n_test = 2000
top_k = 20

In [61]:
# word preprocessing
import re
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords

class Preprocess(object):
    def __init__(self):
        self.stemmer = EnglishStemmer()
        # self.stopList = set(stopwords.words("english"))
        self.stopList = set([word.encode('ascii', 'ignore') for word in stopwords.words('english')])
        
    def preprocess(self, string, n_gram=1):
        
        # replace special character with space
        string = re.sub(r'[^a-zA-Z0-9 ]', r' ', string).encode('ascii', 'ignore')
        
        # Lemmatization (handles capitalization), ignoring stop word
        # turn output to ASCII and ignore special character
        ans = [self.stemmer.stem(word).encode('ascii', 'ignore') for word in string.split()]
        ans = set(word for word in ans if word not in self.stopList)
        
        return ans

In [65]:
# load descriptions
descriptions_test = [set()] * n_test
processor = Preprocess()
for i in range(n_test):
    with open('data/descriptions_test/' + str(i) + '.txt') as f:
        words = f.read() # readlines()
        descriptions_test[i] = processor.preprocess(words)

In [67]:
print(descriptions_test[1])

set(['plate', 'slice', 'angel', 'sit', 'food', 'top', 'upon', 'piec', 'yellow', 'small', 'larg', 'contain', 'cake', 'spong', 'quarter', 'slab', 'floweri'])


In [88]:
# load tags
tags_test = []
for i in range(n_test):
    with open('data/tags_test/' + str(i) + '.txt') as f:
        lines = f.readlines()
        tags = []
        for line in lines:
            # print(tag.split(':')[1].rstrip())
            tags.append(line.split(':')[1].rstrip())
        tags_test.append(tags)

In [90]:
print(tags_test[0])

['bed', 'backpack', 'suitcase', 'tie']


In [117]:
# simply use tag matching
def tag_matching(descriptions, tags, top_k):
    scores = np.zeros(len(tags))
    for i in range(len(tags)):
        for tag in tags[i]:
            if tag in descriptions: 
                scores[i] += 1
                # print(tag, descriptions)
        scores[i] /= len(tags[i])+1
    # print(scores.sum())
    rank = np.argsort(scores)[::-1]
    return rank[:top_k]


ans = tag_matching(descriptions_test[200], tags_test, top_k)
print(ans)
print(descriptions_test[0])
print(tags_test[ans[0]])

[1619 1697 1295 1784 1501  875  801 1372   65 1485 1590  613 1861 1091
 1081 1851  350  796 1793 1908].jpg
set(['blue', 'cell', 'entranc', 'woman', 'planter', 'front', 'side', 'watch', 'walk', 'past', 'phone', 'doorway', 'street', 'black', 'wear', 'along', 'sidewalk', 'check', 'talk', 'build'])
['vase']


In [121]:
# do prediction on all
test_predict = np.empty((n_test, top_k))
for i in range(n_test):
    ans = tag_matching(descriptions_test[i], tags_test, top_k)
    test_predict[i] = ans
print(test_predict)

[[1999.  670.  657. ...  672.  673.  674.]
 [1341. 1529. 1156. ...  457. 1812.  956.]
 [1999.  670.  657. ...  672.  673.  674.]
 ...
 [1026.  897. 1135. ... 1555. 1986.  859.]
 [1999.  670.  657. ...  672.  673.  674.]
 [1106. 1167.  405. ... 1925.  464.  926.]]


In [129]:
# convert prediction to '0.jpg'
test_predict_str = [None] * n_test
for i in range(n_test):
    res = ' '.join([str(int(x)) + '.jpg' for x in test_predict[i]])
    test_predict_str[i] = res # ' '.join([str(int(x)) + '.jpg' for x in test_predict[i]])

In [8]:
# write to csv
df = pd.DataFrame(data=test_predict_str)
df.index = [str(x) + '.txt' for x in range(n_test)]
df.to_csv('./tag_matching.csv', mode='w', index=True, index_label='Descritpion_ID', header=['Top_20_Image_IDs'])


In [7]:
dill.dump_session('tag_matching.db')