In [2]:
import csv
import os
import re

import nltk
import numpy as np
import pandas as pd

In [53]:
# Notebook-wide size constants.
# train_size: number of training files read by the train loaders below.
# test_size: presumably the number of test descriptions -- confirm.
train_size, test_size = 10000, 2000
# NOTE(review): v_len is not referenced in the cells visible here -- confirm it is still needed.
v_len = 1000

In [54]:
# read in description txt (5 sentences each)
def read_des_train(train_path='./data/descriptions_train/', n_files=None):
    """Load the training descriptions, one list of lines per file.

    Files are expected to be named ``0.txt`` ... ``<n_files - 1>.txt`` inside
    ``train_path``.

    Parameters
    ----------
    train_path : str
        Directory that holds the description files.
    n_files : int or None
        Number of files to read. ``None`` falls back to the notebook-level
        ``train_size`` constant, which is the original behaviour.

    Returns
    -------
    list of list of str
        ``result[i]`` holds the lines of ``<i>.txt`` with the line terminator
        removed.

    Raises
    ------
    IOError / OSError
        If one of the expected files cannot be opened.
    """
    if n_files is None:
        n_files = train_size
    des_train = []
    for i in range(n_files):
        file_path = os.path.join(train_path, str(i) + '.txt')
        with open(file_path) as f:
            # Iterate the handle directly (no intermediate readlines() list)
            # and drop CR as well as LF so Windows-style files load cleanly.
            des_train.append([line.rstrip('\r\n') for line in f])
    return des_train

In [55]:
# Load every training description: desc_train[i] is the list of lines of file i.txt.
desc_train = read_des_train()

In [59]:
# read in tags
# collect the category / super-category names seen along the way
def read_tag_train(train_path='./data/tags_train/', n_files=None):
    """Load the training tags.

    Each file ``<i>.txt`` in ``train_path`` is read line by line; every
    non-blank line is split at the first ``:`` into
    ``<super category>:<sub category>``.

    Parameters
    ----------
    train_path : str
        Directory that holds the tag files.
    n_files : int or None
        Number of files to read. ``None`` falls back to the notebook-level
        ``train_size`` constant, which is the original behaviour.

    Returns
    -------
    tag_train : list of str
        One string per file: its sub categories joined by single spaces
        (empty string for a file without tags).
    cat_list : list of str
        Every sub category encountered, in reading order, duplicates included.
    sup_cat_list : list of str
        Every super category encountered, in reading order, duplicates included.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``:`` separator.
    IOError / OSError
        If one of the expected files cannot be opened.
    """
    if n_files is None:
        n_files = train_size
    tag_train = []
    cat_list = []
    sup_cat_list = []
    for i in range(n_files):
        file_path = os.path.join(train_path, str(i) + '.txt')
        sub_cats = []
        with open(file_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line carries no tag.
                    continue
                # maxsplit=1 keeps any further ':' inside the sub category.
                sup_cat, sub_cat = line.split(':', 1)
                sub_cats.append(sub_cat)
                cat_list.append(sub_cat)
                sup_cat_list.append(sup_cat)
        # join() yields no leading/trailing blank; the previous tag.strip()
        # call discarded its result, so every string kept a leading space.
        tag_train.append(' '.join(sub_cats))
    return tag_train, cat_list, sup_cat_list

In [60]:
tags_train, category_list, sup_list = read_tag_train()
# De-duplicate the names. sorted() (instead of list(set(...))) makes the order
# reproducible between kernel runs; category_list is later used as a
# CountVectorizer vocabulary, so its order fixes the column order of the tag matrix.
category_list = sorted(set(category_list))
sup_list = sorted(set(sup_list))

In [7]:
# pre-process
def preprop_description(desc_data, stopwords=None, stemmer=None):
    """Turn raw descriptions into one normalised token string per item.

    For every item, each sentence is lower-cased, split into runs of the
    letters a-z, filtered against ``stopwords`` and stemmed. The surviving
    stems of all sentences of the item are concatenated into one string.

    Parameters
    ----------
    desc_data : iterable of iterable of str
        One collection of sentences per item.
    stopwords : iterable of str or None
        Words to drop before stemming. ``None`` loads NLTK's English list.
    stemmer : object with a ``stem(word)`` method, or None
        ``None`` uses NLTK's English Snowball stemmer.

    Returns
    -------
    list of str
        One string per item. Every token is preceded by a single space (so a
        non-empty result starts with a space); an item without tokens gives ''.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    if stemmer is None:
        stemmer = nltk.SnowballStemmer("english")
    # Membership is tested once per word; a set makes that O(1) instead of a list scan.
    stopwords = frozenset(stopwords)

    desc_words_bow = []
    for descs in desc_data:
        stems = []
        for sentence in descs:
            for word in re.findall(r"[a-z]+", sentence.lower()):
                if word not in stopwords:
                    stems.append(stemmer.stem(word))
        # Same output format as before (space before every token), built in one pass.
        desc_words_bow.append(''.join(' ' + stem for stem in stems))
    return desc_words_bow

In [8]:
# Normalise the training descriptions: one stop-word-filtered, stemmed token string per item.
desc_word_bow = preprop_description(desc_train)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Bag-of-words counts over the training descriptions; min_df=15 is the
# document-frequency threshold below which a term is dropped.
cv = CountVectorizer(min_df = 15)
x_bow_train = cv.fit_transform(desc_word_bow)
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
words_extract = np.array(list(feature_names))
# TF-IDF weighting fitted on the training counts, materialised as a dense array.
transformer = TfidfTransformer()
x_tf_train = transformer.fit_transform(x_bow_train).toarray()


In [11]:
# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the vocabulary terms.
x_tf_train = pd.DataFrame(x_tf_train, columns = words_extract)

In [12]:
# Count matrix of the training tags, one column per entry of category_list.
# NOTE(review): this rebinds `cv`, replacing the description vectorizer created earlier.
# NOTE(review): presumably a category name containing a space cannot match under the
# default word-level tokenisation -- confirm against the tag files.
cv = CountVectorizer(vocabulary = category_list)
tags_train_c = cv.fit_transform(tags_train).toarray()

In [13]:
def read_des_test(test_path='./data/descriptions_test/', n_files=None):
    """Load the test descriptions, one list of lines per file.

    Files are expected to be named ``0.txt`` ... ``<n_files - 1>.txt`` inside
    ``test_path``.

    Parameters
    ----------
    test_path : str
        Directory that holds the description files.
    n_files : int or None
        Number of files to read. ``None`` falls back to the notebook-level
        ``test_size`` constant (2000, the value that used to be hard-coded here).

    Returns
    -------
    list of list of str
        ``result[i]`` holds the lines of ``<i>.txt`` with the line terminator
        removed.

    Raises
    ------
    IOError / OSError
        If one of the expected files cannot be opened.
    """
    if n_files is None:
        n_files = test_size
    des_test = []
    for i in range(n_files):
        file_path = os.path.join(test_path, str(i) + '.txt')
        with open(file_path) as f:
            # Drop CR as well as LF so Windows-style files load cleanly.
            des_test.append([line.rstrip('\r\n') for line in f])
    return des_test

# Load every test description: desc_test[i] is the list of lines of file i.txt.
desc_test = read_des_test()

In [14]:
# Normalise the test descriptions the same way as the training ones
# (one stop-word-filtered, stemmed token string per item).
desc_test_bow = preprop_description(desc_test)

In [15]:
# process test data

# The vocabulary is fixed to the training terms, so the columns line up with
# the training matrix; with a fixed vocabulary fit_transform learns nothing new.
cv = CountVectorizer(vocabulary = words_extract)
x_bow_test = cv.fit_transform(desc_test_bow)
# Reuse the TF-IDF transformer fitted on the TRAINING counts. Re-fitting a new
# transformer on the test counts would derive IDF weights from the test set,
# so test features would be weighted differently from the training features
# that the downstream PCA / PLS models were fitted on.
x_test_tf = transformer.transform(x_bow_test).toarray()

x_test_tf = pd.DataFrame(x_test_tf, columns = words_extract)



In [None]:
# Number of training rows; print() call form works on both Python 2 and Python 3.
print(len(x_tf_train))

In [28]:
# PCA: project the training TF-IDF description features onto 1224 components.
# NOTE(review): 1224 is hard-coded; presumably it must not exceed the number of
# TF-IDF columns -- confirm against the vocabulary size.
# NOTE(review): despite its name, pca_pool5 is fitted on x_tf_train (text features).
from sklearn.decomposition import PCA

pca_pool5 = PCA(n_components=1224, svd_solver='auto') 
pca_pool5.fit(x_tf_train)
train_PCA = pca_pool5.transform(x_tf_train)


In [44]:
# Project the test TF-IDF features with the PCA that was fitted on the training features.
test_PCA = pca_pool5.transform(x_test_tf)


In [74]:
# feature train data pool-5
# Column 0 of the CSV is an image path; columns 1..2048 are the feature values.

ft_train_data = pd.read_csv(filepath_or_buffer="./data/features_train/features_resnet1000intermediate_train.csv",header=None
)
# Replace the path in column 0 by its integer image id: the text after the
# first '/' with '.jpg' removed.
ft_train_data[0] = ft_train_data[0].apply(lambda x: int(x.split('/')[1].replace('.jpg','')))
# Order rows by image id before extracting the feature matrix.
# assumes ids are 0..N-1 so that row i lines up with description i -- TODO confirm
ft_train_sort = ft_train_data.sort_values(by=[0])
ft_train = ft_train_sort[list(range(1,2049))].values

In [70]:
# Sanity check of the loaded feature table; print() call form works on Python 2 and 3.
print(ft_train_data.shape)

(10000, 2049)


In [72]:
# feature test data pool-5
# Column 0 of the CSV is an image path; columns 1..2048 are the feature values.
ft_test_data = pd.read_csv(filepath_or_buffer="./data/features_test/features_resnet1000intermediate_test.csv",header=None
)
# Replace the path in column 0 by its integer image id.
ft_test_data[0] = ft_test_data[0].apply(lambda x: int(x.split('/')[1].replace('.jpg','')))
# Sort by image id, exactly as done for the training features. The nearest
# neighbour search later returns ROW positions of ft_test, and those positions
# are written out as "<row>.jpg"; without sorting, a row position is not the
# image id whenever the CSV is not already in id order.
# assumes the ids are the contiguous range 0..N-1 -- TODO confirm
ft_test_sort = ft_test_data.sort_values(by=[0])
ft_test = ft_test_sort[list(range(1,2049))].values

In [37]:
from sklearn.cross_decomposition import PLSRegression

# Partial Least Squares Regression 
# Fit a mapping from the PCA-reduced description features (train_PCA) to the
# image feature vectors (ft_train).
# NOTE(review): row i of train_PCA is presumably paired with row i of ft_train -- confirm.
pls_p5 = PLSRegression(n_components=1224) # 2048 for future approach
pls_p5.fit(train_PCA, ft_train)


PLSRegression(copy=True, max_iter=500, n_components=1224, scale=True,
       tol=1e-06)

In [46]:
from sklearn.neighbors import NearestNeighbors
# Index the test image features for 20-nearest-neighbour lookup under cosine distance.
nbs = NearestNeighbors(n_neighbors=20, metric='cosine').fit(ft_test)

In [48]:
# result
# Predict an image-feature vector for every test description in one batch and
# look up its nearest test images. One vectorised call replaces a per-row loop
# over a hard-coded 2000, so the cell follows the actual size of test_PCA.
predict = pls_p5.predict(test_PCA)
distance, idx = nbs.kneighbors(predict)
# predict_pls[i] is the list of ft_test row positions returned for description i,
# in the order kneighbors yields them.
predict_pls = idx.tolist()

In [48]:
# Random Forest

# from sklearn.ensemble import RandomForestRegressor
# from sklearn import preprocessing

# predict_tags = []
# for i in range(80):
#     print i
#     y = tags_train_c[:,i]
#     clf = RandomForestRegressor()
#     clf.fit(x_tf_train, y)
#     tag = clf.predict(x_test_tf)
#     predict_tags.append(tag)
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


In [None]:
# linear regression approach
# One independent linear model per tag column: fitted on the training TF-IDF
# features, then applied to the test TF-IDF features.
# LinearRegression was never imported in this notebook, so the cell raised NameError.
from sklearn.linear_model import LinearRegression

lr_predict_tags = []
# Iterate over the actual number of tag columns instead of a hard-coded 80.
for i in range(tags_train_c.shape[1]):
    print(i)  # progress indicator; call form works on Python 2 and 3
    y = tags_train_c[:,i]
    clf = LinearRegression(n_jobs = -1)
    clf.fit(x_tf_train, y)
    tag = clf.predict(x_test_tf)
    lr_predict_tags.append(tag)

In [None]:
#from sklearn import preprocessing
#train_p = preprocessing.normalize(train_feature, norm='l1')

In [49]:
# print len(predict_tags)

80


In [50]:
# predict_tags = np.array(predict_tags)
# predict_tags_t = predict_tags.T


In [51]:
# def read_tag_test():
#     train_path = './data/tags_test/' 
#     tag_test = []
#     for i in range(2000):
#         file_name = str(i) + '.txt'
#         file_path = train_path + file_name
#         tag = ''
#         with open(file_path) as f:
#             for line in f.readlines():
#                 sup_cat, sub_cat = line.strip('\n').split(':')
#                 tag = tag + ' ' + sub_cat
#         tag.strip()
#         tag_test.append(tag)
#     return tag_test #, cat_list

# tags_test = read_tag_test()

In [52]:
# cv = CountVectorizer(vocabulary = category_list)
# tags_test_c = cv.fit_transform(tags_test)


In [53]:
# # knn

# from sklearn.neighbors import NearestNeighbors as KNN

# label_test = []
# for i in range(2000):
#     label_test.append(i)
# knn = KNN(n_neighbors = 20)
# knn = knn.fit(tags_test_c, label_test)
# prediction = knn.kneighbors(predict_tags_t, return_distance = False)


In [50]:
# One space-separated string of image file names per test description.
# join() avoids the leading blank that prefix-concatenation put in front of
# the first file name of every row.
img_list = [' '.join(str(val) + ".jpg" for val in row) for row in predict_pls]
# Matching description file names, one per row of predict_pls (was a hard-coded 2000).
idx = [str(i) + ".txt" for i in range(len(predict_pls))]

In [51]:
# output to csv file
# Two columns side by side: the description file name and its 20 retrieved image names.
images = pd.DataFrame(img_list, columns = ["Top_20_Image_IDs"])
# NOTE(review): the header spelling "Descritpion_ID" is written to the file as-is;
# confirm it matches what the consumer of this CSV expects before changing it.
# NOTE(review): `idx` is rebound here from a list to a DataFrame.
idx = pd.DataFrame(idx, columns = ["Descritpion_ID"])
result = pd.concat([idx, images], axis=1)
result.to_csv("pls_pcaP5_result.csv", index = False)