In [1]:
import numpy as np
from scipy.spatial import distance
from IPython.display import Image
import os
import csv
import random
import gensim
from sklearn.linear_model import LinearRegression

# Load the pretrained word2vec embeddings from the gzipped binary file under ./data.
# The resulting object is used by later cells to look up per-word vectors.
word2vec = gensim.models.KeyedVectors.load_word2vec_format("./data/GoogleNews-vectors-negative300.bin.gz", binary=True)
print("Loaded word vectors successfully!")


Loaded word vectors successfully!


In [116]:
# Number of training examples: how many description files are read and how many
# image feature records are kept after sorting them by index.
num_train = 1000

In [117]:
## get the vector representation of each description

def parse_descriptions(data_dir, num_doc):
    """Read the files ``0.txt`` .. ``<num_doc - 1>.txt`` found in ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory that holds the numbered ``.txt`` files.
    num_doc : int
        How many files to read, starting from ``0.txt``.

    Returns
    -------
    list
        One ``[doc_id, text]`` pair per file, ordered by ``doc_id``; ``text``
        is the complete file content.
    """
    def _read_text(doc_id):
        # Files are named after their integer id.
        file_name = "%d.txt" % doc_id
        with open(os.path.join(data_dir, file_name)) as handle:
            return handle.read()

    return [[doc_id, _read_text(doc_id)] for doc_id in range(num_doc)]

def doc_to_vec(sentence, word2vec):
    """Return the mean word vector of the in-vocabulary words of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to embed; it is tokenised with ``str.split()`` (whitespace).
    word2vec : gensim ``KeyedVectors``-like object
        Must provide ``get_vector(word)`` and a vocabulary mapping exposed as
        ``key_to_index`` (gensim >= 4) or ``vocab`` (older gensim).
        ``vector_size`` is read only when no token is in the vocabulary.

    Returns
    -------
    numpy.ndarray
        1-D array holding the element-wise mean of the vectors of all tokens
        that are in the vocabulary. When no token is in the vocabulary
        (including an empty ``sentence``) an all-zero vector of length
        ``word2vec.vector_size`` is returned instead of raising.
    """
    # gensim 4 replaced KeyedVectors.vocab with key_to_index; support both.
    vocabulary = getattr(word2vec, "key_to_index", None)
    if vocabulary is None:
        vocabulary = word2vec.vocab

    # Tokens missing from the vocabulary are silently skipped.
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in vocabulary]

    if not word_vecs:
        # np.stack([]) raises ValueError, so fall back to a zero vector.
        return np.zeros(word2vec.vector_size)

    return np.stack(word_vecs).mean(0)

desc_train = parse_descriptions("./data/descriptions_train", num_doc=num_train)

# Pair every document id with the mean word vector of its text.
desc_vector_train = [
    [doc_id, doc_to_vec(text, word2vec)] for doc_id, text in desc_train
]

print("the length of desc_vector:",len(desc_vector_train))
print("desc_vector[0] is index; desc_vector[1] is vector")
# Length of one embedded description (record 3 is used as a sample).
print(len(desc_vector_train[3][1]))

the length of desc_vector: 1000
desc_vector[0] is index; desc_vector[1] is vector
300


In [118]:
## get the feature vector

# Number of leading feature columns kept from every CSV row.
dimension_of_feature_vector =100

# Each CSV row is "<image name>,<v0>,<v1>,..."; collect [index, vector] records.
feature_vector_train = []
# The context manager closes the file even if parsing a row raises.
with open('./data/features_train/features_resnet1000_train.csv', 'r') as f:
    for line in f:
        tmp = line.strip().split(',')
        # NOTE(review): [13:-4] presumably drops a 13-character path prefix and a
        # 4-character file extension around the integer image id -- confirm
        # against the CSV.
        index = int(tmp[0][13:-4])
        vector = [float(i) for i in tmp[1:]]
        feature_vector_train.append([index,vector[0:dimension_of_feature_vector]])

# sort feature_vector_train according to index
def take_index(elem):
    """Sort key for ``[index, vector]`` records.

    Returns the record's first item wrapped in a one-element list, so records
    are ordered by that item.
    """
    leading_item = elem[0]
    return [leading_item]

# Order the records by image index, then keep only the first num_train of them.
feature_vector_train = sorted(feature_vector_train, key=take_index)[:num_train]

print("the length of feature_vector_train is:", len(feature_vector_train))
print("feature_vector_train[0] is index; feature_vector_train[1] is the vector")
# Sample record: the truncated feature vector stored at position 3.
print(feature_vector_train[3][1])

the length of feature_vector_train is: 1000
feature_vector_train[0] is index; feature_vector_train[1] is the vector
[1.642971396446228, -0.5919864773750305, -2.0084922313690186, -1.6832419633865356, 0.1523834466934204, -0.5803146362304688, -2.876101493835449, -2.0010290145874023, -2.138638496398926, -0.8315469622612, -0.43401849269866943, -1.1083484888076782, -1.7107912302017212, -2.6106507778167725, 1.337751865386963, -1.6990853548049927, -2.0171265602111816, -1.9067765474319458, -1.197595238685608, -2.3860580921173096, 0.7725794911384583, 0.10497377067804337, 0.8021477460861206, -0.04659649357199669, -1.7470080852508545, -0.37388086318969727, -0.5517594814300537, -0.04499737173318863, -1.6107257604599, -0.6461459994316101, -2.223390817642212, -1.167757272720337, -0.374008446931839, -1.603108286857605, 0.22323819994926453, -1.2288230657577515, 0.560768187046051, -1.068021535873413, -1.039835810661316, -1.387953281402588, -2.279953956604004, -2.242109775543213, -2.6766748428344727, -3.

In [119]:
## Euclidean distance between the feature vectors of every ordered pair (i, j);
## the distance serves as the dissimilarity score of the pair.

feature_dist_train = []

pairs_done = 0
for index_a, vector_a in feature_vector_train:
    for index_b, vector_b in feature_vector_train:
        # Progress marker: one '#' per 10000 processed pairs.
        pairs_done += 1
        if pairs_done % 10000 == 0:
            print("#",end='')

        # Record layout: [index of first image, index of second image, distance].
        pair_distance = distance.euclidean(vector_a, vector_b)
        feature_dist_train.append([index_a, index_b, pair_distance])

print("the length of feature_dist_train: ", len(feature_dist_train))
print(feature_dist_train[3])

####################################################################################################the length of feature_dist_train:  1000000
[0, 3, 19.260680431729618]


In [120]:
## get the x_train
# One row per (description i, image j) pair, in the same i-major order as
# feature_dist_train: [description index, image index, description vector].
# NOTE(review): the vector stored in a row depends only on the description, not
# on image j, so all rows of one description carry identical model inputs.
x_train = []
for desc_index, desc_vec in desc_vector_train:
    # Convert the array to a list once per description instead of once per
    # (i, j) pair; the rows of one description share this list and must be
    # treated as read-only.
    desc_values = desc_vec.tolist()
    for feature_record in feature_vector_train:
        x_train.append([desc_index, feature_record[0], desc_values])

print("the length of x_train is: ",len(x_train))

the length of x_train is:  1000000


In [121]:
## get test description vector
desc_test = parse_descriptions("./data/descriptions_test", num_doc=10)

# Pair every test document id with the mean word vector of its text.
desc_vector_test = [
    [doc_id, doc_to_vec(text, word2vec)] for doc_id, text in desc_test
]

    
## get test feature vector
# Each CSV row is "<image name>,<v0>,<v1>,..."; collect [index, vector] records.
feature_vector_test = []
# The context manager closes the file even if parsing a row raises.
with open('./data/features_test/features_resnet1000_test.csv', 'r') as f:
    for line in f:
        tmp = line.strip().split(',')
        # NOTE(review): [12:-4] presumably drops a 12-character path prefix and a
        # 4-character file extension around the integer image id -- confirm
        # against the CSV.
        index = int(tmp[0][12:-4])
        vector = [float(i) for i in tmp[1:]]
        # Keep the same number of leading feature columns as the training data.
        feature_vector_test.append([index,vector[0:dimension_of_feature_vector]])

# sort feature_vector_test according to index
def take_index(elem):
    """Sort key for ``[index, vector]`` records.

    Returns the record's first item wrapped in a one-element list, so records
    are ordered by that item. This redefines the function of the same name
    declared in the training-feature cell.
    """
    leading_item = elem[0]
    return [leading_item]

# Sort the test feature records in place by image index (take_index supplies the key).
feature_vector_test.sort(key=take_index)



## get x_test
# One row per (test description i, test image j) pair, description-major:
# [description index, image index, description vector].
# NOTE(review): the vector stored in a row depends only on the description, not
# on image j, so all rows of one description carry identical model inputs.
x_test = []
for desc_index, desc_vec in desc_vector_test:
    # Convert the array to a list once per description instead of once per
    # (i, j) pair; the rows of one description share this list and must be
    # treated as read-only.
    desc_values = desc_vec.tolist()
    for feature_record in feature_vector_test:
        x_test.append([desc_index, feature_record[0], desc_values])



In [122]:

# Keep only the third field of every record: the description vector for the
# inputs and the distance for the regression target.
x_train_ = [record[2] for record in x_train]
feature_dist_train_ = [record[2] for record in feature_dist_train]
x_test_ = [record[2] for record in x_test]

# Sanity check: number of rows in each collection ...
for collection in (x_train_, feature_dist_train_, x_test_):
    print(len(collection))

# ... and the length of a single input vector.
print(len(x_test_[0]))

1000000
1000000
20000
300


In [123]:
# ## linear regression
# Fit distance ~ description vector, then score every test (description, image) row.
# NOTE(review): each input row holds only the description vector, and x_test is
# ordered description-major, so all rows of one description are identical and get
# the same prediction -- the score cannot distinguish between images.
reg = LinearRegression()
reg.fit(x_train_, feature_dist_train_)
dissimilarity_score = reg.predict(x_test_)


In [144]:
# Range of the first 2000 predicted scores.
leading_scores = dissimilarity_score[0:2000]
print(min(leading_scores))
print(max(leading_scores))

19.23949902744832
19.23949902744832


In [136]:
# Show the first 2000 predicted scores.
print(dissimilarity_score[0:2000])

[19.23949903 19.23949903 19.23949903 ... 19.23949903 19.23949903
 19.23949903]
