In [13]:
import numpy as np
import pandas as pd
import cv2
import os
from sklearn.cluster import KMeans
from scipy.spatial import distance
from IPython.display import Image

In [11]:
def load_images_from_folder(folder, scale=0.2, categories=('book', 'no_book'), size=(960, 1280)):
    """Load every readable image of each category sub-folder as grayscale.

    Parameters
    ----------
    folder : str
        Directory that contains one sub-directory per category.
    scale : float, optional
        Unused. It is kept only so existing callers that pass it keep working;
        images are always resized to ``size``.
    categories : iterable of str, optional
        Names of the sub-directories to read. Each name becomes a key of the
        returned dictionary.
    size : tuple of int, optional
        Target size handed to ``cv2.resize`` as ``dsize`` (width, height).
        Every image is forced to this size regardless of its aspect ratio.

    Returns
    -------
    dict
        Maps each category name to the list of resized grayscale images found
        in its sub-directory, ordered by sorted file name.
    """
    images = {}
    for category in categories:
        category_dir = os.path.join(folder, category)
        loaded = []
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # image order (and everything derived from it) stable between runs.
        for entry in sorted(os.listdir(category_dir)):
            # Flag 0 requests a single-channel grayscale read. imread returns
            # None for anything it cannot decode, and such entries are skipped.
            img = cv2.imread(os.path.join(category_dir, entry), 0)
            if img is None:
                continue
            loaded.append(cv2.resize(img, size))
        images[category] = loaded
    return images

In [12]:
# Load the grayscale training images from the relative 'train' folder, grouped by category.
images = load_images_from_folder('train')

In [6]:
def sift_features(images):
    """Run SIFT on every image and collect the resulting descriptors.

    Parameters
    ----------
    images : dict
        Maps a category name to a list of images.

    Returns
    -------
    list
        ``[pooled, per_category]``. ``pooled`` is a flat list of every
        descriptor row from every image. ``per_category`` maps each category
        name to a list holding one descriptor matrix per image, or ``None``
        where the detector produced no descriptors.
    """
    extractor = cv2.SIFT_create()
    pooled = []
    per_category = {}
    for label, category_images in images.items():
        per_image = []
        for image in category_images:
            _keypoints, descriptors = extractor.detectAndCompute(image, None)
            # Keep a slot for every image (even a None one) so positions in
            # the per-category list still line up with the input images.
            per_image.append(descriptors)
            if descriptors is not None:
                pooled.extend(descriptors)
        per_category[label] = per_image
    return [pooled, per_category]

In [16]:
# sift_features returns [pooled descriptor rows, per-category descriptor lists].
# Note: despite its name, all_bovw_feature holds raw SIFT descriptors per image,
# not bag-of-visual-words histograms.
sifts = sift_features(images)
descriptor_list, all_bovw_feature = sifts

In [18]:
# Total number of descriptor rows pooled across all training images.
len(descriptor_list)

541944

In [9]:
def kmeans(k, descriptor_list, random_state=0):
    """Cluster descriptors with k-means and return the cluster centres.

    Parameters
    ----------
    k : int
        Number of clusters (visual words) to fit.
    descriptor_list : sequence
        Descriptor rows to cluster, one row per descriptor.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. A fixed default makes the vocabulary
        reproducible between runs; pass ``None`` for unseeded initialisation.

    Returns
    -------
    array-like
        The fitted ``cluster_centers_``, one row per cluster.
    """
    # The local is named `model` so it does not shadow this function's own name.
    model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    model.fit(descriptor_list)
    return model.cluster_centers_

In [19]:
# Cluster the pooled descriptors into 150 centres, used below as the visual vocabulary.
visual_words = kmeans(150, descriptor_list) 

In [20]:
# Persist the cluster centres to centers.npy in the working directory.
np.save('centers.npy', visual_words)

In [21]:
# Sanity check: one row per cluster centre, one column per descriptor dimension.
visual_words.shape

(150, 128)

In [22]:
def find_index(feature, centers):
    """Return the row index of the centre nearest to ``feature``.

    Nearness is measured by squared Euclidean distance; the square root is
    skipped because it does not change which row is smallest.
    """
    offsets = centers - feature
    squared_distances = np.sum(offsets ** 2, axis=1)
    return np.argmin(squared_distances)

In [23]:
def image_class(all_bovw, centers):
    """Build one visual-word histogram per image.

    Parameters
    ----------
    all_bovw : dict
        Maps a category name to a list of per-image descriptor collections.
        An entry may be ``None`` for an image without descriptors.
    centers : sequence
        Cluster centres; ``len(centers)`` sets the histogram length.

    Returns
    -------
    dict
        Maps each category name to a list of histograms, one per image, in
        input order. Images whose entry is ``None`` get an all-zero histogram.
    """
    n_words = len(centers)
    histograms_by_class = {}
    for label, descriptor_sets in all_bovw.items():
        class_histograms = []
        for descriptors in descriptor_sets:
            counts = np.zeros(n_words)
            # A None entry contributes no votes but still yields a histogram.
            for vector in ([] if descriptors is None else descriptors):
                counts[find_index(vector, centers)] += 1
            class_histograms.append(counts)
        histograms_by_class[label] = class_histograms
    return histograms_by_class

In [24]:
# Turn each training image's descriptors into a histogram of nearest-visual-word counts.
bovw_train = image_class(all_bovw_feature, visual_words)

In [26]:
# Number of histograms (one per image) in the 'no_book' category.
len(bovw_train['no_book'])

74

In [27]:
# Feature column names x1 .. x150, one per visual word.
columns = [f'x{n}' for n in range(1, 151)]

In [28]:
# Stack the per-image histograms of both classes: 'book' rows first, then 'no_book'.
# The two lists are concatenated into a NEW list instead of extending
# bovw_train['book'] in place, so bovw_train is left untouched and re-running
# this cell always produces the same number of rows.
train_array = np.array(bovw_train['book'] + bovw_train['no_book'])
train_array.shape

(154, 150)

In [29]:
# Wrap the histogram matrix in a DataFrame, one named column per visual word.
train = pd.DataFrame(train_array, columns = columns)

In [30]:
# Display the feature table.
train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150
0,16.0,26.0,29.0,7.0,21.0,26.0,6.0,10.0,23.0,27.0,...,7.0,18.0,16.0,23.0,19.0,14.0,7.0,25.0,13.0,14.0
1,22.0,24.0,27.0,14.0,14.0,19.0,108.0,35.0,21.0,21.0,...,11.0,17.0,9.0,12.0,21.0,17.0,10.0,23.0,33.0,18.0
2,28.0,27.0,34.0,15.0,17.0,19.0,117.0,40.0,31.0,18.0,...,20.0,16.0,7.0,24.0,17.0,16.0,7.0,32.0,27.0,11.0
3,5.0,5.0,5.0,1.0,8.0,1.0,10.0,4.0,6.0,15.0,...,1.0,4.0,2.0,5.0,6.0,5.0,0.0,4.0,3.0,3.0
4,12.0,10.0,16.0,7.0,7.0,5.0,0.0,16.0,24.0,27.0,...,4.0,18.0,17.0,4.0,11.0,13.0,1.0,23.0,12.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,7.0,6.0,3.0,8.0,2.0,12.0,0.0,4.0,1.0,0.0,...,0.0,4.0,9.0,1.0,2.0,2.0,0.0,1.0,3.0,3.0
150,79.0,164.0,25.0,33.0,148.0,55.0,7.0,24.0,38.0,15.0,...,24.0,3.0,71.0,48.0,36.0,105.0,15.0,30.0,64.0,50.0
151,9.0,5.0,3.0,5.0,7.0,13.0,5.0,4.0,6.0,5.0,...,1.0,5.0,1.0,2.0,3.0,9.0,0.0,8.0,9.0,7.0
152,10.0,7.0,9.0,5.0,4.0,5.0,2.0,1.0,3.0,3.0,...,2.0,2.0,4.0,11.0,10.0,1.0,0.0,3.0,5.0,2.0


In [31]:
# Class labels in the same order as the rows of train_array:
# 80 'book' rows (label 1) followed by 74 'no_book' rows (label 0).
# These counts are hard-coded and must be updated if the training set changes.
y = [1] * 80 + [0] * 74

In [32]:
# Convert the labels to an integer Series. Note this rebinds y from a list to a Series.
y = pd.Series(y, dtype=int)

In [33]:
# Attach the labels as column 'y' (mutates train in place). Series assignment aligns
# on the index, so y needs one entry per row of train for every row to get a label.
train['y'] = y

In [34]:
# Display the feature table again, now including the label column 'y'.
train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x142,x143,x144,x145,x146,x147,x148,x149,x150,y
0,16.0,26.0,29.0,7.0,21.0,26.0,6.0,10.0,23.0,27.0,...,18.0,16.0,23.0,19.0,14.0,7.0,25.0,13.0,14.0,1
1,22.0,24.0,27.0,14.0,14.0,19.0,108.0,35.0,21.0,21.0,...,17.0,9.0,12.0,21.0,17.0,10.0,23.0,33.0,18.0,1
2,28.0,27.0,34.0,15.0,17.0,19.0,117.0,40.0,31.0,18.0,...,16.0,7.0,24.0,17.0,16.0,7.0,32.0,27.0,11.0,1
3,5.0,5.0,5.0,1.0,8.0,1.0,10.0,4.0,6.0,15.0,...,4.0,2.0,5.0,6.0,5.0,0.0,4.0,3.0,3.0,1
4,12.0,10.0,16.0,7.0,7.0,5.0,0.0,16.0,24.0,27.0,...,18.0,17.0,4.0,11.0,13.0,1.0,23.0,12.0,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,7.0,6.0,3.0,8.0,2.0,12.0,0.0,4.0,1.0,0.0,...,4.0,9.0,1.0,2.0,2.0,0.0,1.0,3.0,3.0,0
150,79.0,164.0,25.0,33.0,148.0,55.0,7.0,24.0,38.0,15.0,...,3.0,71.0,48.0,36.0,105.0,15.0,30.0,64.0,50.0,0
151,9.0,5.0,3.0,5.0,7.0,13.0,5.0,4.0,6.0,5.0,...,5.0,1.0,2.0,3.0,9.0,0.0,8.0,9.0,7.0,0
152,10.0,7.0,9.0,5.0,4.0,5.0,2.0,1.0,3.0,3.0,...,2.0,4.0,11.0,10.0,1.0,0.0,3.0,5.0,2.0,0


In [35]:
# Write features and labels to train.csv. With the default to_csv settings the row
# index is written too, as the first column.
train.to_csv('train.csv')