In [1]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.svm import SVC

In [2]:
def features(image, sift):
    """Return only the descriptors that ``sift`` computes for ``image``.

    Calls ``sift.detectAndCompute(image, None)`` and discards the keypoints,
    handing back the second element of the returned pair unchanged.
    """
    _keypoints, descriptors = sift.detectAndCompute(image, None)
    return descriptors

In [3]:
def describe(image_path, extractor):
    """Load every readable image in a directory and compute its descriptors.

    Parameters
    ----------
    image_path : str
        Directory whose entries are read as images. A trailing path
        separator is optional.
    extractor : object
        Feature extractor forwarded unchanged to ``features``.

    Returns
    -------
    tuple of (list, list)
        ``(descriptor_list, images_list)``, aligned index by index: entry
        ``i`` of the first list is ``features(images_list[i], extractor)``.
        Entries that ``cv2.imread`` could not load are left out of both lists.
    """
    descriptor_list = []
    images_list = []
    # os.listdir gives no ordering guarantee; sorting keeps the row order
    # stable between runs and machines.
    for file_name in sorted(os.listdir(image_path)):
        # Flag 0 asks cv2.imread for a single-channel (grayscale) image.
        image = cv2.imread(os.path.join(image_path, file_name), 0)
        if image is None:
            # cv2.imread signals "could not read" by returning None (e.g. a
            # sub-directory or a non-image file); skip it instead of passing
            # None on to the extractor.
            continue
        images_list.append(image)
        descriptor_list.append(features(image, extractor))

    return descriptor_list, images_list

In [4]:
def desc_dictionary(descriptor_list):
    """Flatten a three-level nesting of descriptors into one flat list.

    ``descriptor_list`` is walked three levels deep: each outer entry, each
    item inside that entry, and each element of that item. ``None`` values at
    the first two levels are skipped; the innermost elements are collected in
    encounter order.
    """
    return [
        element
        for group in descriptor_list
        if group is not None
        for item in group
        if item is not None
        for element in item
    ]

In [5]:
def build_histogram(descriptor, kmeans, size):
    """Count how many descriptors are assigned to each of ``size`` clusters.

    Parameters
    ----------
    descriptor : sequence or None
        Descriptors of one image, passed as-is to ``kmeans.predict``.
        ``None`` or an empty sequence is treated as "no descriptors".
    kmeans : object
        Fitted clusterer exposing ``predict``; each predicted value is used
        as an index into the histogram.
    size : int
        Number of histogram bins.

    Returns
    -------
    list of int
        ``size`` counts, one per cluster index.
    """
    hist = [0] * size
    # Descriptors can be None (the flattening step elsewhere in this notebook
    # guards against exactly that), and predict() cannot handle None, so an
    # image without descriptors yields an all-zero histogram.
    if descriptor is None or len(descriptor) == 0:
        return hist
    for cluster_id in kmeans.predict(descriptor):
        hist[cluster_id] += 1
    return hist

In [6]:
def hist_list(image_desc, sift, kmeans, class_id, size):
    """Build one labelled feature row for a single image.

    The row is the cluster histogram returned by ``build_histogram`` with
    ``class_id`` added as its final element. ``sift`` is accepted but not
    used here.
    """
    labelled_row = build_histogram(image_desc, kmeans, size)
    labelled_row += [class_id]  # in-place extend: label goes in the last slot
    return labelled_row

In [7]:
# Root folder that is scanned for class sub-directories.
dataset_dir = "Dataset/"
# Number of k-means clusters, which is also the histogram length per image.
size = 20
# SIFT is exposed as cv2.SIFT_create in current OpenCV builds; fall back to
# the contrib namespace for older installations that only ship it there.
sift = cv2.SIFT_create() if hasattr(cv2, "SIFT_create") else cv2.xfeatures2d.SIFT_create()
# `n_jobs` is no longer accepted by KMeans in scikit-learn >= 1.0, so it is
# dropped. random_state makes the codebook reproducible across runs; n_init
# is pinned so the result does not depend on the library's changing default.
kmeans = KMeans(n_clusters=size, n_init=10, random_state=42)

In [8]:
def create_bag(given_dir, sift):
    """Collect descriptors for every class directory directly under ``given_dir``.

    Parameters
    ----------
    given_dir : str
        Root directory; each immediate sub-directory is treated as one class.
        Plain files and deeper nested directories are not treated as classes.
    sift : object
        Feature extractor forwarded unchanged to ``describe``.

    Returns
    -------
    list
        One entry per class directory, in sorted directory-name order; each
        entry is the descriptor list returned by ``describe`` for that
        directory. The position in this list is what later becomes the
        numeric class label.
    """
    class_image_desc = []
    # Sorted names give a stable directory -> label mapping; a recursive walk
    # would also visit nested folders and join them to the wrong parent.
    for class_name in sorted(os.listdir(given_dir)):
        if not os.path.isdir(os.path.join(given_dir, class_name)):
            continue
        # The trailing "" keeps a trailing separator on the path handed to describe().
        class_dir = os.path.join(given_dir, class_name, "")
        desc, _images = describe(class_dir, sift)
        class_image_desc.append(desc)

    return class_image_desc

In [9]:
def train_data(given_dir, kmeans, sift, size):
    """Fit ``kmeans`` on all descriptors and return the labelled histogram table.

    Steps: gather per-class descriptors with ``create_bag``, flatten them with
    ``desc_dictionary``, fit ``kmeans`` on the flattened list (the passed-in
    estimator is fitted in place), then hand everything to ``create_dataset``
    and return its result.
    """
    per_class_descriptors = create_bag(given_dir, sift)
    kmeans.fit(desc_dictionary(per_class_descriptors))
    return create_dataset(per_class_descriptors, kmeans, sift, size)

In [10]:
def create_dataset(class_image_desc, kmeans, sift, size):
    """Turn per-class descriptors into a DataFrame of labelled rows.

    Every image descriptor in ``class_image_desc[label]`` becomes one row,
    produced by ``hist_list`` with ``label`` (the position of the class in
    ``class_image_desc``) as its class id. Rows are ordered by class, then by
    image within the class.
    """
    rows = [
        hist_list(image_desc, sift, kmeans, label, size)
        for label, per_image_descs in enumerate(class_image_desc)
        for image_desc in per_image_descs
    ]
    return pd.DataFrame(rows)

In [13]:
# Fit the k-means codebook and build the labelled histogram table in one go.
dataset = train_data(given_dir=dataset_dir, kmeans=kmeans, sift=sift, size=size)

In [15]:
# Display the assembled table: one row per image, columns 0-19 hold the
# per-cluster counts and the last column (20) holds the class label.
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,25,22,40,17,18,24,35,30,27,11,...,48,21,24,15,52,44,32,31,30,0
1,32,21,7,61,41,21,94,23,99,48,...,82,26,25,11,11,18,17,213,14,0
2,20,36,41,18,18,34,36,40,51,27,...,57,38,35,30,38,20,31,47,28,0
3,45,16,19,15,19,72,19,34,20,17,...,25,29,39,49,29,24,31,34,26,0
4,34,29,41,15,23,38,44,42,45,15,...,56,31,40,30,56,26,29,45,26,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,28,20,25,32,15,31,11,20,21,27,...,14,34,32,28,25,20,39,11,14,1
175,27,49,20,50,48,35,34,57,95,42,...,60,49,72,55,24,53,49,14,36,1
176,8,3,6,1,6,2,2,7,3,0,...,2,9,10,7,6,28,5,1,3,1
177,3,10,20,12,12,10,2,9,5,6,...,3,12,15,13,12,24,14,4,2,1


In [16]:
# Rows come out grouped by class; shuffle them. The fixed seed keeps the row
# order identical on every Restart & Run All.
dataset = shuffle(dataset, random_state=42)

In [18]:
# Display the table again to confirm the rows are no longer grouped by class
# (the original index is kept, so it now appears out of order).
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
165,75,87,70,147,55,57,103,132,154,155,...,83,140,86,88,133,110,114,131,44,1
177,3,10,20,12,12,10,2,9,5,6,...,3,12,15,13,12,24,14,4,2,1
4,34,29,41,15,23,38,44,42,45,15,...,56,31,40,30,56,26,29,45,26,0
170,59,83,66,75,60,58,80,85,169,47,...,142,77,72,55,59,67,59,82,65,1
173,87,106,27,159,96,70,39,153,78,251,...,132,132,124,75,34,96,113,30,33,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,45,16,19,15,19,72,19,34,20,17,...,25,29,39,49,29,24,31,34,26,0
92,11,8,21,8,14,17,5,16,7,11,...,9,20,18,12,20,14,15,11,7,1
30,110,126,133,155,122,108,125,147,153,131,...,162,159,150,129,163,164,143,111,70,0
46,49,74,39,112,96,52,32,52,74,98,...,84,60,73,43,16,45,48,25,20,0


In [20]:
# The class label is always the last column (it is appended after the
# histogram bins), so look it up instead of hard-coding 20; the split then
# keeps working if the number of clusters changes.
label_col = dataset.columns[-1]
X = dataset.drop(columns=label_col)
y = dataset[label_col]

In [21]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Fit a logistic-regression classifier on the training histograms.
model = (
    LogisticRegression(random_state=0)
    .fit(X_train, y_train)
)

In [40]:
# Score the fitted model on the held-out rows.
model.score(X=X_test, y=y_test)

0.5555555555555556