In [119]:
import numpy as np 
import matplotlib.pyplot as plt 
import cv2
import os
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [120]:
def load_dataset(folder_path, image_size):
    images = []
    labels = []
    for label, class_name in enumerate(['Bikes', 'Horses']):
        class_folder = os.path.join(folder_path, class_name)
        for file_name in os.listdir(class_folder):
            img = cv2.imread(os.path.join(class_folder, file_name), cv2.IMREAD_GRAYSCALE)
            img = cv2.resize(img, image_size)  # Resize image to a common size
            images.append(img)
            labels.append(label)
    return np.array(images), np.array(labels)

In [121]:
def getDescriptorsArray(images, extractor):
    descriptors = []

    for i in range(0,len(images)): 
        keypoints,descriptor= extractor.detectAndCompute(images[i], None)
        descriptors.extend(descriptor)
    
    return np.array(descriptors)

In [122]:
def image2vec(images,kmeans,extractor,n_clusters):
    vec = []
    for image in images:

        keypoints,descriptor = extractor.detectAndCompute(image, None)
        img_vec = [0]*n_clusters
        for d in descriptor:
            s = d.reshape(1,-1)
            c = kmeans.predict(s)
            img_vec[c[0]] +=1 
        vec.append(img_vec)

    return np.array(vec)

In [124]:
def generate_codebook(descriptors, num_clusters):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, verbose=0)
    kmeans.fit(descriptors)
    return kmeans

In [125]:
folder_path = "./Assignment2_BikeHorses/Assignment2_BikeHorses"
images,labels = load_dataset(folder_path, (100,100))

extractor = cv2.SIFT_create()
descriptors = getDescriptorsArray(images, extractor)
print(descriptors)

[[  4. 100.  61. ...   0.   0.   0.]
 [ 33.   0.   0. ...   0.   3.  27.]
 [  2.  33. 138. ...   0.   0.  10.]
 ...
 [  0.   0.   9. ...   7.  18.  16.]
 [ 28.   0.   0. ...   3.   5.  23.]
 [ 62.  11.   0. ...   0.   0.  10.]]


In [126]:
num_clusters = 200

kmeans = generate_codebook(descriptors, num_clusters)
vec = image2vec(images,kmeans,extractor,num_clusters)

In [127]:
df = pd.DataFrame(vec)
labels = pd.DataFrame({"labels":labels})

In [128]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,1,0,1,0,0,0,0,0,0,1,...,0,1,0,0,0,2,0,0,0,0
1,0,0,0,0,1,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
2,1,1,2,0,0,1,1,0,2,0,...,1,0,1,0,0,0,0,0,1,0
3,0,0,0,1,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,2,0,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [129]:
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42, stratify=labels, shuffle=True)

In [130]:
svc = SVC()
svc.fit(X_train,y_train)
print(svc.score(X_test,y_test))

0.9444444444444444


  y = column_or_1d(y, warn=True)


In [131]:
svc_linear = SVC(kernel='linear')
svc_linear.fit(X_train,y_train)
print(svc_linear.score(X_test,y_test))

0.8888888888888888


  y = column_or_1d(y, warn=True)


In [132]:
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
print(lr.score(X_test,y_test))

0.9444444444444444


  y = column_or_1d(y, warn=True)
