In [50]:
import glob
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [51]:
def load_dataset(folder_path, image_size):
    """Load the Bikes/Horses images as resized grayscale arrays.

    Every readable file under ``folder_path/Bikes`` gets label 0 and every
    readable file under ``folder_path/Horses`` gets label 1.

    Args:
        folder_path: Directory that contains the ``Bikes`` and ``Horses``
            sub-folders.
        image_size: Target size, passed unchanged to ``cv2.resize``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays with one entry per
        successfully loaded image.
    """
    images = []
    labels = []
    for label, class_name in enumerate(['Bikes', 'Horses']):
        class_folder = os.path.join(folder_path, class_name)
        # os.listdir gives no ordering guarantee; sorting keeps the dataset
        # order (and everything seeded downstream) reproducible.
        for file_name in sorted(os.listdir(class_folder)):
            img = cv2.imread(os.path.join(class_folder, file_name), cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals an unreadable / non-image file by
                # returning None instead of raising; skip it rather than
                # letting cv2.resize fail on it.
                continue
            img = cv2.resize(img, image_size)  # Resize image to a common size
            images.append(img)
            labels.append(label)
    return np.array(images), np.array(labels)

In [52]:
def getDescriptorsArray(images, extractor):
    """Collect the descriptors of all images into one array.

    Args:
        images: Sequence of images handed one by one to the extractor.
        extractor: Object exposing ``detectAndCompute(image, mask)`` that
            returns ``(keypoints, descriptors)``.

    Returns:
        NumPy array holding every descriptor row of every image, in image
        order. Images for which the extractor returns no descriptors
        contribute nothing.
    """
    descriptors = []

    for image in images:
        _, descriptor = extractor.detectAndCompute(image, None)
        if descriptor is None:
            # No keypoints were found in this image; extending with None
            # would raise a TypeError.
            continue
        descriptors.extend(descriptor)

    return np.array(descriptors)

In [None]:
def image2vec(images,kmeans,extractor,n_clusters=200):
    """Turn every image into a histogram of cluster ("visual word") counts.

    Args:
        images: Sequence of images handed one by one to the extractor.
        kmeans: Fitted clustering model exposing ``predict``.
        extractor: Object exposing ``detectAndCompute(image, mask)`` that
            returns ``(keypoints, descriptors)``.
        n_clusters: Length of each histogram; must cover every cluster
            index ``kmeans.predict`` can return.

    Returns:
        NumPy array with one row per image; entry ``[i, c]`` is the number
        of descriptors of image ``i`` assigned to cluster ``c``. An image
        without descriptors yields an all-zero row so that rows stay
        aligned with ``images``.

    Raises:
        IndexError: If a predicted cluster index is >= ``n_clusters``.
    """
    im2v = []
    for image in images:
        _, descriptor = extractor.detectAndCompute(image, None)
        if descriptor is None or len(descriptor) == 0:
            # No keypoints found: keep an empty histogram instead of crashing.
            im2v.append([0] * n_clusters)
            continue
        # One batched predict per image replaces one call per descriptor.
        words = np.asarray(kmeans.predict(descriptor))
        histogram = np.bincount(words, minlength=n_clusters)
        if len(histogram) != n_clusters:
            raise IndexError(
                "kmeans predicted a cluster index outside range(n_clusters)"
            )
        im2v.append(histogram.tolist())
    return np.array(im2v)

In [53]:
# load_dataset requires both a dataset folder and a target size; calling it
# without arguments raises a TypeError.
# TODO(review): the two values below are placeholders -- set them to the folder
# that holds the Bikes/ and Horses/ sub-folders and to the size used originally.
DATASET_DIR = "dataset"
IMAGE_SIZE = (256, 256)
images, labels = load_dataset(DATASET_DIR, IMAGE_SIZE)

# Prefer the SIFT factory of the main cv2 module and fall back to the contrib
# module, which is the only place older OpenCV builds expose it.
if hasattr(cv2, "SIFT_create"):
    extractor = cv2.SIFT_create()
else:
    extractor = cv2.xfeatures2d.SIFT_create()
descriptors = getDescriptorsArray(images, extractor)

In [55]:
# Size of the visual vocabulary; shared by the clustering step and the
# histogram length so the two cannot drift apart.
N_CLUSTERS = 200
# n_init is pinned explicitly: the library default is version dependent (the
# recorded warning shows it was 10 when this notebook was run).
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42, verbose=0).fit(descriptors)
vec = image2vec(images, kmeans, extractor, N_CLUSTERS)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# One row per image: the cluster-count columns followed by a "labels" column.
# `labels` itself is left untouched (it is no longer rebound to a DataFrame),
# so this cell gives the same result when it is re-run.
df = pd.DataFrame(vec).assign(labels=np.asarray(labels).ravel())

In [None]:
# Preview the first rows of the cluster-count table together with its labels column.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,labels
0,7,5,9,14,1,12,8,10,7,1,...,4,6,12,7,9,6,7,3,4,0
1,7,10,13,14,27,8,14,11,3,5,...,24,7,6,13,9,8,7,10,21,0
2,4,2,3,3,5,18,4,0,1,6,...,1,2,4,2,9,3,7,1,1,0
3,0,2,0,3,1,26,0,0,0,0,...,0,1,1,5,2,2,0,0,18,0
4,4,7,0,6,20,21,9,7,4,8,...,10,3,1,4,3,0,4,1,0,0


In [None]:
# Separate the feature columns from the target, then hold out 20% of the rows
# with a stratified, seeded shuffle.
# NOTE(review): the KMeans vocabulary above was fitted on descriptors from all
# images, i.e. before this split -- confirm that this is acceptable here.
df_train = df.drop(columns=['labels'])
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df['labels'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df['labels'],
)

In [None]:
# SVC with default settings, scored (mean accuracy) on the held-out split.
svc = SVC().fit(X_train, y_train)
print(svc.score(X_test, y_test))

In [None]:
# Same evaluation with a linear kernel instead of the default one.
svc_linear = SVC(kernel='linear').fit(X_train, y_train)
print(svc_linear.score(X_test, y_test))

1.0

In [None]:
# Logistic regression on the same split, seeded for reproducibility.
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
print(lr.score(X_test, y_test))

1.0

## Subpart 2: CIFAR 10

The data is loaded as described in the CIFAR-10 dataset documentation (the page linked from the assignment slides).

In [None]:
def unpickle(file):
    """Load one pickled batch file and return the object stored in it.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object. Because the file is decoded with
        ``encoding='bytes'``, the batch dictionaries loaded in this notebook
        come back with ``bytes`` keys such as ``b'data'``.
    """
    import pickle

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only use this on batch files obtained from a trusted source.
    with open(file, 'rb') as fo:
        # Named `batch` so the builtin `dict` is not shadowed.
        batch = pickle.load(fo, encoding='bytes')
    return batch

In [None]:
# glob returns paths in arbitrary order; sorting makes the batch order (and
# therefore the row order of everything derived from it) reproducible.
image_paths_train = sorted(glob.glob('cifar-10-batches-py/data_batch_*'))
image_paths_test = glob.glob('cifar-10-batches-py/test_batch')
if not image_paths_train or not image_paths_test:
    # Fail with a clear message instead of an IndexError on the [0] below.
    raise FileNotFoundError(
        "CIFAR-10 batch files not found under 'cifar-10-batches-py/'"
    )

# Training batches first, the test batch appended last.
image_dict = [unpickle(path) for path in image_paths_train]
image_dict.append(unpickle(image_paths_test[0]))


In [None]:
# Inspect which fields the first loaded batch dictionary provides.
image_dict[0].keys()

dict_keys([b'batch_label', b'labels', b'data', b'filenames'])

In [None]:
# Pixel data of the first loaded batch, kept for a quick shape check.
dat = image_dict[0][b'data']

In [None]:
# Show the dimensions of the first batch's data array.
dat.shape

(10000, 3072)

In [None]:
# Stack the per-batch arrays on top of each other: `container` gets the rows of
# every batch's data array, `labels` gets one row of labels per batch.
container = np.vstack([batch[b'data'] for batch in image_dict])
labels = np.vstack([batch[b'labels'] for batch in image_dict])

In [None]:
def arrayToImage(img):
    """Rebuild a 32x32x3 image array from one flat data row.

    The row is cut into three equal consecutive parts (red, green and blue
    planes, in that order); each part is reshaped to 32x32 and the three
    planes are joined along a new last axis.
    """
    red, green, blue = (plane.reshape(32, 32) for plane in np.split(img, 3))
    return np.dstack((red, green, blue))

In [None]:
import gc
# Convert every flat row of `container` into an image array.
images = [arrayToImage(flat_row) for flat_row in container]

60

In [None]:
# Positions (within `images`) of the images for which descriptors were found;
# preprocess() appends to this list as a side effect.
indicies_used = [] 

The same steps as above are followed to obtain a classical tabular dataset (one row of visual-word counts per image, like a spreadsheet).

In [None]:
def preprocess(images):
    """Gather the descriptors of all images into a single DataFrame.

    Side effect: the module-level list ``indicies_used`` is emptied and then
    refilled with the position of every image that produced descriptors, so
    calling this function again does not leave duplicate entries behind.

    Args:
        images: Sequence of images passed one by one to ``features``.

    Returns:
        DataFrame with one row per descriptor, the per-image blocks stacked
        in image order (each block keeps its own 0-based row index). An
        empty DataFrame is returned when no image produced descriptors.
    """
    indicies_used.clear()
    frames = []
    for position, image in enumerate(images):
        # NOTE(review): `features` and `extractor` are module-level names;
        # `features` is not defined in the visible part of this notebook --
        # confirm it is defined (it is used as returning a 2-tuple whose
        # second item is the descriptor matrix or None).
        _, descriptor = features(image, extractor)
        if descriptor is not None:
            frames.append(pd.DataFrame(descriptor))
            indicies_used.append(position)
    if not frames:
        return pd.DataFrame([])
    # A single concat at the end avoids re-copying the growing frame on
    # every iteration.
    return pd.concat(frames, axis=0)

In [None]:
# Collect the descriptors of all images (this also fills indicies_used).
# Note: despite its name, img2v holds one row per descriptor, not one per image.
img2v = preprocess(images)

In [None]:
# Wrap the kept image positions in a DataFrame to see how many images yielded descriptors.
ind = pd.DataFrame(indicies_used)
ind.shape

(59880, 1)

In [None]:
# Display the stacked descriptor table returned by preprocess().
img2v

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,...,112.0,5.0,1.0,15.0,62.0,11.0,2.0,33.0,33.0,1.0
1,3.0,0.0,0.0,4.0,86.0,108.0,74.0,73.0,60.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19.0,10.0,5.0,5.0,3.0,5.0,7.0,24.0,28.0,2.0,...,8.0,6.0,2.0,33.0,100.0,23.0,1.0,0.0,9.0,10.0
3,32.0,8.0,7.0,35.0,29.0,8.0,1.0,4.0,146.0,16.0,...,0.0,1.0,1.0,10.0,54.0,52.0,4.0,1.0,4.0,2.0
4,0.0,0.0,1.0,14.0,131.0,32.0,3.0,1.0,43.0,11.0,...,3.0,6.0,7.0,27.0,54.0,95.0,6.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,71.0,71.0,0.0,0.0,0.0,0.0,1.0,19.0,150.0,68.0,...,5.0,1.0,59.0,150.0,61.0,6.0,0.0,2.0,3.0,2.0
12,25.0,28.0,69.0,52.0,7.0,3.0,2.0,4.0,23.0,93.0,...,115.0,48.0,17.0,11.0,29.0,4.0,0.0,1.0,51.0,84.0
13,20.0,38.0,21.0,12.0,12.0,4.0,0.0,5.0,124.0,62.0,...,0.0,4.0,10.0,87.0,33.0,0.0,0.0,0.0,0.0,0.0
14,0.0,7.0,113.0,147.0,38.0,0.0,0.0,0.0,52.0,73.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0


In [None]:
# Vocabulary size for the CIFAR-10 experiment; defined before use so the
# clustering and the histogram length below share one value.
# (KMeans is already imported in the notebook's first cell.)
n_clusters = 12
# n_init is pinned explicitly because the library default is version dependent.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42, verbose=0).fit(img2v)

In [None]:
# One histogram of cluster counts per image that has descriptors.
im2v = []
for image in images:
    _, descriptor = features(image, extractor)
    if descriptor is None:
        # Skipped on purpose: only images that produced descriptors get a row,
        # matching the positions collected in indicies_used.
        continue
    if len(descriptor):
        # One batched predict per image replaces one call per descriptor.
        words = kmeans.predict(descriptor)
        img_vec = np.bincount(words, minlength=n_clusters).tolist()
    else:
        img_vec = [0] * n_clusters
    im2v.append(img_vec)


In [None]:
# Tabulate the per-image histograms: one row per image, one column per cluster.
df = pd.DataFrame(im2v)

In [None]:
# Display the stacked label array (one row per loaded batch).
labels

array([[6, 9, 9, ..., 1, 1, 5],
       [1, 6, 6, ..., 7, 2, 5],
       [1, 8, 5, ..., 9, 1, 1],
       [0, 6, 0, ..., 6, 2, 8],
       [8, 5, 0, ..., 3, 7, 1],
       [3, 8, 8, ..., 5, 1, 7]])

In [None]:
# Flatten the per-batch label rows into one list, batch after batch.
lab = [value for batch_labels in labels for value in batch_labels]

In [None]:
# Keep only the labels of the images that produced descriptors.
lab_final = [lab[kept_position] for kept_position in indicies_used]

In [None]:
# Histograms and labels must describe the same images; a length mismatch would
# otherwise only show up later as missing values.
assert len(im2v) == len(lab_final), "histogram rows and labels are misaligned"
# One row per image: cluster-count columns followed by a "labels" column.
# `labels` is no longer rebound to a DataFrame, so earlier cells that iterate
# over the label array still work when re-run.
df = pd.DataFrame(im2v).assign(labels=lab_final)

In [None]:
# Display the per-image cluster counts together with their labels.
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,labels
0,0,0,2,10,0,0,0,1,1,1,3,0,0,6
1,1,0,1,0,2,3,3,5,0,3,0,1,0,9
2,2,1,1,3,0,0,1,0,2,1,1,1,2,9
3,3,0,3,2,0,0,0,3,0,1,2,0,0,4
4,4,1,1,1,1,5,1,2,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59875,59875,3,7,0,3,0,2,0,2,0,3,0,1,8
59876,59876,2,1,1,1,1,0,2,4,0,1,2,3,3
59877,59877,0,1,4,0,3,1,0,0,0,2,2,0,5
59878,59878,1,3,1,0,3,1,2,2,2,2,0,0,1


In [None]:
# Keep only the cluster-count columns as features.
# 'Unnamed: 0' appears in the displayed table (presumably from a CSV round-trip
# -- confirm) but not in the frame built in the cell above; errors='ignore'
# makes the drop work for both variants instead of raising a KeyError.
df_train = df.drop(columns=['Unnamed: 0', 'labels'], errors='ignore')

SVC <br/>
Train/test split: test size = 0.2

In [None]:
# train_test_split and SVC are already imported in the notebook's first cell,
# so they are not re-imported here.
# Hold out 20% of the rows with a stratified, seeded shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df['labels'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df['labels'],
)
clf = SVC()
clf.fit(X_train, y_train)

In [None]:
# Score the fitted SVC on the held-out split.
clf.score(X_test,y_test)

0.2348864395457582

Logistic Regression

In [None]:
# LogisticRegression is already imported in the notebook's first cell.
# Note: `clf` is rebound here from the SVC above to the logistic-regression model.
clf = LogisticRegression(random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

0.218937875751503