In [None]:
# Import the necessary modules
import gc
import glob
import pickle

import cv2
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
def unpickle(file):
    """Load and return the object stored in the pickle file at ``file``.

    The file is opened in binary mode and unpickled with ``encoding='bytes'``,
    so string keys written by Python 2 come back as ``bytes`` (the rest of
    this notebook indexes the result with ``b'data'`` / ``b'labels'``).

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point this
    at files from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Directory that holds the extracted CIFAR-10 python batch files.
CIFAR_DIR = './Assignment2_BikeHorses/Assignment2_BikeHorses/cifar-10-python/cifar-10-batches-py'

# glob makes no promise about result order, so sort explicitly: the batch
# order decides the row order of every array built later, and the seeded
# train/test split is only reproducible if that order is stable.
image_paths_train = sorted(glob.glob(CIFAR_DIR + '/data_batch_*'))
image_paths_test = sorted(glob.glob(CIFAR_DIR + '/test_batch'))

# Single SIFT instance shared by every feature-extraction call below.
extractor = cv2.SIFT_create()

def features(image, extractor):
    """Run ``extractor.detectAndCompute`` on ``image`` and return its result.

    Parameters
    ----------
    image : the image handed to the extractor unchanged.
    extractor : any object exposing ``detectAndCompute(image, mask)``; this
        notebook passes the module-level SIFT instance.

    Returns
    -------
    (keypoints, descriptors) exactly as produced by the extractor. Callers in
    this notebook treat ``descriptors is None`` as "nothing detected".
    """
    # The second positional argument is passed as None (no mask supplied).
    found_keypoints, found_descriptors = extractor.detectAndCompute(image, None)
    return (found_keypoints, found_descriptors)

In [None]:
# Fail fast with a readable message when the dataset is not where the glob
# patterns expect it. Without this check a missing test batch surfaces as a
# bare IndexError, and a missing training set silently continues with the
# test batch only.
if not image_paths_train or not image_paths_test:
    raise FileNotFoundError(
        "CIFAR-10 batch files not found; check image_paths_train / image_paths_test"
    )

# One unpickled batch per file: every training batch first, then the test batch.
image_dict = [unpickle(path) for path in image_paths_train]
image_dict.append(unpickle(image_paths_test[0]))

# Pixel data of the first loaded batch.
dat = image_dict[0][b'data']

# Pixel rows of all batches stacked vertically into one array.
container = np.vstack([d[b'data'] for d in image_dict])
# Label sequences stacked the same way; presumably one row per batch, since
# each b'labels' entry looks like a flat list (confirm against the data).
labels = np.vstack([d[b'labels'] for d in image_dict])

In [None]:
def arrayToImage(img):
    """Turn one flat pixel array into a 32x32x3 image array.

    The input is cut into three consecutive thirds along its first axis; each
    third is reshaped to 32x32 and the three planes are stacked along a new
    last axis in the order first third, second third, last third (treated as
    red, green, blue).

    NOTE(review): assumes a planar layout of 3 * 32 * 32 values with all red
    values first - confirm against the dataset description. Any third that
    does not hold exactly 1024 values makes the reshape raise ValueError.
    """
    total = img.shape[0]
    # Boundaries of the three channel planes inside the flat array.
    cuts = (0, total // 3, 2 * total // 3, total)
    planes = [
        np.reshape(img[start:stop], (32, 32))
        for start, stop in zip(cuts, cuts[1:])
    ]
    return np.stack(planes, axis=2)

# Convert every flat pixel row of `container` into an image array.
images = [arrayToImage(row) for row in container]

# Filled by preprocess(): positions in `images` for which descriptors were found.
indicies_used = []

In [None]:
def preprocess(images):
    """Extract descriptors from every image and return them in one DataFrame.

    Each image is passed to ``features`` together with the module-level
    ``extractor``. Images for which no descriptors come back (``None``) are
    skipped. For every image that does produce descriptors, its position in
    ``images`` is appended to the module-level list ``indicies_used``.

    Parameters
    ----------
    images : sequence of images accepted by the extractor.

    Returns
    -------
    pandas.DataFrame
        All descriptor frames concatenated row-wise. Each per-image frame
        keeps its own row index, so index values repeat across images. An
        empty DataFrame is returned when no image yields descriptors.

    Notes
    -----
    This function only ever appends to ``indicies_used``; calling it twice
    without emptying that list first records the indices twice.
    """
    # Collect the per-image frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration, which is quadratic in the number of images.
    frames = []
    for position, image in enumerate(images):
        _, descriptor = features(image, extractor)
        if descriptor is not None:
            frames.append(pd.DataFrame(descriptor))
            indicies_used.append(position)

    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames, axis=0)

# preprocess() appends to the module-level `indicies_used`. Empty it in place
# first so that re-running this cell does not record every index a second
# time, which would misalign features and labels later on.
indicies_used.clear()

img2v = preprocess(images)
ind = pd.DataFrame(indicies_used)
print(ind.shape)
print(img2v)

In [None]:
# Size of the visual vocabulary. It must be the same number for the KMeans
# model and for the histogram length: previously the model was fitted with 12
# clusters while the histograms had 500 bins, so 488 feature columns were
# always zero. The value stays at 12, the model that was actually fitted;
# raise it here to experiment with a larger vocabulary.
n_clusters = 12

kmeans = KMeans(n_clusters=n_clusters, random_state=42, verbose=0)
kmeans.fit(img2v)

# One histogram of cluster assignments per image that has descriptors.
# Images without descriptors are skipped, mirroring preprocess(), so the
# rows line up with `indicies_used`.
im2v = []

for image in images:
    _, descriptor = features(image, extractor)
    if descriptor is None:
        continue
    # Assign all descriptors of the image in a single predict call, then
    # count how many fell into each cluster.
    words = kmeans.predict(descriptor)
    im2v.append(np.bincount(words, minlength=n_clusters).tolist())

In [None]:
# One row per image histogram.
df = pd.DataFrame([*im2v])

# Flat list of labels in loading order. It is derived from `image_dict`
# rather than from `labels`, because `labels` is rebound to a DataFrame at
# the end of this cell; reading it here would make the cell fail or
# misbehave when executed a second time.
lab = np.concatenate([d[b'labels'] for d in image_dict]).tolist()

# Keep only the labels of images that produced descriptors.
lab_final = [lab[i] for i in indicies_used]

# pd.concat(axis=1) aligns on the index and would silently pad with NaN if
# the two sides had different lengths, so check explicitly.
assert len(lab_final) == len(df), (
    "feature rows and labels are misaligned: "
    f"{len(df)} histograms vs {len(lab_final)} labels"
)

labels = pd.DataFrame({"labels": lab_final})
df = pd.concat([df, labels], axis=1)

In [None]:
# Feature matrix: every column of `df` except the target column.
df_train = df.drop(['labels'], axis=1)

# Stratified, seeded 80/20 split of features and labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df['labels'],
    test_size=0.20,
    random_state=42,
    stratify=df['labels'],
    shuffle=True,
)

# --- Hyper-parameter search with Optuna (currently disabled) ---------------
# import optuna
# from sklearn.model_selection import cross_val_score

# def objective_svm(trial):
#     C = trial.suggest_loguniform('C', 1e-10, 1e10)
#     gamma = trial.suggest_loguniform('gamma', 1e-10, 1e10)
#     clf = SVC(C=C, gamma=gamma)
#     return cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3).mean()

# def objective_lr(trial):
#     C = trial.suggest_loguniform('C', 1e-10, 1e10)
#     clf = LogisticRegression(C=C, random_state=42)
#     return cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3).mean()

# def objective_knn(trial):
#     n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
#     knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors)
#     return cross_val_score(knn_clf, X_train, y_train, n_jobs=-1, cv=3).mean()

# study_svm = optuna.create_study(direction='maximize')
# study_svm.optimize(objective_svm, n_trials=100)

# study_lr = optuna.create_study(direction='maximize')
# study_lr.optimize(objective_lr, n_trials=100)

# study_knn = optuna.create_study(direction='maximize')
# study_knn.optimize(objective_knn, n_trials=100)

# # Get best parameters
# best_params_svm = study_svm.best_params
# best_params_lr = study_lr.best_params
# best_params_knn = study_knn.best_params

# SVM with default hyper-parameters; the search above is disabled, so no
# tuned values are applied despite the `best_` prefix.
best_svm = SVC()
best_svm.fit(X_train, y_train)
y_pred = best_svm.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
test_accuracy = accuracy_score(y_test, y_pred)

print("SVM Accuracy:", test_accuracy)


In [None]:
# Logistic regression with default hyper-parameters and a fixed seed.
best_lr = LogisticRegression(random_state=42)
best_lr.fit(X_train, y_train)
y_pred = best_lr.predict(X_test)
# Score the predictions already computed instead of calling .score(), which
# would run inference on X_test a second time for the same number.
lr_accuracy = accuracy_score(y_test, y_pred)

# k-nearest-neighbours with default hyper-parameters.
best_knn = KNeighborsClassifier()
best_knn.fit(X_train, y_train)
y_pred = best_knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)

print("LR Accuracy:", lr_accuracy)
print("KNN Accuracy:", knn_accuracy)