In [1]:
# Import the necessary modules
import pickle # for loading and saving data
import glob # for finding files that match a pattern
import numpy as np # for numerical operations
import pandas as pd # for data manipulation and analysis
import gc # for garbage collection
from sklearn.cluster import KMeans # for clustering
from sklearn.model_selection import train_test_split # for splitting data into train and test sets
from sklearn.svm import SVC # for support vector machine classifier
from sklearn.linear_model import LogisticRegression # for logistic regression classifier
import cv2

# Define a function to load the data from a file
def unpickle(file):
    """Read one pickled file and return the object stored in it.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object the file contains. The callers in this notebook
        index the result with byte-string keys such as ``b'data'``.

    Notes
    -----
    ``encoding='bytes'`` makes Python 2 ``str`` objects come back as ``bytes``.
    Unpickling can execute arbitrary code, so only point this at files you trust.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Find the paths of the training and test data files.
# glob.glob returns its matches in arbitrary (filesystem-dependent) order, so the
# results are sorted: the batch order decides the row order of every array built
# from these files, and it must be the same on every run and on every machine.
DATA_DIR = './Assignment2_BikeHorses/Assignment2_BikeHorses/cifar-10-python/cifar-10-batches-py'
image_paths_train = sorted(glob.glob(DATA_DIR + '/data_batch_*'))
image_paths_test = sorted(glob.glob(DATA_DIR + '/test_batch'))

# SIFT detector/descriptor object shared by the feature-extraction code below
extractor = cv2.SIFT_create()

def features(image, extractor):
    """Run ``extractor.detectAndCompute`` on one image.

    Parameters
    ----------
    image
        The image handed to the extractor unchanged.
    extractor
        Any object exposing ``detectAndCompute(image, mask)``; no mask is used.

    Returns
    -------
    tuple
        ``(keypoints, descriptors)`` exactly as produced by the extractor.
        Callers in this notebook treat ``descriptors is None`` as "nothing found".
    """
    kps, descs = extractor.detectAndCompute(image, None)
    return kps, descs

# Load every training batch, in the order the paths are listed
image_dict = [unpickle(path) for path in image_paths_train]

# The test batch goes at the end of the same list, so all files are pooled together
image_dict.append(unpickle(image_paths_test[0]))

# Look at the keys of the first batch (only displayed when this is the cell's last expression)
image_dict[0].keys()

# Image data of the first batch only
dat = image_dict[0][b'data']

# One big matrix holding the image rows of every loaded batch, stacked vertically
container = np.vstack([batch[b'data'] for batch in image_dict])

# Labels stacked the same way: one row per loaded file
labels = np.vstack([batch[b'labels'] for batch in image_dict])

# Define a function to convert the image data from a 1D array to a 3D array
def arrayToImage(img):
    """Turn a flat pixel vector into a 32x32x3 image array.

    Parameters
    ----------
    img : numpy.ndarray
        1-D array whose three consecutive thirds are treated as the red,
        green and blue planes. Each third must hold exactly 32*32 values,
        otherwise the reshape raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        Array of shape (32, 32, 3) with the three planes along the last axis.
    """
    total = img.shape[0]
    # Same cut points as slicing [:n//3], [n//3:2*n//3], [2*n//3:n]
    cuts = (0, total // 3, 2 * total // 3, total)
    planes = [
        np.reshape(img[start:stop], (32, 32))
        for start, stop in zip(cuts[:-1], cuts[1:])
    ]
    return np.stack(planes, axis=2)

# Convert every flat row of the container into a 32x32x3 image array
images = [arrayToImage(row) for row in container]

# Positions (within `images`) of the images that yielded descriptors; filled by preprocess()
indicies_used = []

# Define a function to preprocess the images and extract features using SIFT
def preprocess(images):
    """Extract descriptors from every image and pool them into one dataframe.

    Each image is passed to ``features`` together with the module-level
    ``extractor``. Images for which no descriptors are returned (``None``)
    are skipped.

    Parameters
    ----------
    images : sequence
        Images in the form expected by ``features``.

    Returns
    -------
    pandas.DataFrame
        All descriptors stacked row-wise, one descriptor per row. Row labels
        restart at 0 for every image, as they come from the per-image frames.
        An empty dataframe is returned when no image produced descriptors.

    Side effects
    ------------
    Appends the position of every image that produced descriptors to the
    module-level list ``indicies_used``. The list is not cleared here, so
    calling this function twice without resetting it duplicates the entries.
    """
    # Collect the per-image frames and concatenate once at the end; concatenating
    # inside the loop re-copies everything gathered so far on every iteration.
    frames = []
    for idx, image in enumerate(images):
        _, descriptor = features(image, extractor)
        if descriptor is None:
            continue
        frames.append(pd.DataFrame(descriptor))
        indicies_used.append(idx)

    # pd.concat rejects an empty list, so keep returning an empty frame in that case
    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames, axis=0)

# Preprocess the images and get the pooled descriptors (one descriptor per row)
img2v = preprocess(images)

# Convert the list of indices of the images that produced descriptors to a dataframe
ind = pd.DataFrame(indicies_used)

# Size of the visual vocabulary. The same value must drive both the clustering and
# the histogram length: previously the model was fitted with 12 clusters while the
# histograms were 500 bins wide, which left 488 columns that could never be non-zero.
# The fitted model size (12) is kept; change it here to use a larger vocabulary.
n_clusters = 12

# Initialize a K-means object with a fixed random state and no verbosity
kmeans = KMeans(n_clusters=n_clusters, random_state=42, verbose=0)

# Fit the K-means object to the pooled descriptors
kmeans.fit(img2v)

# Initialize an empty list to store the image vectors
im2v = []

# Encode every image as a histogram of cluster assignments.
# The descriptors are extracted again here because preprocess() only returns them
# pooled together, without the per-image grouping.
for i in range(len(images)):
    _, descriptor = features(images[i], extractor)
    # Images without descriptors are skipped, mirroring preprocess(), so that the
    # rows of `im2v` stay aligned with `indicies_used`
    if descriptor is not None:
        # Assign all descriptors of the image in one call instead of one call each
        words = kmeans.predict(descriptor)
        # Count how many descriptors fell into each cluster
        im2v.append(np.bincount(words, minlength=n_clusters).tolist())

# Convert the list of image vectors to a dataframe (one row per encoded image)
df = pd.DataFrame(im2v)

# Flatten the stacked label rows into a single list, keeping their order
lab = [value for row in labels for value in row]

# Keep only the labels of the images that produced descriptors
lab_final = [lab[pos] for pos in indicies_used]

# Wrap the kept labels in a one-column dataframe (this rebinds `labels`)
labels = pd.DataFrame({"labels": lab_final})

# Attach the label column to the right of the image vectors (this rebinds `df`)
df = pd.concat([df, labels], axis=1)

# No columns are dropped in this cell; the 'labels' column is separated from the features in the next cell.



In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Feature matrix: every column of df except the target column
df_train = df.drop(columns=['labels'])

# Hold out 20% of the rows for testing; shuffled, stratified on the label, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df['labels'],
    test_size=0.20,
    random_state=42,
    shuffle=True,
    stratify=df['labels'],
)

from sklearn.metrics import accuracy_score


# import optuna
# from sklearn.model_selection import cross_val_score

# def objective_svm(trial):
#     C = trial.suggest_loguniform('C', 1e-10, 1e10)
#     gamma = trial.suggest_loguniform('gamma', 1e-10, 1e10)
#     clf = SVC(C=C, gamma=gamma)
#     return cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3).mean()

# def objective_lr(trial):
#     C = trial.suggest_loguniform('C', 1e-10, 1e10)
#     clf = LogisticRegression(C=C, random_state=42)
#     return cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3).mean()

# def objective_knn(trial):
#     n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
#     knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors)
#     return cross_val_score(knn_clf, X_train, y_train, n_jobs=-1, cv=3).mean()

# study_svm = optuna.create_study(direction='maximize')
# study_svm.optimize(objective_svm, n_trials=100)

# study_lr = optuna.create_study(direction='maximize')
# study_lr.optimize(objective_lr, n_trials=100)

# study_knn = optuna.create_study(direction='maximize')
# study_knn.optimize(objective_knn, n_trials=100)

# # Get best parameters
# best_params_svm = study_svm.best_params
# best_params_lr = study_lr.best_params
# best_params_knn = study_knn.best_params

# Train an SVM with its default hyperparameters (the Optuna search above is
# commented out, so there are no tuned parameters to apply)
best_svm = SVC()
best_svm.fit(X_train, y_train)

# accuracy_score expects (y_true, y_pred): compare the true test labels with the
# model's predictions for the held-out features, not the features with the labels
test_accuracy = accuracy_score(y_test, best_svm.predict(X_test))

print("SVM Accuracy:", test_accuracy)

# Initialize a support vector machine classifier
# clf = SVC()

# Fit the classifier to the training data
# clf.fit(X_train,y_train)

# # Evaluate the classifier on the test data
# print("SVC")
# print(clf.score(X_test,y_test))

# # Initialize a logistic regression classifier with a fixed random state
# clf = LogisticRegression(random_state=42)

# # Fit the classifier to the training data
# clf.fit(X_train, y_train)
# print("LR")
# # Evaluate the classifier on the test data
# print(clf.score(X_test,y_test))

# knn_clf = KNeighborsClassifier(n_neighbors=5)
# knn_clf.fit(X_train, y_train)

# knn_accuracy = knn_clf.score(X_test, y_test)
# print("KNN Accuracy:", knn_accuracy)


In [None]:

# Logistic regression with default settings and a fixed seed
best_lr = LogisticRegression(random_state=42).fit(X_train, y_train)
lr_accuracy = best_lr.score(X_test, y_test)

# k-nearest neighbours with default settings
best_knn = KNeighborsClassifier().fit(X_train, y_train)
knn_accuracy = best_knn.score(X_test, y_test)

# Report both held-out accuracies, logistic regression first
for caption, value in (("LR Accuracy:", lr_accuracy), ("KNN Accuracy:", knn_accuracy)):
    print(caption, value)



NameError: name 'best_params_lr' is not defined