In [None]:
import gzip
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, make_scorer, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [None]:
def load_fashion_mnist(data_dir=''):
    """
    Loads Fashion MNIST dataset from its four gzip-compressed IDX files.

    Adapted from: https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the four ``*-ubyte.gz`` files. The default
        ('') resolves the file names relative to the current working
        directory, which is the behavior this function had before the
        parameter was added.

    Returns
    -------
    train_images, train_labels, test_images, test_labels : numpy.ndarray
        uint8 arrays. Labels are 1-D; images are 2-D with one row of 784
        values per label.

    Raises
    ------
    FileNotFoundError
        If one of the four files is missing from ``data_dir``.
    ValueError
        If an image file's payload cannot be reshaped to
        ``(number of labels, 784)``.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    # Number of leading bytes skipped in each file before the uint8 payload.
    LABEL_HEADER_BYTES = 8
    IMAGE_HEADER_BYTES = 16
    PIXELS_PER_IMAGE = 784

    def _read_payload(filename, header_bytes):
        # Decompress the whole file and view everything after the header as uint8.
        with gzip.open(os.path.join(data_dir, filename), 'rb') as fh:
            return np.frombuffer(fh.read(), dtype=np.uint8, offset=header_bytes)

    # Labels are read first because their length fixes the row count of the image arrays.
    train_labels = _read_payload(TRAIN_LABELS, LABEL_HEADER_BYTES)
    test_labels = _read_payload(TEST_LABELS, LABEL_HEADER_BYTES)

    train_images = _read_payload(TRAIN_IMAGES, IMAGE_HEADER_BYTES).reshape(len(train_labels), PIXELS_PER_IMAGE)
    test_images = _read_payload(TEST_IMAGES, IMAGE_HEADER_BYTES).reshape(len(test_labels), PIXELS_PER_IMAGE)

    return train_images, train_labels, test_images, test_labels

# Load the train/test images and labels into the notebook namespace.
train_images, train_labels, test_images, test_labels = load_fashion_mnist()

In [None]:
# Use the training labels as the target vector.
y = train_labels
# Display the shape of the label array.
y.shape

In [None]:
# Rescale features using min-max scaling.
scaler = MinMaxScaler()
# Learn the per-feature min/max from the training images only; X is the scaled training set.
X = scaler.fit_transform(train_images)
# Apply the scaling learned from the training set to the test images.
# (Calling fit_transform here would re-fit the scaler on the test set, so train and
# test would be scaled with different statistics and `scaler` would no longer
# describe the transformation that produced X.)
test_scaled = scaler.transform(test_images)
X.shape

In [None]:
# Keep only the first 10,000 samples (features and matching labels) as a smaller working set.
small_X, small_y = X[:10000], y[:10000]

In [None]:
# Sanity-check the shapes of the working subset.
print(small_X.shape)
print(small_y.shape)
# small_X and small_y are sliced as X[:10000] and y[:10000], so they match the
# shapes of X and y only if the full arrays have at most 10,000 rows.

In [None]:
# Create a nearest neighbor classifier: 3 neighbors, kd-tree neighbor search
clf = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree')
# Echo the estimator so its settings appear in the cell output
clf

In [None]:
# Cross-validated predictions for the working subset using 5 folds.
y_pred = cross_val_predict(clf, small_X, small_y, cv=5)

In [None]:
# Display the cross-validated predictions.
y_pred

In [None]:
# Confusion matrix of the cross-validated predictions over the ten class labels 0-9
# (true labels are passed first, predictions second).
cm1 = confusion_matrix(small_y, y_pred, labels=list(range(10)))
cm1

In [None]:
# Overall accuracy of the cross-validated predictions against the subset's true labels.
accuracy = accuracy_score(small_y, y_pred)
accuracy

In [None]:
# Binarize the subset labels with LabelBinarizer and display the result.
# NOTE(review): y_binarized is not used by any other cell visible in this notebook.
lb = LabelBinarizer()
y_binarized = lb.fit_transform(small_y)
y_binarized

In [None]:
# Defining all parameters for grid search
# Note – some are separated out (one-element lists per algorithm / metric) so that
# each grid search cell only explores one algorithm/metric pair, to decrease running time

# Base estimator shared by every grid search below
knn_tuning = KNeighborsClassifier()

# Candidate neighbor counts (odd values only):
#   low_k  -> 1, 3, 5, 7, 9
#   high_k -> 13, 15, 17, 19
#   test_k -> 15 (single value)
# NOTE(review): k = 11 is in neither low_k nor high_k – confirm whether that gap is intended.
low_k = range(1, 11, 2)
high_k = range(13, 21, 2)
test_k = range(15, 17, 2)

# Neighbor weighting schemes
weights = ['uniform', 'distance']

# Neighbor search algorithms: the full list, plus one-element lists for per-algorithm searches
algs = ['auto', 'ball_tree', 'kd_tree', 'brute']
alg_auto = ['auto']
alg_ball = ['ball_tree']
alg_kd = ['kd_tree']
alg_brute = ['brute']

# Distance metrics: the full list, plus one-element lists for per-metric searches
metrics = ['euclidean', 'manhattan', 'minkowski']
metric_euc = ['euclidean']
metric_man = ['manhattan']
# 'minkowski' is not searched on its own (no one-element list is defined for it).

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the kd-tree algorithm, and the Euclidean distance metric
# Takes ~18 minutes to run
params_kd_euc = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_kd,
    'metric': metric_euc,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_kd_euc = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_kd_euc,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_kd_euc.best_params_)
print(grid_kd_euc.best_score_)

In [None]:
# # Grid search over k in high_k (odd values 13-19), the kd-tree algorithm, and Euclidean distance metric
# # This search has not completed successfully yet – the server times out, and the status bar shows "Python 3.8 | Unknown"

# params_kd_euc = dict(n_neighbors=high_k, weights=weights, algorithm=alg_kd, metric=metric_euc)

# grid_kd_euc = GridSearchCV(knn_tuning, params_kd_euc, cv=5, scoring='accuracy')
# grid_kd_euc.fit(small_X, small_y)

# print(grid_kd_euc.best_params_)
# print(grid_kd_euc.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the kd-tree algorithm, and the Manhattan distance metric
# Takes ~18 minutes to run
params_kd_man = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_kd,
    'metric': metric_man,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_kd_man = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_kd_man,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_kd_man.best_params_)
print(grid_kd_man.best_score_)

In [None]:
# # Grid search over k in high_k (odd values 13-19), the kd-tree algorithm, and Manhattan distance metric
# params_kd_man = dict(n_neighbors=high_k, weights=weights, algorithm=alg_kd, metric=metric_man)

# grid_kd_man = GridSearchCV(knn_tuning, params_kd_man, cv=5, scoring='accuracy')
# grid_kd_man.fit(small_X, small_y)

# print(grid_kd_man.best_params_)
# print(grid_kd_man.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the 'auto' algorithm setting, and the Euclidean distance metric
# Takes ~2 minutes
params_auto_euc = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_auto,
    'metric': metric_euc,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_auto_euc = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_auto_euc,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_auto_euc.best_params_)
print(grid_auto_euc.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the 'auto' algorithm setting, and the Manhattan distance metric
# Takes ~2 minutes
params_auto_man = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_auto,
    'metric': metric_man,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_auto_man = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_auto_man,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_auto_man.best_params_)
print(grid_auto_man.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the brute force algorithm, and the Euclidean distance metric
# Took ~30 seconds
params_brute_euc = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_brute,
    'metric': metric_euc,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_brute_euc = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_brute_euc,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_brute_euc.best_params_)
print(grid_brute_euc.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the brute force algorithm, and the Manhattan distance metric
# Took about ~3 minutes
params_brute_man = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_brute,
    'metric': metric_man,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_brute_man = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_brute_man,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_brute_man.best_params_)
print(grid_brute_man.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the ball tree algorithm, and the Euclidean distance metric
# Took about ~14 minutes
params_ball_euc = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_ball,
    'metric': metric_euc,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_ball_euc = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_ball_euc,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_ball_euc.best_params_)
print(grid_ball_euc.best_score_)

In [None]:
# Grid search over odd k from 1 to 9 (low_k), both weighting schemes,
# the ball tree algorithm, and the Manhattan distance metric
# Took about ~13 minutes
params_ball_man = {
    'n_neighbors': low_k,
    'weights': weights,
    'algorithm': alg_ball,
    'metric': metric_man,
}

# 5-fold cross-validated search scored by accuracy; fit() returns the fitted search object
grid_ball_man = GridSearchCV(
    estimator=knn_tuning,
    param_grid=params_ball_man,
    scoring='accuracy',
    cv=5,
).fit(small_X, small_y)

# Best parameter combination and its mean cross-validated accuracy
print(grid_ball_man.best_params_)
print(grid_ball_man.best_score_)