In [None]:
import gzip
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, roc_curve, auc, make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import label_binarize

In [None]:
def load_fashion_mnist(data_dir='.'):
    """
    Loads Fashion MNIST dataset.

    Reads the four gzip-compressed IDX files (train/test images and labels)
    found in ``data_dir`` and returns their payloads as NumPy arrays.

    Adapted from: https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory that holds the ``*-ubyte.gz`` files. The default reads
        from the current working directory, as the original loader did.

    Returns
    -------
    train_images, train_labels, test_images, test_labels : np.ndarray
        ``uint8`` arrays. Label arrays are 1-D; image arrays have one
        flattened 784-value image per row, i.e. shape ``(n_samples, 784)``.

    Raises
    ------
    OSError
        If one of the files cannot be opened (e.g. it is missing).
    ValueError
        If an image payload cannot be reshaped to ``(n_labels, 784)``.
    """
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    # Bytes to skip at the start of each file before the uint8 payload begins.
    LABEL_HEADER_BYTES = 8
    IMAGE_HEADER_BYTES = 16
    PIXELS_PER_IMAGE = 784

    def _read_payload(filename, header_bytes):
        """Return the uint8 payload of one gzipped file, skipping its header."""
        with gzip.open(os.path.join(data_dir, filename), 'rb') as idx_file:
            return np.frombuffer(idx_file.read(), dtype=np.uint8, offset=header_bytes)

    # Labels are read first because their count fixes the row count of the images.
    train_labels = _read_payload(TRAIN_LABELS, LABEL_HEADER_BYTES)
    test_labels = _read_payload(TEST_LABELS, LABEL_HEADER_BYTES)

    train_images = _read_payload(TRAIN_IMAGES, IMAGE_HEADER_BYTES).reshape(
        len(train_labels), PIXELS_PER_IMAGE)
    test_images = _read_payload(TEST_IMAGES, IMAGE_HEADER_BYTES).reshape(
        len(test_labels), PIXELS_PER_IMAGE)

    return train_images, train_labels, test_images, test_labels

# Load both splits once; the loader is called without arguments here, so the
# four .gz files must be reachable from the current working directory.
train_images, train_labels, test_images, test_labels = load_fashion_mnist()

In [None]:
# Start from the raw training label array returned by the loader.
y = train_labels
# Alternative kept for reference: wrap the labels in a one-column DataFrame.
#y = pd.DataFrame(train_labels, columns = ['labels'])
y

In [None]:
# One-hot encode the integer labels: one indicator column per class.
# All ten class ids (0-9) must be listed; a label missing from `classes` is
# encoded as an all-zero row, which a later argmax would read as class 0.
# Binarizing from `train_labels` (rather than from `y` itself) keeps this cell
# safe to re-run.
y = label_binarize(train_labels, classes=list(range(10)))

In [None]:
# Bar chart of how many training images fall into each class (kept disabled)
#ax = pd.Series(train_labels).value_counts().sort_index().plot(kind = 'bar', figsize=(14,10), title="Fashion MNIST class distribution", fontsize=20)
#ax.set_xlabel("Class label", fontsize=20)
#ax.set_ylabel("Total amount", fontsize=20)
#ax.bar_label(ax.containers[0], fontsize=20)

In [None]:
# Rescale the pixel features with min-max scaling.
# The scaler learns its per-feature min/max from the training images only.
scaler = MinMaxScaler()
X = scaler.fit_transform(train_images)
# X is the scaled training set. The test images reuse the training statistics
# via transform(); calling fit_transform() here would refit the scaler on the
# test data and put the two splits on different scales.
test_scaled = scaler.transform(test_images)

In [None]:
# Sanity check: dimensions of the scaled training matrix.
X.shape

In [None]:
# Keep the first ten samples (features and matching targets) as a tiny subset
# for quick experiments, then display the target subset's shape.
small_X, small_y = X[:10], y[:10]
small_y.shape

In [None]:
# Create a 3-nearest-neighbour classifier that searches by brute force.
# Tree indexes such as 'kd_tree' lose their advantage on high-dimensional
# inputs like these 784-feature images, while brute force still returns the
# exact nearest neighbours.
clf = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
clf

In [None]:
# Define a custom scorer based on ROC AUC.
# roc_auc_score takes (y_true, y_score) directly and understands `multi_class`;
# `auc` instead expects the x/y coordinates of an already computed curve, so it
# cannot be used as a scoring function.
# NOTE(review): newer scikit-learn releases replace `needs_proba=True` with
# `response_method="predict_proba"` - confirm against the installed version.
roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True, multi_class="ovo")
roc_auc_scorer

In [None]:
# Define a stratified 10-fold cross-validation splitter.
# Shuffling with a fixed random_state keeps the fold assignment reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv

In [None]:
# Confusion matrix on the ten-sample subset: fit on the subset, then predict
# those same samples back.
clf.fit(small_X, small_y)
y_pred = clf.predict(small_X)
# Targets and predictions are indicator rows; argmax turns each row into a
# class index before the matrix is built over class ids 0-9.
# NOTE(review): argmax maps an all-zero row to class 0 - confirm that is acceptable.
cm = confusion_matrix(
    small_y.argmax(axis=1),
    y_pred.argmax(axis=1),
    labels=list(range(10)),
)
cm

In [None]:
# Confusion matrix on the full training set: fit on every training sample,
# then predict the same samples back.
clf.fit(X, y)
y_pred = clf.predict(X)
# Reduce the indicator rows of targets and predictions to class indices and
# tabulate them over class ids 0-9.
cm = confusion_matrix(
    y.argmax(axis=1),
    y_pred.argmax(axis=1),
    labels=list(range(10)),
)
cm

In [None]:
# The classifier was fitted on a 2-D indicator target, so predict_proba returns
# a list with one (n_samples, n_classes) array per label column; the list has
# to be indexed by label before slicing columns.
# Shown here: probability that each subset sample carries label 1.
clf.predict_proba(small_X)[1][:, 1]

In [None]:
# Show the complete predict_proba output for the ten-sample subset.
clf.predict_proba(small_X)

In [None]:
# Per-class (one-vs-rest) ROC curves on the ten-sample subset, keyed by label index.
fpr = dict()
tpr = dict()
roc_auc = dict()
urc = dict()
# predict_proba does not change between iterations, so compute it once.
# For a 2-D indicator target it is a list with one array per label column.
# NOTE(review): column 1 is read as P(label present); this assumes clf was last
# fitted on the full (X, y), where every label column contains both 0 and 1.
subset_proba = clf.predict_proba(small_X)
for i in range(small_y.shape[1]):
    # Scores come from small_X, so the targets must come from small_y as well.
    # A ROC curve needs both positives and negatives; skip labels the subset lacks.
    if np.unique(small_y[:, i]).size < 2:
        continue
    fpr[i], tpr[i], _ = roc_curve(small_y[:, i], subset_proba[i][:, 1])
    urc[i] = 1 - fpr[i]
    roc_auc[i] = auc(fpr[i], tpr[i])

# Average only over the labels for which a curve could be computed.
print("ROC AUC score: {:.3f}".format(np.mean(list(roc_auc.values()))))

In [None]:
# Compute the ROC curve and the URC curve for the ten-sample subset,
# micro-averaged over all classes.
# roc_curve only accepts binary targets, so the indicator matrix and the
# matching per-label scores are flattened into one binary problem.
# predict_proba yields one array per label column; column 1 of each is stacked
# into an (n_samples, n_labels) score matrix aligned with small_y.
# NOTE(review): assumes clf was last fitted on the full (X, y), so every label
# column has both classes and column 1 is P(label present).
y_proba = np.column_stack([label_proba[:, 1] for label_proba in clf.predict_proba(small_X)])
fpr, tpr, _ = roc_curve(small_y.ravel(), y_proba.ravel())
urc = 1 - fpr

print("ROC AUC score: {:.3f}".format(auc(fpr, tpr)))

# Plot the ROC curve and the URC curve
plt.plot(fpr, tpr, label="ROC curve")
plt.plot(urc, tpr, label="URC curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.legend()
plt.show()

In [None]:

# Confusion matrix on the full training set (fit, then predict the same data).
y_pred = clf.fit(X, y).predict(X)
# confusion_matrix rejects multilabel indicator input, so targets and
# predictions are reduced to class indices with argmax first. The label list
# is derived from the number of indicator columns to keep the matrix square.
cm = confusion_matrix(y.argmax(axis=1), y_pred.argmax(axis=1), labels=list(range(y.shape[1])))
print("Confusion matrix:\n", cm)

In [None]:
# Compute the ROC curve and the URC curve for the full training set,
# micro-averaged over all classes.
# roc_curve only accepts binary targets, and predict_proba returns a list with
# one array per label column for a 2-D indicator target. Column 1 of each array
# is stacked into an (n_samples, n_labels) score matrix, then targets and
# scores are flattened into one binary problem.
# NOTE(review): assumes clf is fitted on the full (X, y), so every label column
# has both classes and column 1 is P(label present).
y_proba_full = np.column_stack([label_proba[:, 1] for label_proba in clf.predict_proba(X)])
fpr, tpr, _ = roc_curve(y.ravel(), y_proba_full.ravel())
urc = 1 - fpr

print("ROC AUC score: {:.3f}".format(auc(fpr, tpr)))

# Plot the ROC curve and the URC curve
plt.plot(fpr, tpr, label="ROC curve")
plt.plot(urc, tpr, label="URC curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.legend()
plt.show()