In [None]:
import gzip
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, auc, make_scorer, accuracy_score
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [None]:
def load_fashion_mnist(data_dir='.'):
    """
    Loads the Fashion MNIST dataset from its four gzip-compressed files.

    Adapted from: https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory that contains the four ``*-ubyte.gz`` files. The default
        reads from the current working directory, as before.

    Returns
    -------
    train_images, train_labels, test_images, test_labels : numpy.ndarray
        uint8 arrays. The label arrays are 1-D; each image array has shape
        ``(len(labels), 784)``, i.e. one flattened image per row.
    """
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    def _read_labels(filename):
        # The first 8 bytes of a label file are header, not labels.
        with gzip.open(os.path.join(data_dir, filename), 'rb') as labels_file:
            return np.frombuffer(labels_file.read(), dtype=np.uint8, offset=8)

    def _read_images(filename, n_images):
        # The first 16 bytes of an image file are header; the remainder is
        # reshaped to one 784-value row per label.
        with gzip.open(os.path.join(data_dir, filename), 'rb') as images_file:
            pixels = np.frombuffer(images_file.read(), dtype=np.uint8, offset=16)
        return pixels.reshape(n_images, 784)

    # Labels are read first because their count fixes the image row count.
    train_labels = _read_labels(TRAIN_LABELS)
    test_labels = _read_labels(TEST_LABELS)
    train_images = _read_images(TRAIN_IMAGES, len(train_labels))
    test_images = _read_images(TEST_IMAGES, len(test_labels))

    return train_images, train_labels, test_images, test_labels

# Load both splits once; the resulting module-level arrays feed the cells below.
train_images, train_labels, test_images, test_labels = load_fashion_mnist()

In [None]:
# Use the training labels as the target vector and display its shape.
y = train_labels
y.shape

In [None]:
# Rescale features using min-max scaling.
# The scaler is fitted on the training images only. The test images are then
# mapped with those same per-feature minima/maxima, so both splits share one
# scale and no test-set statistics leak into preprocessing.
scaler = MinMaxScaler()
X = scaler.fit_transform(train_images)  # X is the scaled training set
test_scaled = scaler.transform(test_images)
X.shape

In [None]:
# Take the first 10000 scaled samples and their labels as a smaller working
# set (presumably to keep the experiments below fast).
small_X, small_y = X[:10000, :], y[:10000]

In [None]:
print(small_X.shape)
print(small_y.shape)
# small_X keeps every column of X but only (at most) its first 10000 rows
# small_y holds the matching leading (at most 10000) entries of y

In [None]:
# Create a nearest neighbor classifier
clf = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree')
clf

In [None]:
# Class predictions for the subset obtained through 5-fold cross-validation
y_pred = cross_val_predict(estimator=clf, X=small_X, y=small_y, cv=5)

In [None]:
# Display the cross-validated predictions
y_pred

In [None]:
# Confusion matrix over the ten class labels 0..9, in that order
cm1 = confusion_matrix(small_y, y_pred, labels=list(range(10)))
cm1

In [None]:
# Fraction of cross-validated predictions that match the true labels
accuracy = accuracy_score(y_true=small_y, y_pred=y_pred)
accuracy

In [None]:
# Class-membership probabilities for the ROC analysis.
# clf itself has not been fitted at this point (cross_val_predict only fits
# internal clones), so calling clf.predict_proba directly would raise
# NotFittedError. Out-of-fold probabilities from the same 5-fold scheme give
# held-out scores for every sample; columns follow the sorted class labels.
y_score = cross_val_predict(clf, small_X, small_y, cv=5, method='predict_proba')
y_score

In [None]:
# One-hot (one-vs-rest) encoding of the subset labels for the ROC computation
lb = LabelBinarizer()
lb.fit(small_y)
y_binarized = lb.transform(small_y)
y_binarized

In [None]:
# Compute the one-vs-rest ROC curve and its AUC for each class,
# stored under the class's column index
fpr, tpr, roc_auc = {}, {}, {}
n_classes = len(lb.classes_)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_binarized[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# Micro-average: flatten all (sample, class) pairs into a single binary
# problem and compute one ROC curve / AUC over it
fpr["micro"], tpr["micro"], _ = roc_curve(y_binarized.reshape(-1), y_score.reshape(-1))
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Macro-average: interpolate every per-class curve onto the union of all
# false-positive-rate points, then average the TPR values across classes
all_fpr = np.unique(np.hstack([fpr[k] for k in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr = mean_tpr + np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr = mean_tpr / n_classes
fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

In [None]:
# Plot the micro- and macro-averaged ROC curves together with the diagonal
plt.figure(figsize=(8,6))
lw = 2
for average, colour in (("micro", "deeppink"), ("macro", "navy")):
    curve_label = '{0}-average ROC curve (AUC = {1:0.2f})'.format(average, roc_auc[average])
    plt.plot(fpr[average], tpr[average], label=curve_label,
             color=colour, linestyle=':', linewidth=lw)

# Dashed black diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Example')
plt.legend(loc="lower right")
plt.show()

Feature Selection

In [None]:
# Absolute Pearson correlation between each feature column and the label,
# computed on the same subset (small_X / small_y) that the rest of this
# section works with, instead of mixing in the full X / y.
# NOTE: a column that is constant over the subset has zero variance, so
# np.corrcoef returns NaN for it.
corr_coef = [
    abs(np.corrcoef(small_X[:, i], small_y)[0, 1])
    for i in range(small_X.shape[1])
]

In [None]:
# Keep the 400 features that score highest under f_classif
selector = SelectKBest(score_func=f_classif, k=400)
X_selected = selector.fit_transform(small_X, small_y)
# Length of one sample after selection
X_selected[0].shape

In [None]:
# Store the column indices of small_X that the selector kept (nothing is printed here)
selected_features = selector.get_support(indices=True)

In [None]:
# Show the first sample after feature selection in its original 28x28 layout.
# inverse_transform puts the kept features back at their original column
# positions and fills the dropped ones with zeros. The picture therefore stays
# spatially meaningful and no longer depends on k being exactly 400 (20x20).
plt.figure()
plt.imshow(selector.inverse_transform(X_selected[:1]).reshape(28, 28))
plt.colorbar()
plt.grid(False)
plt.show()

In [None]:
# Reference view: the first sample of small_X drawn as a 28x28 image
plt.figure()
plt.imshow(np.reshape(small_X[0], (28, 28)))
plt.colorbar()
plt.grid(False)
plt.show()

In [None]:
# 5-fold cross-validated predictions using only the selected features
y_pred2 = cross_val_predict(estimator=clf, X=X_selected, y=small_y, cv=5)

In [None]:
# Confusion matrix for the feature-selected predictions, classes ordered 0..9
cm2 = confusion_matrix(small_y, y_pred2, labels=list(range(10)))
cm2

In [None]:
# Accuracy of the cross-validated predictions made on the selected features
accuracy2 = accuracy_score(y_true=small_y, y_pred=y_pred2)
accuracy2

In [None]:
# Fit clf itself on the selected features; the earlier cross_val_predict
# calls only fitted internal clones, so clf is unfitted until this line.
clf.fit(X_selected,small_y)

In [None]:
# Out-of-fold class probabilities (5-fold CV) on the selected features.
# Scoring the very samples clf was fitted on would inflate the ROC curves
# (with k-NN each point is among its own nearest neighbours), and it would not
# be comparable to accuracy2, which is also cross-validated.
y_score2 = cross_val_predict(clf, X_selected, small_y, cv=5, method='predict_proba')
y_score2

In [None]:
# Per-class (one-vs-rest) ROC curves and AUC for the feature-selected scores
fpr, tpr, roc_auc = {}, {}, {}
n_classes = len(lb.classes_)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_binarized[:, i], y_score2[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: one ROC curve over all flattened (sample, class) pairs
fpr["micro"], tpr["micro"], _ = roc_curve(y_binarized.reshape(-1), y_score2.reshape(-1))
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: interpolate each class curve onto the union of all FPR
# points, then average the TPR values across classes
all_fpr = np.unique(np.hstack([fpr[k] for k in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr = mean_tpr + np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr = mean_tpr / n_classes
fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot the micro- and macro-averaged curves together with the diagonal
plt.figure(figsize=(8,6))
lw = 2
for average, colour in (("micro", "deeppink"), ("macro", "navy")):
    curve_label = '{0}-average ROC curve (AUC = {1:0.2f})'.format(average, roc_auc[average])
    plt.plot(fpr[average], tpr[average], label=curve_label,
             color=colour, linestyle=':', linewidth=lw)

# Dashed black diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# L1-penalised logistic regression fitted on all features of the subset.
# The saga solver shuffles the data while optimising, so a fixed random_state
# makes the fitted coefficients reproducible across kernel restarts.
# NOTE(review): max_iter is left at the library default; raise it if a
# ConvergenceWarning is emitted.
lr = LogisticRegression(penalty='l1', solver='saga', random_state=42)
lr.fit(small_X, small_y)