# Setup

In [None]:
# Import necessary libraries
import os
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Get MNIST Dataset

In [130]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML and separate features from labels
mnist = fetch_openml('mnist_784', version=1)
X = mnist["data"]

# The labels may arrive as strings, so cast them to small unsigned integers
y = mnist["target"].astype(np.uint8)

In [132]:
# Sanity check: display the shape of the label vector
y.shape

(70000,)

In [134]:
# Sanity check: display the shape of the feature matrix (samples x pixel features)
X.shape

(70000, 784)

# Split Data 

In [138]:

# Hold out 14% of the samples as the test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.14,
    random_state=42,
)


# KNN Classifier

In [140]:
# Baseline: k-nearest-neighbours classifier with scikit-learn's default settings
knn_clf = KNeighborsClassifier()

# fit() returns the estimator itself, so training and test-set prediction can be chained
y_knn_pred = knn_clf.fit(X_train, y_train).predict(X_test)

print('Accuracy_score with default hyperparameters: n_neighbors= 5, weights= uniform:')
# Keep the score as the cell's last expression so the notebook displays it
accuracy_score(y_test, y_knn_pred)

Accuracy_score with default hyperparameters: n_neighbors= 5, weights= uniform:


0.9692888480767269

# KNN Classifier Hyperparameters Fine-Tuning

In [142]:
# Candidate settings: every combination of neighbour weighting and k in {3, 4, 5}
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]
knn_clf = KNeighborsClassifier()
# 5-fold cross-validated grid search; verbose=3 logs the score and duration of each fit
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)

# Run the search on the training data only, so the test set stays untouched
grid_search.fit(X_train, y_train)

# best_params_ holds the winning hyperparameter combination, not a score,
# so it is labelled as hyperparameters rather than as an accuracy value
print('Best hyperparameters:', grid_search.best_params_)

# best_score_ is estimated by cross-validation on the training data, not on the test set
print('Accuracy_score after tuning hyperparameters:', grid_search.best_score_)
print('Optimsed Model:', grid_search.best_estimator_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ....n_neighbors=3, weights=uniform;, score=0.971 total time=  16.2s
[CV 2/5] END ....n_neighbors=3, weights=uniform;, score=0.971 total time=  17.7s
[CV 3/5] END ....n_neighbors=3, weights=uniform;, score=0.972 total time=  18.4s
[CV 4/5] END ....n_neighbors=3, weights=uniform;, score=0.971 total time=  18.5s
[CV 5/5] END ....n_neighbors=3, weights=uniform;, score=0.970 total time=  17.9s
[CV 1/5] END ...n_neighbors=3, weights=distance;, score=0.973 total time=  17.5s
[CV 2/5] END ...n_neighbors=3, weights=distance;, score=0.972 total time=  17.3s
[CV 3/5] END ...n_neighbors=3, weights=distance;, score=0.973 total time=  17.9s
[CV 4/5] END ...n_neighbors=3, weights=distance;, score=0.971 total time=  17.5s
[CV 5/5] END ...n_neighbors=3, weights=distance;, score=0.971 total time=  17.5s
[CV 1/5] END ....n_neighbors=4, weights=uniform;, score=0.969 total time=  20.4s
[CV 2/5] END ....n_neighbors=4, weights=uniform;,

In [145]:
# Score of the best candidate from the grid search fitted earlier; it is estimated by
# cross-validation on the training data, not measured on the test set
print('Accuracy_score after tuning hyperparameters:', grid_search.best_score_)

Accuracy_score after tuning hyperparameters: 0.9727237847427753


In [147]:
# Print a label, then leave the best estimator as the cell's last expression
# so the notebook renders it
print('Optimsed-hyperparameter Model:')
grid_search.best_estimator_

Optimsed-hyperparameter Model:


 # Model Performance on the test set

In [149]:
# Evaluate the tuned KNN model on the held-out test set.
# grid_search.predict() uses the estimator with the best parameters found by the
# search, so no separately constructed classifier with hard-coded hyperparameters
# is needed. accuracy_score is already imported in the setup cell.
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Optimzed-hyperparameter Model accuracy on test set:', accuracy)

Optimzed-hyperparameter Model accuracy on test set: 0.9730639730639731


# Compare the accuracy of your KNN classifier with other classifiers 

# Random Forest Classifier

In [163]:
# Standardise the pixel features. Scaling is optional for a random forest, but it is
# applied here so the results can be compared on identically preprocessed data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# Reuse the statistics learned from the training set; the scaler is never fit on test data
X_test_scaled = scaler.transform(X_test.astype(np.float64))

# Define the RandomForestClassifier (fixed seed for reproducible forests)
rf_clf = RandomForestClassifier(random_state=42)

# Define the parameter grid for fine-tuning Random Forest hyperparameters
param_grid = {
    'n_estimators': [200, 250, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30, None],  # Maximum depth of the trees
    # 'max_features': ['sqrt', 'log2', None]  # The number of features to consider when looking for the best split
}

# Setup GridSearchCV with 5-fold cross-validation.
# 12 candidates x 5 folds = 60 forest fits: run them on all available cores and
# report progress, otherwise the sequential search is slow enough to get interrupted.
# The fixed random_state keeps the results independent of the parallelism.
grid_search = GridSearchCV(rf_clf, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best parameters from grid search
print("Best hyperparameters: ", grid_search.best_params_)

# Best estimator
best_model = grid_search.best_estimator_
print("Best model: ", best_model)

# Cross-validated score of the best candidate (training data only)
print('Accuracy_score after tuning hyperparameters:', grid_search.best_score_)

# Evaluate the model on test data
y_pred = best_model.predict(X_test_scaled)

print('Optimzed-hyperparameter Model scores on test set:')
print(classification_report(y_test, y_pred))



KeyboardInterrupt



# SGD Classifier

In [None]:
# Scale the data to zero mean and unit variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# Reuse the statistics learned from the training set; the scaler is never fit on test data
X_test_scaled = scaler.transform(X_test.astype(np.float64))

# Define the SGDClassifier.
# The 'constant', 'invscaling' and 'adaptive' schedules in the grid below need a
# positive initial learning rate. In scikit-learn releases where eta0 defaults to 0.0,
# those candidates fail to fit and are scored as NaN, so the search would effectively
# evaluate only 'optimal'. Setting eta0 explicitly makes every candidate valid; the
# 'optimal' schedule does not use eta0, so its results are unaffected.
sgd_clf = SGDClassifier(eta0=0.01, random_state=42)

# Define the parameter grid for fine-tuning alpha and learning rate
param_grid = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],   # Regularization strength
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],  # Different learning rate strategies
    # 'eta0': [0.0001, 0.001, 0.01, 0.1],  # Initial learning rate for 'constant' or 'invscaling'
    # 'max_iter': [1000],  # To ensure the model runs for enough iterations
    # 'tol': [1e-3]  # Tolerance for stopping criteria
}

# Setup GridSearchCV with 5-fold cross-validation, using all cores and logging progress
grid_search = GridSearchCV(sgd_clf, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best parameters from grid search
print("Best hyperparameters: ", grid_search.best_params_)

# Best estimator
best_model = grid_search.best_estimator_

# Evaluate the model on test data
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
