In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### 1. Frame the problem and look at the big picture


How can we identify handwritten digits?

### 2. Get the Data

In [None]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the "mnist_784" dataset from OpenML; as_frame=False asks
# for plain NumPy arrays rather than pandas objects.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)
# Show which fields the returned dataset object exposes.
mnist.keys()

### 3. Explore the data to gain insights

In [None]:
# Print the description text that ships with the dataset.
print(mnist.DESCR)

In [None]:
# Pull the feature matrix and the label vector out of the dataset object.
X = mnist.data
y = mnist.target
# Check the dimensions of the feature matrix.
X.shape

In [None]:
# Display the feature matrix (NumPy abbreviates the repr of large arrays).
X

In [None]:
# Check the dimensions of the label array.
y.shape

In [None]:
# Display the label array itself.
y

In [None]:
# Look at the first label on its own.
y[0]

In [None]:
# Convert the labels to unsigned 8-bit integers.
# NOTE(review): presumably the labels are loaded as strings — confirm from the y[0] output.
y = y.astype(np.uint8)

### 4. Prepare the data to better expose the underlying data patterns to Machine Learning algorithms

In [None]:
# Split the data into a train and a test set: the first 60,000 samples are
# used for training, everything after them for testing.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

### 5. Explore many different models and short-list the best ones

#### Stochastic Gradient Descent classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent; the fixed
# random_state makes the run repeatable.
sgd_clf = SGDClassifier(
    max_iter=1000,
    tol=1e-3,
    random_state=51,
)
sgd_clf.fit(X_train, y_train)

#### Support Vector Machine classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with gamma='auto'.
svm_clf = SVC(
    gamma='auto',
    random_state=51,
)
# Fitted on the first 10,000 training samples only.
# NOTE(review): presumably to keep training time manageable — confirm.
svm_clf.fit(X_train[:10000], y_train[:10000])

#### K-nearest neighbors (KNN) classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 4 closest training samples,
# with closer neighbours given more weight (weights='distance').
knn_clf = KNeighborsClassifier(n_neighbors=4, weights='distance')
knn_clf.fit(X_train, y_train)

#### Performance Measures


#### Cross-validation predictions

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Out-of-fold predictions for every training sample, using 3-fold cross-validation.
sgd_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train, cv=3)

#### Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision of the cross-validated SGD predictions under both averaging schemes:
# 'macro' averages the per-class scores, 'micro' pools all decisions first.
macro_averaged_precision = precision_score(y_true=y_train, y_pred=sgd_pred, average='macro')
micro_averaged_precision = precision_score(y_true=y_train, y_pred=sgd_pred, average='micro')
print(f"Macro-Averaged Precision score using sklearn library : {macro_averaged_precision}")
print(f"Micro-Averaged Precision score using sklearn library : {micro_averaged_precision}")

In [None]:
# Recall of the cross-validated SGD predictions under both averaging schemes.
macro_averaged_recall = recall_score(y_true=y_train, y_pred=sgd_pred, average='macro')
micro_averaged_recall = recall_score(y_true=y_train, y_pred=sgd_pred, average='micro')
print(f"Macro-averaged recall score using sklearn : {macro_averaged_recall}")
print(f"Micro-Averaged recall score using sklearn library : {micro_averaged_recall}")

#### F1 Score

In [None]:
# F1 score of the cross-validated SGD predictions under both averaging schemes.
macro_averaged_f1 = f1_score(y_true=y_train, y_pred=sgd_pred, average='macro')
micro_averaged_f1 = f1_score(y_true=y_train, y_pred=sgd_pred, average='micro')
print(f"Macro-Averaged F1 score using sklearn library : {macro_averaged_f1}")
print(f"Micro-Averaged F1 score using sklearn library : {micro_averaged_f1}")

#### Area under the ROC curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer


def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute a ROC AUC score for multiclass label predictions.

    Both label vectors are binarized (one indicator column per class) with a
    ``LabelBinarizer`` fitted on ``y_test`` and then handed to
    ``sklearn.metrics.roc_auc_score``.

    Parameters
    ----------
    y_test : array-like
        True class labels. They determine the set of classes used for the
        binarization.
    y_pred : array-like
        Predicted class labels.
    average : str or None, default="macro"
        Averaging strategy, forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    float
        The averaged AUC (``roc_auc_score`` returns per-class values instead
        when ``average`` is ``None``).

    Notes
    -----
    The predictions passed to ``roc_auc_score`` are 0/1 indicators rather than
    continuous scores, so the result reflects a single operating point per
    class. NOTE(review): if decision scores or probabilities are available,
    scoring those directly is likely more informative — confirm intent.
    """
    lb = LabelBinarizer()
    # Fit on the true labels so both vectors share the same class columns.
    y_true_bin = lb.fit_transform(y_test)
    y_pred_bin = lb.transform(y_pred)
    return roc_auc_score(y_true_bin, y_pred_bin, average=average)

In [None]:
# Macro-averaged ROC AUC of the cross-validated SGD predictions.
multiclass_roc_auc_score(y_train, sgd_pred, average="macro")

#### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Count how often each true digit (rows) was predicted as each digit (columns).
sgd_conf_mx = confusion_matrix(y_true=y_train, y_pred=sgd_pred)
sgd_conf_mx

In [None]:
# Visualize the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Raw-count confusion matrix for the cross-validated SGD predictions.
# plt.rc changes the global font size, so it also affects later figures.
plt.rc('font', size=9)
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_train, sgd_pred, ax=ax)
# Title the figure so it can be told apart from the normalized version.
ax.set_title("SGD classifier: confusion matrix (counts)")
plt.show()

In [None]:
# Confusion matrix normalized over the true labels (each row sums to 100%),
# with cell values formatted as whole percentages.
# plt.rc changes the global font size, so it also affects later figures.
plt.rc('font', size=10)
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_train, sgd_pred, ax=ax,
                                        normalize="true", values_format=".0%")
# Title the figure so it can be told apart from the raw-count version.
ax.set_title("SGD classifier: confusion matrix (normalized by true label)")
plt.show()