In [1]:
from sklearn.datasets import fetch_openml
import numpy as np

# Fetch version 1 of the 'mnist_784' dataset from OpenML as plain arrays
# (as_frame=False) rather than a pandas DataFrame.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)

# Features: one row per image. Labels: cast to unsigned 8-bit integers so
# downstream estimators and metrics work with numeric class ids.
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

# Sanity check: both arrays must have the same number of samples.
print(X.shape, y.shape)


(70000, 784) (70000,)


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 10,000 samples for testing; the remaining 60,000 are used for
# training. stratify=y preserves the per-digit class proportions in both
# partitions, and the fixed random_state makes the split repeatable.
split = train_test_split(
    X,
    y,
    test_size=10_000,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split

print(X_train.shape, X_test.shape)


(60000, 784) (10000, 784)


In [5]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# Build both baseline models first; each is seeded independently so the
# order of construction does not affect the results.
#   - sgd_clf: linear model trained with SGD on the hinge loss (linear SVM)
#   - rf_clf:  random forest of 100 trees
sgd_clf = SGDClassifier(loss="hinge", random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training partition only; the test partition stays untouched
# until evaluation.
sgd_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)


In [6]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

def report_classifier(label, clf, X_eval, y_eval):
    """Print accuracy, confusion matrix and classification report for a model.

    Predictions are computed once and reused for all three metrics, instead
    of re-running inference on the full evaluation set for every metric.

    Parameters
    ----------
    label : str
        Prefix for the accuracy line, e.g. "SGD Classifier".
    clf : fitted estimator
        Any object exposing ``predict(X)``.
    X_eval : array-like
        Feature matrix to evaluate on.
    y_eval : array-like
        True labels aligned with ``X_eval``.
    """
    y_pred = clf.predict(X_eval)  # single inference pass, shared below
    print(f"{label} Accuracy:", accuracy_score(y_eval, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_eval, y_pred))
    print("Classification Report:\n", classification_report(y_eval, y_pred))


# SGD Classifier evaluation
report_classifier("SGD Classifier", sgd_clf, X_test, y_test)

# Random Forest evaluation
report_classifier("RF Classifier", rf_clf, X_test, y_test)


SGD Classifier Accuracy: 0.8856
Confusion Matrix:
 [[ 951    0    2    7    1    0   11    0    7    7]
 [   1 1080    4   15    0    4    1    3   11    6]
 [   7    9  819   60    8    1   30   17   38   10]
 [   0    2   23  938    1   11    2   11   15   17]
 [   2    5   11   11  861    0   13    5    3   64]
 [  12    1   10  108    6  649   28   11   48   29]
 [   8    0    2    2    3   11  952    0    2    2]
 [   2    3    8    9    5    1    0  956    3   55]
 [   8   13    5   87    6   22   12   11  756   55]
 [   1    7    9   29   20    4    1   27    2  894]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       986
           1       0.96      0.96      0.96      1125
           2       0.92      0.82      0.87       999
           3       0.74      0.92      0.82      1020
           4       0.95      0.88      0.91       975
           5       0.92      0.72      0.81       902
           6   

### ❌ Error Analysis Report

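- **Common Mistakes** (true → predicted, counts from the SGD confusion matrix above):
  1. 5 → 3 (108 test images)
  2. 8 → 3 (87 test images)
  3. 4 → 9 (64 test images)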

- **Possible Reasons**:
  - Similar shapes (e.g., 9 and 4 both have loops)
  - Blurry edges in some digits
  - Incomplete strokes

- **Proposed Fixes**:
  1. Apply **data augmentation** to make the model robust.
  2. Use **pixel scaling** (StandardScaler).
  3. Train using a CNN (future work).

- **Applied Fix**: Standardize the pixel values with `StandardScaler` and retrain the SGD classifier (test accuracy improves from 0.8856 to 0.9025).



In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Chain standardization and the SGD linear SVM so that the scaler is fitted
# as part of scaled_sgd.fit() on the training data, and the same fitted
# scaler is applied to whatever data is later scored or predicted.
sgd_steps = [
    ("scaler", StandardScaler()),
    ("sgd", SGDClassifier(loss="hinge", random_state=42)),
]
scaled_sgd = Pipeline(steps=sgd_steps)
scaled_sgd.fit(X_train, y_train)

# Mean accuracy on the held-out test partition.
improved_accuracy = scaled_sgd.score(X_test, y_test)
print("Improved SGD Accuracy:", improved_accuracy)


Improved SGD Accuracy: 0.9025
