# Logistic Regression MNIST Classifier
Handwritten digit classification using the MNIST dataset and a logistic regression model.

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import tensorflow as tf

# Fetch the MNIST train/test splits through the Keras dataset helper
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Cast both image arrays to float32 and scale them by 1/255
x_train, x_test = (images.astype('float32') / 255.0 for images in (x_train, x_test))

# Collapse each image into one row vector: (28, 28) -> (784,), one row per sample
x_train_flat = x_train.reshape(len(x_train), -1)
x_test_flat = x_test.reshape(len(x_test), -1)

# Function to get a balanced subset of samples_per_class images per digit
def get_balanced_subset(x, y, samples_per_class=1000, random_state=42):
    """Draw a shuffled subset containing samples_per_class rows for each digit 0-9.

    Parameters
    ----------
    x : array
        Sample rows, indexed with an integer index array (assumed to be a
        NumPy array aligned row-for-row with ``y``).
    y : array
        Labels; rows are selected where ``y == digit`` for digits 0 through 9.
    samples_per_class : int, default 1000
        Number of rows drawn without replacement for every digit.
    random_state : int or None, default 42
        Seed used both for the per-digit draws and for the final shuffle.

    Returns
    -------
    The shuffled ``(x_subset, y_subset)`` pair produced by
    ``sklearn.utils.shuffle``.

    Raises
    ------
    ValueError
        If any digit has fewer than ``samples_per_class`` rows in ``y``.
    """
    # A private generator produces the same draws as seeding the global
    # np.random state, but does not reseed NumPy for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    x_balanced = []
    y_balanced = []
    for digit in range(10):
        idx = np.where(y == digit)[0]
        # Fail early with a message that names the under-represented digit;
        # the choice() call below would raise a ValueError that does not name it.
        if len(idx) < samples_per_class:
            raise ValueError(
                f"digit {digit} has only {len(idx)} samples; "
                f"cannot draw {samples_per_class} without replacement"
            )
        chosen_idx = rng.choice(idx, samples_per_class, replace=False)
        x_balanced.append(x[chosen_idx])
        y_balanced.append(y[chosen_idx])
    # Stack the per-digit pieces, then shuffle so classes are interleaved
    x_balanced = np.vstack(x_balanced)
    y_balanced = np.hstack(y_balanced)
    return shuffle(x_balanced, y_balanced, random_state=random_state)

# Get balanced training subset
x_train_balanced, y_train_balanced = get_balanced_subset(x_train_flat, y_train, samples_per_class=1000)

# Train Logistic Regression model on balanced subset.
# multi_class is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver a multiclass target is already
# fitted as a multinomial model by default.
logreg = LogisticRegression(max_iter=1000, solver='lbfgs')
logreg.fit(x_train_balanced, y_train_balanced)

# Evaluate on the full test set (not a balanced subset)
y_pred = logreg.predict(x_test_flat)
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           0       0.92      0.97      0.95       980
           1       0.96      0.98      0.97      1135
           2       0.90      0.86      0.88      1032
           3       0.88      0.89      0.89      1010
           4       0.91      0.92      0.92       982
           5       0.87      0.85      0.86       892
           6       0.94      0.93      0.94       958
           7       0.92      0.91      0.92      1028
           8       0.87      0.84      0.86       974
           9       0.89      0.89      0.89      1009

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [8]:
from sklearn.metrics import confusion_matrix

# Per scikit-learn's convention, rows are true labels and columns are predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[ 955    0    2    1    0   14    6    1    1    0]
 [   0 1109    4    1    1    2    4    2   12    0]
 [  15   14  892   30   13    6   12    8   37    5]
 [   4    3   22  902    3   26    2   15   24    9]
 [   2    3    7    3  907    2    9    3    5   41]
 [  17    1   11   32    7  754   15   10   37    8]
 [  16    3   12    0   14   16  893    3    1    0]
 [   2   10   22    8    6    2    0  935    2   41]
 [  11   13   15   32   10   41   11   12  820    9]
 [  11    5    2   16   35    7    0   26    4  903]]


In [9]:

# joblib is not imported anywhere else in the notebook; import it here so the
# cell works on a fresh kernel (Restart & Run All).
import joblib

# Save the fitted model to the current working directory
joblib.dump(logreg, "logistic_digit_model.pkl")
print("Model saved as 'logistic_digit_model.pkl'")


Model saved as 'logistic_digit_model.pkl'
