# Hello, MNIST!

a quick throwaway comparison of a basic CNN to more classical techniques on the meme-iest image recognition dataset, MNIST!  

(I haven't used NNets too much in the past few years, wanted to refresh my memory.)
 
the short version is - classical approaches perform shockingly well here, mostly achieving 95-98% accuracy (the shallow random forest trails at ~91%). NNs get to 99%+ with very little effort (a single convolution+maxpool will get you there).  

of course, classical approaches don't work very well on many other image classification problems, but it is interesting to see just _how_ easy MNIST seems to be.  

## classical

featurization: pca / nmf (ica and umap are not part of the runs below)  
classifier: svm / random forest / gbt

In [3]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# Problem constants: number of target classes and the per-image tensor shape
# expected by the Keras models further down.
num_classes = 10
input_shape = (28, 28, 1)

# Fetch MNIST; Keras hands it back already partitioned into train / test.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Convert pixels to float32 in the [0, 1] range and append a trailing channel
# axis so every image has shape (28, 28, 1).
x_train = (x_train.astype("float32") / 255)[..., np.newaxis]
x_test = (x_test.astype("float32") / 255)[..., np.newaxis]

print("x_train shape:", x_train.shape)
print(len(x_train), "train samples")
print(len(x_test), "test samples")


x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [4]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.base import clone
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from umap import UMAP

# Dimensionality of the reduced feature space fed to every classifier.
N_COMPONENTS = 50

# --- Estimator templates -----------------------------------------------------
# These objects are never fitted directly. Each pipeline below receives its own
# clone() of the templates it needs, so no fitted state is shared between
# pipelines.
std_scaler = StandardScaler(with_mean=True, with_std=True)

pca = PCA(n_components=N_COMPONENTS, random_state=42)
nmf = NMF(n_components=N_COMPONENTS, random_state=42, max_iter=500, init='nndsvda')

svc = SVC(
    C=1.0,
    kernel='rbf',
    probability=False,
)
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features=.5,
    n_jobs=-1,
    random_state=42,
)
lgbc = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=.05,
    subsample=.9,
    random_state=42,
)
sklgbc = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=1_000,
    max_leaf_nodes=31,
    max_depth=6,
    min_samples_leaf=10,
    l2_regularization=1.0,
    max_bins=255,
    validation_fraction=0.1,
    n_iter_no_change=10,
    verbose=0,
    random_state=42,
)

# --- Pipelines ---------------------------------------------------------------
# Pipeline fits the step objects it is given in place. Passing the same `nmf`
# or `sklgbc` instance to several pipelines would therefore let each fit()
# overwrite the state another pipeline relies on (e.g. pca_pipe would end up
# holding a booster trained on NMF features). clone() yields an unfitted copy
# with identical hyperparameters, so every pipeline stays self-contained.
pca_pipe = make_pipeline(clone(std_scaler), clone(pca), clone(sklgbc))
nmf_pipe = make_pipeline(clone(nmf), clone(sklgbc))
lgb_pipe = make_pipeline(clone(nmf), clone(lgbc))
rf_pipe = make_pipeline(clone(nmf), clone(rf))
svm_pipe = make_pipeline(clone(nmf), clone(svc))
# note: have to reduce before umap or we'll get errors.
# defaulting to nmf b/c all pos. image data
# edit: struggles past n=1k or so, even with preprocessing. omit.
#umap_pipe = make_pipeline(smol_nmf, std_scaler, umap, gb)

def _flatten_images(images):
    """Reshape a (samples, rows, cols, 1) image stack to (samples, rows * cols)."""
    n_samples, n_rows, n_cols, _ = images.shape
    return images.reshape((n_samples, n_rows * n_cols))


# The scikit-learn style estimators want the standard instances x features
# layout, so collapse each 2-D image into a single feature row.
x_train_flat = _flatten_images(x_train)
x_test_flat = _flatten_images(x_test)

  from pandas import MultiIndex, Int64Index


In [6]:

# Fit every classical pipeline on the flattened training images, then report
# its accuracy on the held-out test images followed by the pipeline's repr.
for pipe in (pca_pipe, nmf_pipe, lgb_pipe, rf_pipe, svm_pipe):
    pipe.fit(x_train_flat, y_train)
    test_predictions = pipe.predict(x_test_flat)

    # Accuracy is the metric MNIST results are usually quoted in, so that is
    # what gets reported for this multiclass problem.
    # A weighted one-vs-rest ROC AUC is kept here for reference but disabled:
    #   pred_probs = pipe.predict_proba(x_test_flat)
    #   auc = roc_auc_score(y_test, pred_probs, multi_class="ovr", average="weighted")
    test_accuracy = accuracy_score(y_test, test_predictions)

    print('\n\n')
    print(test_accuracy, '\n', pipe)




0.9632 
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=50, random_state=42)),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(l2_regularization=1.0,
                                                learning_rate=0.05, max_depth=6,
                                                max_iter=1000,
                                                min_samples_leaf=10,
                                                random_state=42))])







0.9786 
 Pipeline(steps=[('nmf',
                 NMF(init='nndsvda', max_iter=500, n_components=50,
                     random_state=42)),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(l2_regularization=1.0,
                                                learning_rate=0.05, max_depth=6,
                                                max_iter=1000,
                                                min_samples_leaf=10,
                                                random_state=42))])







0.9801 
 Pipeline(steps=[('nmf',
                 NMF(init='nndsvda', max_iter=500, n_components=50,
                     random_state=42)),
                ('lgbmclassifier',
                 LGBMClassifier(learning_rate=0.05, n_estimators=500,
                                random_state=42, subsample=0.9))])







0.9138 
 Pipeline(steps=[('nmf',
                 NMF(init='nndsvda', max_iter=500, n_components=50,
                     random_state=42)),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=8, max_features=0.5,
                                        min_samples_leaf=5,
                                        min_samples_split=10, n_jobs=-1,
                                        random_state=42))])







0.9548 
 Pipeline(steps=[('nmf',
                 NMF(init='nndsvda', max_iter=500, n_components=50,
                     random_state=42)),
                ('svc', SVC())])


using something as braindead as 100-year-old linear PCA and untuned GBM achieves >96% accuracy on MNIST.  
using a slightly more reasonable featurization approach (NMF) gets us close to 98%.     

## Basic CNN

In [6]:
import tensorflow as tf
# Report which GPUs TensorFlow can see. This has to be printed explicitly: a
# bare expression is only echoed by the notebook when it is the last statement
# in a cell, and here more code follows it.
print("GPUs visible to TensorFlow:", tf.config.list_physical_devices('GPU'))

# convert class vectors to binary class matrices (one-hot rows with
# `num_classes` columns), as required by the categorical cross-entropy loss
# used for the CNNs below.
y_train_ohe = keras.utils.to_categorical(y_train, num_classes)
y_test_ohe = keras.utils.to_categorical(y_test, num_classes)

In [13]:
# Copy-pasted baseline CNN: two (3x3 conv -> 2x2 max-pool) stages, then
# flatten, dropout and a softmax layer with one unit per class.
baseline_layers = [
    keras.Input(shape=input_shape),
    layers.Conv2D(filters=32, kernel_size=(3, 3), activation="relu"),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation="relu"),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Flatten(),
    layers.Dropout(rate=0.5),
    layers.Dense(units=num_classes, activation="softmax"),
]
model = keras.Sequential(baseline_layers)
model.summary()

# Training schedule for the baseline.
batch_size = 128
epochs = 15

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    x_train,
    y_train_ohe,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,  # hold out 10% of the training data for validation
)

# evaluate() returns [loss, accuracy] for the single metric compiled above.
test_loss, test_accuracy = model.evaluate(x_test, y_test_ohe, verbose=0)
print("Base model")
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 13, 13, 32)       0         
 2D)                                                             
                                                                 
 conv2d_7 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 5, 5, 64)         0         
 2D)                                                             
                                                                 
 flatten_3 (Flatten)         (None, 1600)              0         
                                                                 
 dropout_4 (Dropout)         (None, 1600)             

In [16]:
# lightly tuned

# One-hot label matrices, rebuilt here so this cell only needs the raw labels.
y_train_ohe = keras.utils.to_categorical(y_train, num_classes)
y_test_ohe = keras.utils.to_categorical(y_test, num_classes)


def _double_conv_stage(filters):
    """Return the layers of one convolutional stage.

    The stage is (3x3 same-padded ReLU conv -> batch norm) twice, followed by
    a 2x2 max-pool, another batch norm and a light 0.1 dropout.
    """
    return [
        layers.Conv2D(filters, kernel_size=(3, 3), activation="relu", padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(filters, kernel_size=(3, 3), activation="relu", padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.BatchNormalization(),
        layers.Dropout(0.1),
    ]


model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        # Light augmentation: small random rotation and contrast changes.
        layers.RandomRotation(0.05),
        layers.RandomContrast(0.05),
        # Two convolutional stages, 32 then 64 filters.
        *_double_conv_stage(32),
        *_double_conv_stage(64),
        layers.Flatten(),
        # Classifier head: two 128-unit hidden layers with heavy dropout.
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 random_rotation_4 (RandomRo  (None, 28, 28, 1)        0         
 tation)                                                         
                                                                 
 random_contrast_4 (RandomCo  (None, 28, 28, 1)        0         
 ntrast)                                                         
                                                                 
 conv2d_12 (Conv2D)          (None, 28, 28, 32)        320       
                                                                 
 batch_normalization_12 (Bat  (None, 28, 28, 32)       128       
 chNormalization)                                                
                                                                 
 conv2d_13 (Conv2D)          (None, 28, 28, 32)        9248      
                                                      

In [17]:
# Training schedule for the tuned network (five more epochs than the baseline).
batch_size = 128
epochs = 20

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    x_train,
    y_train_ohe,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,  # hold out 10% of the training data for validation
)

# evaluate() returns [loss, accuracy] for the single metric compiled above.
test_loss, test_accuracy = model.evaluate(x_test, y_test_ohe, verbose=0)
print("Mildly tuned model")
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mildly tuned model
Test loss: 0.024742983281612396
Test accuracy: 0.9930999875068665


In [None]:
# basic: 99.19
# add 2x hidden 128: 99.23
# dropout .5->.3: 99.2
# add mild dropout (0.1) to ConvMaxpool layers: 99.1
# add some mild transformations: 99.15
# just rot + contrast: 99.36  
# looks less overfit-y, bump epochs 15->20: 99.46  
# single 32,64 vgg block -> 32,32 and 64,64 block: 99.50
# reduce first dense layer dropout .5->.3: 99.27