In [1]:
import numpy as np
import matplotlib.pyplot as plt

import scipy
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from typing import Tuple

import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# *unverified* one, which turns off TLS certificate verification for every
# consumer of the default context in this process (presumably a workaround so
# the dataset download below succeeds on machines with broken CA bundles).
# Prefer fixing the local certificate store over shipping this override.
ssl._create_default_https_context = ssl._create_unverified_context

# Single seed reused for the train/test split and for the models trained later.
seed = 47

# Fetch the 'CIFAR_10' dataset from OpenML as plain arrays (not a DataFrame),
# unpacked directly into features X and labels y.
X, y = fetch_openml('CIFAR_10', as_frame=False, return_X_y=True)

# Cast labels to integers (presumably they arrive as strings — verify on load).
y_int = y.astype(int)
# Shuffled hold-out split: 75% train / 25% test, reproducible via `seed`.
X_tr, X_te, y_tr, y_te = train_test_split(X, y_int, test_size=0.25, random_state=seed, shuffle=True)

def scale_data(X_tr: np.ndarray, X_te: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Standardize the train and test features using training statistics only.

    A ``StandardScaler`` is fitted on ``X_tr`` alone and then applied to both
    splits, so no information from the test split leaks into the scaling.

    Args:
        X_tr: Training feature matrix used to fit the scaler.
        X_te: Test feature matrix, transformed with the scaler fitted on
            ``X_tr``.

    Returns:
        ``(X_tr_scaled, X_te_scaled)``: the scaled training and test matrices,
        in that order.
    """
    scaler = StandardScaler()
    # fit_transform is the one-call equivalent of fit(X_tr) + transform(X_tr).
    X_tr_scaled = scaler.fit_transform(X_tr)
    # The test split reuses the training statistics; it is never fitted on.
    X_te_scaled = scaler.transform(X_te)
    return X_tr_scaled, X_te_scaled

# Standardize both splits; the scaler is fitted on the training split only.
X_tr_scaled, X_te_scaled = scale_data(X_tr, X_te)

In [3]:
from sklearn.metrics import accuracy_score
def errors_for_train_sizes_mlp(X_tr: np.ndarray, y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray, seed: int, train_sizes: list[int]) -> tuple[list, list]:
    """Train one MLP per training-set size and record train/test error rates.

    For every ``n_tr`` in ``train_sizes`` a fresh ``MLPClassifier`` is fitted
    on the first ``n_tr`` rows of ``X_tr``/``y_tr``. The error rate
    (``1 - accuracy``) is then measured on that same training subset and on
    the full test split.

    Args:
        X_tr: Training features; only a leading slice is used per run.
        y_tr: Training labels aligned with ``X_tr``.
        X_te: Test features, always evaluated in full.
        y_te: Test labels aligned with ``X_te``.
        seed: Passed as ``random_state`` to every classifier.
        train_sizes: Number of leading training rows to use for each run.

    Returns:
        ``(tr_err_mlp, te_err_mlp)``: two lists, each with one error rate per
        entry of ``train_sizes`` and in the same order.

    Note:
        Subsets are taken by slicing, so a size larger than ``len(X_tr)``
        silently trains on all available rows rather than raising.
    """
    # Fixed architecture/optimiser settings shared by every run, so the
    # training-set size is the only thing that varies between fits.
    mlp_params = {
        'hidden_layer_sizes': (64,),
        'activation': 'relu',
        'solver': 'sgd',
        'learning_rate_init': 0.001,
        'batch_size': 100,
        'random_state': seed,
        'max_iter': 1500
    }

    tr_err_mlp = []
    te_err_mlp = []

    for n_tr in train_sizes:
        X_tr_subset = X_tr[:n_tr]
        y_tr_subset = y_tr[:n_tr]

        # A new estimator per size so no learned weights carry over.
        mlp = MLPClassifier(**mlp_params)
        mlp.fit(X_tr_subset, y_tr_subset)

        # Training error is measured on the subset actually used for fitting.
        tr_err_mlp.append(1 - accuracy_score(y_tr_subset, mlp.predict(X_tr_subset)))
        # Test error always uses the complete held-out split.
        te_err_mlp.append(1 - accuracy_score(y_te, mlp.predict(X_te)))

    return tr_err_mlp, te_err_mlp

In [4]:
def errors_for_train_sizes_lr(X_tr: np.ndarray, y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray, seed: int, train_sizes: list[int]) -> tuple[list, list]:
    """Train one logistic regression per training-set size and record errors.

    For every ``n_tr`` in ``train_sizes`` a fresh ``LogisticRegression`` is
    fitted on the first ``n_tr`` rows of ``X_tr``/``y_tr``. The error rate
    (``1 - accuracy``) is then measured on that same training subset and on
    the full test split.

    Args:
        X_tr: Training features; only a leading slice is used per run.
        y_tr: Training labels aligned with ``X_tr``.
        X_te: Test features, always evaluated in full.
        y_te: Test labels aligned with ``X_te``.
        seed: Passed as ``random_state`` to every classifier.
        train_sizes: Number of leading training rows to use for each run.

    Returns:
        ``(tr_err_lr, te_err_lr)``: two lists, each with one error rate per
        entry of ``train_sizes`` and in the same order.

    Note:
        Subsets are taken by slicing, so a size larger than ``len(X_tr)``
        silently trains on all available rows rather than raising.
    """
    tr_err_lr = []
    te_err_lr = []

    for n_tr in train_sizes:
        X_tr_subset = X_tr[:n_tr]
        y_tr_subset = y_tr[:n_tr]

        # A new estimator per size so each fit starts from scratch.
        lr = LogisticRegression(random_state=seed)
        lr.fit(X_tr_subset, y_tr_subset)

        # Training error is measured on the subset actually used for fitting.
        tr_err_lr.append(1 - accuracy_score(y_tr_subset, lr.predict(X_tr_subset)))
        # Test error always uses the complete held-out split.
        te_err_lr.append(1 - accuracy_score(y_te, lr.predict(X_te)))

    return tr_err_lr, te_err_lr

In [5]:
def plot_errors_for_train_sizes_mlp_lr(tr_err_mlp: list, te_err_mlp: list, tr_err_lr: list, te_err_lr: list, train_sizes: list[int]) -> None:
    """Draw train/test error curves for the MLP and logistic regression.

    All four curves are drawn on the current pyplot axes with a logarithmic
    x-axis. Each model gets one colour (MLP orange, logistic regression blue);
    training curves are solid and testing curves dashed.

    Args:
        tr_err_mlp: MLP training error per entry of ``train_sizes``.
        te_err_mlp: MLP testing error per entry of ``train_sizes``.
        tr_err_lr: Logistic-regression training error per training size.
        te_err_lr: Logistic-regression testing error per training size.
        train_sizes: Training-set sizes used as x-coordinates.
    """
    # One (y-values, line styling) pair per curve, in drawing/legend order.
    curves = [
        (tr_err_mlp, dict(label='MLP Training Error', marker='x', color="orange")),
        (te_err_mlp, dict(label='MLP Testing Error', marker='x', color="orange", linestyle="--")),
        (tr_err_lr, dict(label='Logistic Regression Training Error', marker='x', color="blue")),
        (te_err_lr, dict(label='Logistic Regression Testing Error', marker='x', color="blue", linestyle="--")),
    ]
    for error_values, styling in curves:
        plt.semilogx(train_sizes, error_values, **styling)

    plt.xlabel('Num. Training Data Points')
    plt.ylabel('Error Rate')

    plt.legend()

In [6]:
# Training-set sizes to compare; each model is fitted on the first n rows of
# the scaled training split and evaluated on the full scaled test split.
train_sizes = [50, 500, 2000, 5000]
# Learning-curve data for the MLP, then for logistic regression (same seed).
tr_err_mlp, te_err_mlp = errors_for_train_sizes_mlp(X_tr_scaled, y_tr, X_te_scaled, y_te, seed, train_sizes)
tr_err_lr, te_err_lr = errors_for_train_sizes_lr(X_tr_scaled, y_tr, X_te_scaled, y_te, seed, train_sizes)
# Overlay all four error curves on one log-x plot.
plot_errors_for_train_sizes_mlp_lr(tr_err_mlp, te_err_mlp, tr_err_lr, te_err_lr, train_sizes)

