### 1. Import Libraries

In [1]:
import os
import sys
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from numba import cuda
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

%matplotlib inline
# Global matplotlib defaults: larger font for axis labels (14) and for the
# x/y tick labels (12) on every figure created after this cell runs.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

Matplotlib is building the font cache; this may take a moment.


### 2. Import from mlcvlab

In [2]:
from mlcvlab.models.nn4 import NN4
from mlcvlab.nn.losses import l2
from mlcvlab.optim.sgd import SGD
from mlcvlab.optim.sync_sgd import sync_sgd
# TODO: Import all the necessary code from mlcvlab package as you need... 

### 3. Set Seed

In [3]:
# Seed NumPy's global random generator. The helpers below draw from it
# (np.random.normal in initialize_model, np.random.shuffle in minibatch),
# so a fixed seed makes those draws repeatable on a fresh kernel.
np.random.seed(42)

### 4. Helper functions

In [4]:
def load_dataset():
    """Fetch the OpenML ``mnist_784`` dataset (version 1) as arrays.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the feature array exactly as returned
        by ``fetch_openml`` (``as_frame=False``) and ``y`` holds the labels
        converted to a ``float64`` NumPy array.
    """
    features, labels = fetch_openml(
        "mnist_784",
        version=1,
        return_X_y=True,
        as_frame=False,
    )
    return features, np.asarray(labels, dtype=np.float64)

def prepare_data(x, y):
    """Convert numeric labels into binary even/odd targets.

    Args:
        x: Feature data; returned unchanged.
        y: Array-like of numeric labels.

    Returns:
        tuple: ``(x, y)`` where ``y`` is an integer array with the same shape
        as the input labels, holding 1 where the label is even and 0 where it
        is odd.
    """
    # The parity test is already elementwise, so it is applied to the whole
    # array directly instead of being routed through np.apply_along_axis.
    y = (np.asanyarray(y) % 2 == 0).astype("int")
    return x, y

def split_train_test(x,y):
    """Hold out 10000 samples as the test set, using a fixed random state.

    Args:
        x: Feature data.
        y: Labels aligned with ``x``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    train_features, test_features, train_labels, test_labels = train_test_split(
        x,
        y,
        random_state=0,
        test_size=10000,
    )
    return train_features, test_features, train_labels, test_labels

def minibatch(X_train,y_train,K):
    """Shuffle the training set and cut it into mini-batches of ``K`` rows.

    The labels are first reshaped into a single column. One permutation of
    the row indices is drawn from NumPy's global random generator and applied
    to both arrays, so row ``i`` of a feature batch corresponds to row ``i``
    of the label batch at the same list position. The last batch is shorter
    when the number of rows is not a multiple of ``K``.

    Args:
        X_train: 2-D array of samples (indexed as ``X_train[rows, :]``).
        y_train: Array of labels; reshaped to ``(-1, 1)`` internally.
        K: Number of rows per batch.

    Returns:
        tuple: ``(X_train_batches, y_train_batches)``, two lists of arrays.

    Raises:
        AssertionError: If samples and labels differ in length.
    """
    label_column = y_train.reshape(-1, 1)

    assert len(X_train) == len(label_column), "Input and output data should have the same length."
    n_samples = len(X_train)
    order = np.arange(n_samples)
    np.random.shuffle(order)  # one shared permutation keeps samples and labels aligned

    # Slicing past the end of ``order`` is clamped, which yields the short final chunk.
    chunks = [order[offset:offset + K] for offset in range(0, n_samples, K)]
    X_train_batches = [X_train[rows, :] for rows in chunks]
    y_train_batches = [label_column[rows, :] for rows in chunks]

    return X_train_batches, y_train_batches

def initialize_model():
    """Create an NN4 with batch norm, dropout 0.25 and Xavier-normal weights.

    Every weight matrix has shape ``(units_out, units_in)`` and is drawn from
    a zero-mean normal distribution whose standard deviation is
    ``sqrt(2) / sqrt(units_in + units_out)`` for *that* layer. The matrices
    are drawn in the order W1, W2, W3, W4 from NumPy's global random
    generator, so the result depends on the current seed.

    Returns:
        The ``NN4`` instance with ``layers[0..3].W`` set to W1..W4.
    """
    # The assignment template suggests He initialisation for the hidden
    # layers and Xavier for the output layer; this notebook uses Xavier
    # throughout, which is kept here.

    # (units_out, units_in) for each of the four layers.
    layer_shapes = [(300, 784), (150, 300), (50, 150), (1, 50)]

    weights = []
    for units_out, units_in in layer_shapes:
        # Per-layer scale. Previously W3 and W4 were drawn with the
        # first layer's standard deviation instead of their own.
        std = np.sqrt(2) / np.sqrt(units_out + units_in)
        weights.append(np.random.normal(loc=0, scale=std, size=(units_out, units_in)))
    W1, W2, W3, W4 = weights

    print(f"Size of W1 : {W1.shape}, Size of W2 : {W2.shape}, Size of W3 : {W3.shape}, Size of W4 : {W4.shape}")
    four_layer_nn  = NN4(use_batchnorm=True, dropout_param=0.25)
    for layer_index, layer_weights in enumerate(weights):
        four_layer_nn.layers[layer_index].W = layer_weights

    return four_layer_nn

def train_model(model, X_train_batches, y_train_batches):
    """Run the synchronous and asynchronous SGD routines on the GPU.

    The mini-batches are copied to the device on a single CUDA stream, then
    ``sync_sgd`` and ``async_sgd`` are invoked with Numba's
    ``[griddim, blockdim]`` launch syntax and each result is copied back to
    the host.

    Args:
        model: Model object handed to both routines.
        X_train_batches: Feature mini-batches (as produced by ``minibatch``).
        y_train_batches: Label mini-batches aligned with ``X_train_batches``.

    Returns:
        tuple: ``(model_async, model_sync)`` - the asynchronous result comes
        first.
    """
    print("The gradient of emperical risk for W1, W2, W3 and W4:")
    griddim = (64,64)
    # CUDA devices accept at most 1024 threads per block; the previous
    # (256, 256) block requested 65536 threads and could never be launched.
    # NOTE(review): confirm that this launch shape still covers the index
    # range the mlcvlab routines expect.
    blockdim = (16,16)

    stream = cuda.stream()

    d_X_train_batches = cuda.to_device(X_train_batches, stream=stream)
    d_y_train_batches = cuda.to_device(y_train_batches, stream=stream)

    # NOTE(review): the code assumes each launch returns an object exposing
    # copy_to_host(); verify this against the mlcvlab implementation.
    # `sync_sgd` is the name this notebook imports (was misspelled `sync_sdg`).
    model_sync = sync_sgd[griddim, blockdim](model, d_X_train_batches, d_y_train_batches)
    model_sync = model_sync.copy_to_host()

    # NOTE(review): `async_sgd` is not imported anywhere in this notebook;
    # add its import next to `sync_sgd` before running this cell.
    model_async = async_sgd[griddim, blockdim, stream](model, d_X_train_batches, d_y_train_batches)
    model_async = model_async.copy_to_host(stream=stream)

    return model_async, model_sync

def test_model(model, X_test, y_test):
    
    y_hat = model.nn4(X_test, mode='test')[0].T
    y_test = y_test.reshape(-1, 1)
    error = np.round_(np.abs(y_test - y_hat), 2)
    indicator_fn = np.greater(error, np.zeros(error.shape)).astype('float')

    accuracy = np.mean(indicator_fn) * 100
    return accuracy

### 5. Run the program

In [None]:
# End-to-end driver: load MNIST, build even/odd targets, split, initialise
# the network, train with plain SGD and with the CUDA variants, and report
# the accuracy of each trained model.

# minibatch size
K = 100

#load data
x, y = load_dataset()
print("Dataset imported successfully")

#prepare data
x, y = prepare_data(x,y)
print("Data prepared successfully")

# split data set
X_train, X_test, y_train, y_test = split_train_test(x,y)
print("Dataset Splitted successfully")

#initialize model
model = initialize_model()
print("NN4 successfully implemented")

#dividing data into mini batches
X_train_batches, y_train_batches = minibatch(X_train, y_train, K)
print("Data successfully split into mini batches")

#training model
# NOTE(review): the result of SGD(...) is passed to test_model below, so it
# is assumed to be the trained model - confirm against mlcvlab.optim.sgd.
sgd = SGD(model, X_train_batches, y_train_batches, lr=0.01, R=50)
print("Completed training with SGD, now testing...")

#testing model
accuracy = test_model(sgd, X_test, y_test)
print(f"Completed testing model using SGD - Accuracy : {accuracy}")

#training model with cuda
# NOTE(review): the same `model` object is reused here; if SGD updates it in
# place, the CUDA runs start from already-trained weights - confirm intent.
async_model, sync_model = train_model(model, X_train_batches, y_train_batches)
# The previous message interpolated `final_W`, which is never defined in this
# notebook and raised NameError before any CUDA result could be tested.
print("Completed training model")
print("Completed training, now testing...")

#testing model
# TODO: enable once the asynchronous path is verified.
# accuracy_async = test_model(async_model, X_test, y_test)
# print(f"Completed testing model using asynchronous SGD - Accuracy : {accuracy_async}")

accuracy_sync = test_model(sync_model, X_test, y_test)
print(f"Completed testing model using synchronous SGD - Accuracy : {accuracy_sync}")