In [3]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

feature = 2
classes = 2

#This function is used to define/Reset network(model)
def network(n_hidden=100):
    model = dict(
        W1=np.random.randn(feature, n_hidden),
        W2=np.random.randn(n_hidden, classes)
    )

    return model

#This is the softmax function
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def forward(x, model):
    
    h = x @ model['W1']
    h[h < 0] = 0

    
    prob = softmax(h @ model['W2'])

    return h, prob


def backward(model, xs, hs, errors):
    dW2 = hs.T @ errors

    dh = errors @ model['W2'].T
    dh[hs < 0] = 0
    dW1 = xs.T @ dh

    return dict(W1=dW1, W2=dW2)


def get_minibatch_grad(model, X_train, y_train):
    xs, hs, errors = [], [], []

    for x, cls_idx in zip(X_train, y_train):#used to link two different tuples(zip function)
        h, y_pred = forward(x, model)#passing this to forward function and getting the predicted values from imt as return 

        y_true = np.zeros(classes)
        y_true[int(cls_idx)] = 1.
        error = y_true - y_pred

        xs.append(x)
        hs.append(h)
        errors.append(error)

    return backward(model, np.array(xs), np.array(hs), np.array(errors))

#function to create mini-batches
def get_minibatch(X, y, minibatch_size):
    minibatches = []

    X, y = shuffle(X, y)

    for i in range(0, X.shape[0], minibatch_size):#starting from 0 to the size in the step of minibatch-size
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]

        minibatches.append((X_mini, y_mini))

    return minibatches

#Sorcastic-Gradient-Descent Model

def sgd(model, X_train, y_train, minibatch_size):
    minibatches = get_minibatch(X_train, y_train, minibatch_size)#getting mini-batches from the function

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]#selecting a random mini-batch

        grad = get_minibatch_grad(model, X_mini, y_mini)

        for layer in grad:
            model[layer] += alpha * grad[layer] # updating the values using alpha as learning rate

    return model

# Momentum-SGD
def momentum(model, X_train, y_train, minibatch_size):
    velocity = {k: np.zeros_like(v) for k, v in model.items()}
    gamma = 0.9

    minibatches = get_minibatch(X_train, y_train, minibatch_size)

    for iter in range(1, n_iter + 1):#getting mini-batches and selecting one at random
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad = get_minibatch_grad(model, X_mini, y_mini)

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]# using the formule to update the values
            model[layer] += velocity[layer]

    return model


# Adagrad Model

def adagrad(model, X_train, y_train, minibatch_size):
    cache = {k: np.zeros_like(v) for k, v in model.items()}

    minibatches = get_minibatch(X_train, y_train, minibatch_size)

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad = get_minibatch_grad(model, X_mini, y_mini)

        for k in grad:
            cache[k] += grad[k]**2
            model[k] += alpha * grad[k] / (np.sqrt(cache[k]) + eps) # updating the values for Model

    return model

#RMS-Prop
def rmsprop(model, X_train, y_train, minibatch_size):
    cache = {k: np.zeros_like(v) for k, v in model.items()}
    gamma = 0.9

    minibatches = get_minibatch(X_train, y_train, minibatch_size)

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad = get_minibatch_grad(model, X_mini, y_mini)

        for k in grad:
            cache[k] = gamma * cache[k] + (1 - gamma) * (grad[k]**2)
            model[k] += alpha * grad[k] / (np.sqrt(cache[k]) + eps)

    return model

# Adam Model
def adam(model, X_train, y_train, minibatch_size):
    M = {k: np.zeros_like(v) for k, v in model.items()}
    R = {k: np.zeros_like(v) for k, v in model.items()}
    beta1 = 0.9
    beta2 = 0.999

    minibatches = get_minibatch(X_train, y_train, minibatch_size)

    for iter in range(1, n_iter + 1):
        t = iter
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad = get_minibatch_grad(model, X_mini, y_mini)

        for k in grad:
            M[k] = beta1 * M[k] + (1. - beta1) * grad[k]
            R[k] = beta2 * R[k] + (1. - beta2) * grad[k]**2

            m_k_hat = M[k] / (1. - beta1**(t))
            r_k_hat = R[k] / (1. - beta2**(t))

            model[k] += alpha * m_k_hat / (np.sqrt(r_k_hat) + eps)

    return model

# Randomising the values
def shuffle(X, y):
    Z = np.column_stack((X, y))
    np.random.shuffle(Z)
    return Z[:, :-1], Z[:, -1]


if __name__ == '__main__':
    X, y = make_moons(n_samples=5000, random_state=42, noise=0.1) # Creating dataset

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) #splitting into train and test

    n_iter = 100
    eps = 1e-8  # Smoothing to avoid division by zero so giving a very small value
    alpha = 1e-2
    minibatch_size = 100
    n_experiment = 3

    algos = dict(
        sgd=sgd,
        momentum=momentum,
        adagrad=adagrad,
        rmsprop=rmsprop,
        adam=adam
    )

    algo_accs = {k: np.zeros(n_experiment) for k in algos}

    for algo_name, algo in algos.items():
        print('Experimenting on {}'.format(algo_name))

        for k in range(n_experiment):
            # print('Experiment-{}'.format(k))

            # Reset model
            model = network()
            model = algo(model, X_train, y_train, minibatch_size)

            y_pred = np.zeros_like(y_test)

            for i, x in enumerate(X_test):
                _, prob = forward(x, model)
                y = np.argmax(prob)
                y_pred[i] = y

            algo_accs[algo_name][k] = np.mean(y_pred == y_test)

    print()

    for k, v in algo_accs.items():
        print('{} => mean accuracy: {}, std: {}'.format(k, v.mean(), v.std()))


Experimenting on sgd
Experimenting on momentum
Experimenting on adagrad
Experimenting on rmsprop
Experimenting on adam

sgd => mean accuracy: 0.8781333333333333, std: 0.0016438437341250594
momentum => mean accuracy: 0.7589333333333333, std: 0.14745678086213
adagrad => mean accuracy: 0.8429333333333333, std: 0.017784887467235267
rmsprop => mean accuracy: 0.8805333333333333, std: 0.0016438437341250653
adam => mean accuracy: 0.8754666666666666, std: 0.002099735433069792
