In [1]:
import numpy as np

In [2]:
class Dense_Layer:
    """Fully connected layer computing ``np.dot(inputs, weights) + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        ``weights`` has shape (n_inputs, n_neurons) and holds standard-normal
        draws scaled by 0.01; ``biases`` has shape (1, n_neurons) and is zero.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` (needed by ``backward``) and the layer output."""
        self.inputs = inputs
        projected = np.dot(inputs, self.weights)
        self.output = projected + self.biases

    def backward(self, dvalues):
        """Store gradients w.r.t. inputs, biases and weights given ``dvalues``.

        ``dvalues`` is the gradient arriving from the next layer; ``forward``
        must have been called first because ``self.inputs`` is read here.
        """
        upstream = dvalues
        self.dinputs = np.dot(upstream, self.weights.T)
        self.dbiases = np.sum(upstream, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, upstream)
        

In [3]:
class Activation_Relu:
    """Rectified linear unit activation."""

    def forward(self, inputs):
        """Remember ``inputs`` and store their element-wise maximum with 0."""
        self.inputs = inputs
        rectified = np.maximum(0, inputs)
        self.output = rectified

    def backward(self, dvalues):
        """Store a copy of ``dvalues`` zeroed wherever the stored input was <= 0."""
        inactive = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched.
        grad = dvalues.copy()
        grad[inactive] = 0
        self.dinputs = grad

In [4]:
class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store the softmax of each row of ``inputs`` in ``self.output``."""
        # Each row's maximum is subtracted before exponentiating, so the
        # largest exponent per row is exp(0).
        row_peak = np.max(inputs, axis=1, keepdims=True)
        unnormalized = np.exp(inputs - row_peak)
        row_total = np.sum(unnormalized, axis=1, keepdims=True)
        self.output = unnormalized / row_total

In [5]:
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses produced by ``self.forward``."""
        return np.mean(self.forward(output, y))

In [6]:
class Categorical_CrossEntropyLoss(Loss):
    """Categorical cross-entropy loss for sparse or one-hot class labels."""

    # Smallest probability allowed before taking a log or dividing.
    _CLIP = 1e-7

    def forward(self, y_pred, y_true):
        """Return the negative log of the true-class probability per sample.

        Args:
            y_pred: predicted probabilities, one row per sample.
            y_true: 1-D class indices or a 2-D one-hot array (any array-like).

        Returns:
            1-D array with one loss value per sample.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        samples = len(y_pred)

        # Clip on both sides so log(0) can never occur.
        y_pred_clipped = np.clip(y_pred, self._CLIP, 1 - self._CLIP)

        if y_true.ndim == 1:
            # Sparse labels: pick the probability at each sample's class index.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: the mask keeps only the true-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                f"y_true must be 1-D class indices or 2-D one-hot, got {y_true.ndim}-D"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient of the mean loss w.r.t. the predictions.

        Args:
            dvalues: predicted probabilities, one row per sample.
            y_true: 1-D class indices or a 2-D one-hot array (any array-like).

        The result is written to ``self.dinputs``.
        """
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        labels = len(dvalues[0])

        if y_true.ndim == 1:
            # Expand sparse indices into one-hot rows.
            y_true = np.eye(labels)[y_true]

        # A prediction of exactly 0 would give -0/0 = NaN (or -1/0 = -inf);
        # floor it at the same epsilon forward() uses so the gradient stays finite.
        safe_predictions = np.maximum(dvalues, self._CLIP)

        # Dividing by the sample count matches the mean taken in Loss.calculate.
        self.dinputs = -y_true / safe_predictions / samples

In [19]:
class Activation_softmax_Loss_CategoricalCrossEntropy:
    """Softmax activation combined with categorical cross-entropy loss."""

    def __init__(self):
        """Create the wrapped softmax activation and cross-entropy loss."""
        self.activation = Activation_Softmax()
        self.loss = Categorical_CrossEntropyLoss()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep its output, and return the mean loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Store ``(dvalues - one_hot(y_true)) / n_samples`` in ``self.dinputs``.

        ``y_true`` may be 1-D class indices or a 2-D one-hot array; the
        one-hot form is reduced to indices with ``argmax`` first.
        """
        n_samples = len(dvalues)
        is_one_hot = len(y_true.shape) == 2
        class_ids = np.argmax(y_true, axis=1) if is_one_hot else y_true

        # Copy so the caller's array is not modified by the in-place subtract.
        grad = dvalues.copy()
        grad[range(n_samples), class_ids] -= 1
        self.dinputs = grad / n_samples
    

In [25]:
import numpy as np

class Optimizer_Adam:
    """Adam optimizer with optional learning-rate decay.

    Expected call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for each layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        """Store the hyper-parameters and reset the step counter to 0."""
        self.iterations = 0
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.beta_1, self.beta_2 = beta_1, beta_2

    def pre_update_params(self):
        """Recompute the effective learning rate when decay is enabled."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Apply one Adam step to ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps the running
        moment estimates as attributes on the layer itself.
        """
        b1, b2 = self.beta_1, self.beta_2
        lr, eps = self.current_learning_rate, self.epsilon
        step = self.iterations + 1

        # First visit to this layer: allocate zeroed moment buffers.
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Running average of the gradients (first moment).
        layer.weight_momentums = b1 * layer.weight_momentums + (1 - b1) * layer.dweights
        layer.bias_momentums = b1 * layer.bias_momentums + (1 - b1) * layer.dbiases

        # Running average of the squared gradients (second moment).
        layer.weight_cache = b2 * layer.weight_cache + (1 - b2) * layer.dweights**2
        layer.bias_cache = b2 * layer.bias_cache + (1 - b2) * layer.dbiases**2

        # Bias correction: both averages start at zero, so rescale by 1 - beta**step.
        momentum_scale = 1 - b1 ** step
        cache_scale = 1 - b2 ** step
        weight_m_hat = layer.weight_momentums / momentum_scale
        bias_m_hat = layer.bias_momentums / momentum_scale
        weight_v_hat = layer.weight_cache / cache_scale
        bias_v_hat = layer.bias_cache / cache_scale

        # Step along the corrected momentum, normalized by the root of the
        # corrected cache; epsilon keeps the denominator away from zero.
        layer.weights += -lr * weight_m_hat / (np.sqrt(weight_v_hat) + eps)
        layer.biases += -lr * bias_m_hat / (np.sqrt(bias_v_hat) + eps)

    def post_update_params(self):
        """Advance the step counter; call once after all layers are updated."""
        self.iterations += 1

In [10]:
def create_data(samples, classes, seed=None):
    """Generate a noisy 2-D spiral dataset with one arm per class.

    Args:
        samples: number of points per class.
        classes: number of classes (spiral arms).
        seed: optional seed for the angular noise. When given, noise is drawn
            from a private ``np.random.RandomState(seed)`` and the global
            NumPy random state is left untouched. When ``None`` (default) the
            global ``np.random`` state is used, exactly as before.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float array of shape
        ``(samples * classes, 2)`` and ``y`` is a ``uint8`` array of shape
        ``(samples * classes,)`` holding the class index of each row.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    X = np.zeros((samples * classes, 2))
    y = np.zeros(samples * classes, dtype='uint8')

    # Radius grows linearly from 0 to 1 along every arm; identical for all
    # classes, so compute it once outside the loop.
    r = np.linspace(0.0, 1, samples)

    for class_number in range(classes):
        ix = slice(samples * class_number, samples * (class_number + 1))
        # Each class sweeps its own angular interval, jittered with Gaussian noise.
        t = np.linspace(class_number * 4, (class_number + 1) * 4, samples) + rng.randn(samples) * 0.2
        X[ix] = np.c_[r * np.sin(t * 2.5), r * np.cos(t * 2.5)]
        y[ix] = class_number
    return X, y

In [11]:
# Seed NumPy's global generator first: create_data adds Gaussian noise and
# Dense_Layer draws its initial weights from the same global state, so without
# a seed every Restart & Run All produces different data and results.
np.random.seed(0)
X,y=create_data(samples=100,classes=3)

In [26]:
# Network: 2 inputs -> 64 ReLU units -> 3 class scores, trained with Adam on
# the softmax + categorical cross-entropy objective.
dense1 = Dense_Layer(2, 64)
activation_1 = Activation_Relu()
dense2 = Dense_Layer(64, 3)
loss_activation = Activation_softmax_Loss_CategoricalCrossEntropy()
optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-5)

EPOCHS = 10001      # range(EPOCHS) runs epochs 0..10000 inclusive
PRINT_EVERY = 100   # report metrics every this many epochs

# Accuracy compares against class indices. Convert one-hot labels once, outside
# the loop, and keep `y` itself untouched so re-running the cell is safe.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(EPOCHS):
    # Forward pass.
    dense1.forward(X)
    activation_1.forward(dense1.output)
    dense2.forward(activation_1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Share of samples whose highest-probability class matches the label.
    predictions = np.argmax(loss_activation.output, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % PRINT_EVERY:
        print(f'epoch:{epoch},acc:{accuracy},loss:{loss}')

    # Backward pass: the combined softmax/loss gradient starts from the
    # softmax output, then flows back through the layers in reverse order.
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation_1.backward(dense2.dinputs)
    dense1.backward(activation_1.dinputs)

    # Parameter update: refresh the learning rate, step each layer, then
    # advance the optimizer's step counter.
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch:0,acc:0.34,loss:1.0986060725028328
epoch:100,acc:0.4533333333333333,loss:0.9634509101189161
epoch:200,acc:0.57,loss:0.8500221335432286
epoch:300,acc:0.72,loss:0.7111361775122444
epoch:400,acc:0.7333333333333333,loss:0.6297734066323005
epoch:500,acc:0.73,loss:0.5856513152101055
epoch:600,acc:0.76,loss:0.5543730507220507
epoch:700,acc:0.83,loss:0.47617879597691704
epoch:800,acc:0.8133333333333334,loss:0.4322787922237544
epoch:900,acc:0.8466666666666667,loss:0.4007855203577691
epoch:1000,acc:0.8633333333333333,loss:0.37985788059096864
epoch:1100,acc:0.8666666666666667,loss:0.35625692714709867
epoch:1200,acc:0.87,loss:0.3418395183005995
epoch:1300,acc:0.8666666666666667,loss:0.331655265071317
epoch:1400,acc:0.8766666666666667,loss:0.32086391945722537
epoch:1500,acc:0.87,loss:0.3133872771915697
epoch:1600,acc:0.8733333333333333,loss:0.30576834564459887
epoch:1700,acc:0.8533333333333334,loss:0.30574247030620727
epoch:1800,acc:0.8766666666666667,loss:0.29560641195275117
epoch:1900,acc:0