In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Sequential
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.activations import sigmoid
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)

In [None]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# def sigmoid_derivative(z):
#     return np.exp(-z) / ((1 + np.exp(-z)) ** 2)

def sigmoid_derivative_2(a):
    return a * (1 - a)

def cost_squared(y_expected, y_predicted):
    return (y_expected - y_predicted) ** 2 / 2

def cost_squared_derivative(y_expected, y_predicted):
    return -(y_expected - y_predicted)

def cost_inner(y_expected, y_predicted):
    return y_expected * np.log(y_predicted) + (1 - y_expected) * np.log(1 - y_predicted)

def cost_derivative(y_expected, y_predicted):
    top = y_expected - y_predicted
    bottom = y_predicted * (1 - y_predicted)
    return top / bottom

# I expect x_train to be an array of [x_1, x_2] vectors
# y_train is an array of scalar values
# W_A, W_B and W_C are [w1, w2] vectors
def model(x_train, y_train, W_A, W_B, W_D, W_C, B, training_rate):    
    # b1_tmp = np.array( [-9.82, -9.28,  0.96] )
    # b2_tmp = np.array( [15.41] )
    # #B_A = B_B = B_C = B_D = 0
    # (B_A, B_B, B_D) = b1_tmp
    # B_C = 15.41

    (B_A, B_B, B_D, B_C) = B

    m = x_train.shape[0]
    J = 0
    dwa = np.zeros(2)
    dwb = np.zeros(2)
    dwd = np.zeros(2)
    dwc = np.zeros(3)

    dba = dbb = dbd = dbc = 0;

    for i in range(m):
        a0_1 = x_train[i][0]
        a0_2 = x_train[i][1]

        z1_A = W_A[0] * a0_1 + W_A[1] * a0_2 + B_A
        z1_B = W_B[0] * a0_1 + W_B[1] * a0_2 + B_B
        z1_D = W_D[0] * a0_1 + W_D[1] * a0_2 + B_D

        dz1_A_dw0 = a0_1
        dz1_A_dw1 = a0_2

        dz1_B_dw0 = a0_1
        dz1_B_dw1 = a0_2

        dz1_D_dw0 = a0_1
        dz1_D_dw1 = a0_2
        
        a1_A = sigmoid(z1_A)
        a1_B = sigmoid(z1_B)
        a1_D = sigmoid(z1_D)

        da1_A_z1_A = sigmoid_derivative_2(a1_A)
        da1_B_z1_B = sigmoid_derivative_2(a1_B)
        da1_D_z1_D = sigmoid_derivative_2(a1_D)

        z2 = W_C[0] * a1_A + W_C[1] * a1_B + W_C[2] * a1_D + B_C

        dz2_a1_A = W_C[0]
        dz2_a1_B = W_C[1]
        dz2_a1_D = W_C[2]

        dz2_w0 = a1_A
        dz2_w1 = a1_B
        dz2_w2 = a1_D

        a2 = sigmoid(z2)

        da2_z2 = sigmoid_derivative_2(a2)

        diff_plain = a2 - y_train[i]
        J = J + diff_plain / m

        err = diff_plain

        tmp_1 = err * da2_z2
        dj_dwc_0 = tmp_1 * dz2_w0
        dj_dwc_1 = tmp_1 * dz2_w1
        dj_dwc_2 = tmp_1 * dz2_w2

        dj_dbc = tmp_1
        dbc = dbc + dj_dbc / m

        dwc[0] = dwc[0] + dj_dwc_0 / m
        dwc[1] = dwc[1] + dj_dwc_1 / m
        dwc[2] = dwc[2] + dj_dwc_2 / m

        tmp_2 = tmp_1 * dz2_a1_B * da1_B_z1_B
        dj_dwb_0 = tmp_2 * dz1_B_dw0
        dj_dwb_1 = tmp_2 * dz1_B_dw1
        dwb[0] = dwb[0]  + dj_dwb_0 / m
        dwb[1] = dwb[1]  + dj_dwb_1 / m

        dj_dbb = tmp_2
        dbb = dbb + dj_dbb / m

        tmp_3 = tmp_1 * dz2_a1_A * da1_A_z1_A
        dj_dwa_0 = tmp_3 * dz1_A_dw0
        dj_dwa_1 = tmp_3 * dz1_A_dw1
        dwa[0] = dwa[0]  + dj_dwa_0 / m
        dwa[1] = dwa[1]  + dj_dwa_1 / m
        dj_dba = tmp_3
        dba = dba + dj_dba / m

        tmp_4 = tmp_1 * dz2_a1_D * da1_D_z1_D
        dj_dwd_0 = tmp_4 * dz1_D_dw0
        dj_dwd_1 = tmp_4 * dz1_D_dw1
        dwd[0] = dwd[0]  + dj_dwd_0 / m
        dwd[1] = dwd[1]  + dj_dwd_1 / m
        dj_dbd = tmp_4
        dbd = dbd + dj_dbd / m

    return (
        W_A - training_rate * dwa, 
        W_B - training_rate * dwb, 
        W_D - training_rate * dwd, 
        W_C - training_rate * dwc,
        B_A - training_rate * dba,
        B_B - training_rate * dbb,
        B_D - training_rate * dbd,
        B_C - training_rate * dbc,
        J, 
        a2)


In [None]:
def predict(x1, x2, W_A, W_B, W_D, W_C):    
    B_A = B_B = B_C = B_D = 0

    a0_1 = x1
    a0_2 = x2

    z1_1 = W_A[0] * a0_1 + W_A[1] * a0_2 + B_A
    z1_2 = W_B[0] * a0_1 + W_B[1] * a0_2 + B_B
    z1_3 = W_D[0] * a0_1 + W_D[1] * a0_2 + B_D

    a1_1 = sigmoid(z1_1)
    a1_2 = sigmoid(z1_2)
    a1_3 = sigmoid(z1_3)

    z2_1 = W_C[0] * a1_1 + W_C[1] * a1_2 + B_C + W_C[2] + a1_3

    a2_1 = sigmoid(z2_1)

    return a2_1

In [None]:
def create_coffee_dataset():
    """
    Copied from lab_coffee_utils
    """
    rng = np.random.default_rng(2)
    X = rng.random(400).reshape(-1,2)
    X[:,1] = X[:,1] * 4 + 11.5          # 12-15 min is best
    X[:,0] = X[:,0] * (285-150) + 150  # 350-500 F (175-260 C) is best
    Y = np.zeros(len(X))
    
    i=0
    for t,d in X:
        y = -3/(260-175)*t + 21
        if (t > 175 and t < 260 and d > 12 and d < 15 and d<=y ):
            Y[i] = 1
        else:
            Y[i] = 0
        i += 1

    return (X, Y.reshape(-1,1))

In [None]:
x_train, y_train = create_coffee_dataset()
print(x_train.shape, y_train.shape)

In [20]:
W_A = np.random.randn(2) #np.array([0.3092, 0.134])
W_B = np.random.randn(2) #np.array([0.233, 0.7862])
W_D = np.random.randn(2) #np.array([0.233, 0.7862])
W_C = np.random.randn(3) #np.array([0.3674, 0.63])
B = np.random.randn(4)
(B_A, B_B, B_D, B_C) = B
J = 1

norm_l = tf.keras.layers.Normalization(axis=-1)
norm_l.adapt(x_train)

sample_data = norm_l(x_train).numpy()

epochs = 10000
training_rate = 0.001

counter = 1.0
while abs(J) > 0.00001:
    W_A, W_B, W_D, W_C, B_A, B_B, B_D, B_C, J, a2 = model(sample_data, y_train, W_A, W_B, W_D, W_C, B, training_rate)
    B = np.array([B_A, B_B, B_D, B_C])
    counter = counter + 1
    if counter % 1000 == 0:
        print(f'{counter} runs completed, J = {J}')
        if counter > 1.5 * 10 ** 5:
            break;

print(f'It took {counter} runs to get J = {J}')
print(W_A, W_B, W_D, W_C, B_A, B_B, B_D, B_C, J, a2)

# Prediction
# m = len(x_train)

# inputs = norm_l(x_train).numpy()
# B = np.array([B_A, B_B, B_D, B_C])

# for i in range(m):
#     sample = inputs[i]
#     known_result = y_train[i]
#     _,_,_,_,_,_,_,_,J,prediction = model(np.array([sample]), known_result, W_A, W_B, W_D, W_C, B, 1)
#     print(f'[{i}] Y = {known_result}, prediction = {1 if prediction > 0.7 else 0 }, J = {J}')


1000.0 runs completed, J = [-0.0043319]
2000.0 runs completed, J = [-0.00486049]


KeyboardInterrupt: 

W_A = [-1.62277548  0.24141038]
W_B = [-0.18696642 -0.86103547]
W_D = [-1.02366511 -0.02199428]
W_C = [-0.52593289  1.29878606 -0.22726566]
B_A = [0.52489947]
B_B = [0.63349436]
B_D = [0.59407006]
B_C = [-1.41554785]
J = [0.049998]
prediction = [0.35265101]

In [None]:
m = len(x_train)

inputs = norm_l(x_train).numpy()
B = np.array([B_A, B_B, B_D, B_C])

print(f'W_A = {W_A}')
print(f'W_B = {W_B}')
print(f'W_D = {W_D}')
print(f'W_C = {W_C}')
print(f'B = {B.T}')

for i in range(m):
    sample = inputs[i]
    known_result = y_train[i]
    _,_,_,_,_,_,_,_,J,prediction = model(np.array([sample]), known_result, W_A, W_B, W_D, W_C, B, 1)
    print(f'[{i}] Y = {known_result}, prediction = {1 if prediction > 0.7 else 0 }, J = {J}')


In [None]:
W_1 = np.array([-1.62277548, 0.24141038])
W_2 = np.array([-0.18696642, -0.86103547])
W_3 = np.array([-1.02366511, -0.02199428])
W_4 = np.array([-0.52593289,  1.29878606, -0.22726566])
B_1 = 0.52489947
B_2 = 0.63349436
B_3 = 0.59407006
B_4 = -1.41554785

B_0 = np.array([B_1, B_2, B_3, B_4])

m = len(x_train)

inputs = norm_l(x_train).numpy()

for i in range(m):
    sample = inputs[i]
    known_result = y_train[i]
    _,_,_,_,_,_,_,_,J,prediction = model(np.array([sample]), known_result, W_1, W_2, W_3, W_4, B_0, 1)
    print(f'[{i}] Y = {known_result}, prediction = {1 if prediction > 0.7 else 0 }, J = {J}')


In [None]:
# Known good weights
W1_tmp = np.array( [[-8.93,  0.29, 12.9 ], [-0.1,  -7.32, 10.81]] )
W2_tmp = np.array( [[-31.18], [-27.59], [-32.56]] )

W_A = np.array([-8.93, -0.1])
W_B = np.array([0.29, -7.32])
W_D = np.array([12.9, 10.81])
W_C = np.array([-31.18, -27.59, -32.56])

b1_tmp = np.array( [-9.82, -9.28,  0.96] )
b2_tmp = np.array( [15.41] )
B = np.array( [-9.82, -9.28,  0.96, 15.41] )
(B_A, B_B, B_D, B_C) = B

for i in range(m):
    sample = inputs[i]
    known_result = y_train[i]
    _,_,_,_,_,_,_,_,J,prediction = model(np.array([sample]), known_result, W_A, W_B, W_D, W_C, B, 1)
    print(f'[{i}] Y = {known_result}, prediction = {1 if prediction > 0.7 else 0 }, J = {J}')
