# Gradient Descent in Logistic Regression

## Traditional Algorithm (using for loops)

In [None]:
import numpy as np

# m = the size of the dataset
# n = the number of features in one sample (one group of data)
# w = the weight vector, one weight per feature, (n, 1)
# b = the intercept (bias) value
# x = the vector of one group of data in the dataset, x^(i), (n, 1)
# z = output value of the linear transformation, z^(i)
# a = output value of the activation function, the predicted value, a^(i)
# J = the overall average loss
# y = one actual value in a group of data, y^(i)
# dz = the value for dL/dz in one data group, dz^(i)
# dw = the vector for the average dL/dw_i in the dataset, (n, 1)
# db = the value for the average dL/db in the dataset

def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of z.

    z is passed straight to np.exp, so a scalar or a NumPy array both
    work; for an array the result is computed element-wise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
# one step (iteration) of gradient descent

# Accumulate the average loss J and the average gradients dw, db over the
# dataset with an explicit loop over the samples.
#
# This cell expects the following names to exist already:
#   m    - number of samples
#   w, b - current weights, (n, 1), and current intercept
#   X    - dataset matrix, one sample per column, (n, m)
#   Y    - actual values, one per sample (any layout with m entries)
n = 2
J = 0.0
db = 0.0  # gradient accumulator for b (b itself is a parameter, not reset here)
dw = np.zeros((n, 1))  # np.zeros takes the shape as a single tuple

labels = np.ravel(Y)  # flat view so labels[i] is y^(i) for (1, m), (m, 1) or (m,)

for i in range(m):
    x = X[:, i].reshape(n, 1)  # x^(i) as a column vector
    y = labels[i]  # y^(i)
    # .item() turns the (1, 1) product into a scalar so J and db stay scalars
    z = np.dot(np.transpose(w), x).item() + b  # z^(i) = w^T * x^(i) + b
    a = sigmoid(z)
    J += -(y * np.log(a) + (1 - y) * np.log(1 - a))
    dz = a - y
    dw += x * dz
    db += dz

# turn the accumulated sums into averages over the dataset
J /= m
dw /= m
db /= m
    

## Vectorized Algorithm (without explicit for loops)

In [None]:
import numpy as np

# m = the size of the dataset
# n = the number of features in one sample (one group of data)
# w = the weight vector, one weight per feature, (n, 1)
# b = the intercept (bias) value
# X = the dataset matrix, [x^(1), x^(2), ..., x^(m)], (n, m)
# Z = output vector of the linear transformation, [z^(1), z^(2), ..., z^(m)], (1, m)
# A = output vector of the activation function, the predicted values, [a^(1), a^(2), ..., a^(m)], (1, m)
# J = the overall average loss
# Y = the vector of actual values, [y^(1), y^(2), ..., y^(m)], (1, m)
# dZ = the vector of dL/dz^(i), [dz^(1), dz^(2), ..., dz^(m)], (1, m)
# dw = the vector for the average dL/dw_i in the dataset, (n, 1)
# db = the value for the average dL/db in the dataset
# alpha = the learning rate


In [None]:
# one step (iteration) of gradient descent

# One vectorized gradient-descent step.
#
# This cell expects the following names to exist already:
#   m     - number of samples
#   w, b  - current weights, (n, 1) float array, and current intercept;
#           initialise them once before the first step, not inside it,
#           otherwise every step would start again from the initial values
#   X     - dataset matrix, one sample per column, (n, m)
#   Y     - actual values as a row vector, (1, m); an (m, 1) column would
#           broadcast against A into an (m, m) matrix
#   alpha - learning rate

# forward propagation
Z = np.dot(np.transpose(w), X) + b
A = sigmoid(Z)
# element-wise cross-entropy summed over the samples, then averaged
J = -np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A)) / m

# back propagation
dZ = A - Y
dw = np.dot(X, np.transpose(dZ)) / m  # (x^(1) * dz^(1) + ... + x^(m) * dz^(m)) / m
db = np.sum(dZ) / m  # stored in db so the intercept b is not overwritten

# parameter update
w -= alpha * dw
b -= alpha * db

# to run p gradient-descent steps, an explicit for loop over the steps is still needed