In [3]:
import numpy as np

In [4]:
N = 100  # total number of samples
D = 2    # number of input features

# Size of the first class; derived from N so the slices below stay valid if N changes
half = N // 2

X = np.random.randn(N, D)

# Center the first half of the points at (-2, -2)
X[:half, :] = X[:half, :] - 2*np.ones((half, D))

# Center the remaining points at (2, 2)
X[half:, :] = X[half:, :] + 2*np.ones((N - half, D))

# Labels: the first half are 0 and the rest are 1
T = np.array([0]*half + [1]*(N - half))

# Prepend a column of ones which acts as the bias term
ones = np.ones((N, 1))
Xb = np.concatenate((ones, X), axis = 1)

In [5]:
# Start from random weights: one per input feature plus one for the bias column
w = np.random.randn(D + 1)

# Linear model output (activation) for every sample
z = np.dot(Xb, w)

# Logistic sigmoid
def sigmoid(z):
    """Apply the logistic function 1 / (1 + exp(-z)) elementwise to z."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# Predicted probability of class 1 for every sample, using the initial random weights
Y = sigmoid(z)

In [8]:
# Sanity check: one prediction per sample
Y.shape

(100,)

In [9]:
# Peek at the first ten predictions
Y[:10]

array([ 0.80228785,  0.82521011,  0.78593005,  0.7975942 ,  0.74615277,
        0.79880715,  0.8048046 ,  0.81400267,  0.77364124,  0.92784619])

In [10]:
# Calculate the Cross Entropy Error
def cross_entropy(T, Y, eps=1e-15):
    """Return the total binary cross-entropy between targets and predictions.

    Parameters
    ----------
    T : array-like
        Target labels. Entries equal to 1 contribute -log(Y); every other
        entry contributes -log(1 - Y).
    Y : array-like
        Predicted probabilities, same shape as T.
    eps : float, optional
        Predictions are clipped to [eps, 1 - eps] before taking the log so
        that a saturated prediction (exactly 0 or 1) gives a large finite
        error instead of inf/nan.

    Returns
    -------
    float
        Sum of the per-sample cross-entropy terms.

    Raises
    ------
    ValueError
        If T and Y do not have the same shape.
    """
    T = np.asarray(T)
    Y = np.asarray(Y, dtype=float)
    if T.shape != Y.shape:
        raise ValueError(
            "T and Y must have the same shape, got %s and %s" % (T.shape, Y.shape)
        )

    # Keep log() away from 0 on both the Y and the (1 - Y) side
    Y = np.clip(Y, eps, 1 - eps)

    # Use the inputs' own length rather than a global sample count
    per_sample = np.where(T == 1, np.log(Y), np.log(1 - Y))
    return -np.sum(per_sample)

In [11]:
# Run 100 steps of gradient descent, reporting the error on every 10th step
learning_rate = 0.1
for step in range(100):
    report_now = (step % 10 == 0)
    if report_now:
        print(cross_entropy(T, Y))

    # Error-driven term minus a penalty that pulls each weight toward zero (L2 regularization)
    gradient = Xb.T.dot(T - Y) - 0.1*w
    w += learning_rate * gradient

    # Refresh the predictions with the updated weights
    Y = sigmoid(Xb.dot(w))

157.644206873
1.56241673687e-05
4.51865668077e-05
0.000118275578087
0.000283000045228
0.000624562302814
0.00128151359433
0.00246184891091
0.00445450708055
0.00763021839886


In [12]:
# Learned weights: bias first, then one weight per input feature
print("Final w: ", w)

Final w:  [-0.03623634  5.66075953  6.28959367]


In [None]:
# So, we can see that the weights have moved toward [0, 4, 4], which is what we get from the closed-form solution.
# The values of w are smaller now.
# Since L2 regularization assumes the weights are normally distributed around 0, we get weight values much closer to zero.