In [44]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
N = 100  # total number of samples
D = 2    # number of input features per sample
n_class0 = N // 2  # size of the first class; the remaining samples form the second class

# Draw from a dedicated, seeded generator so that every "Restart & Run All" produces
# the same data set, without altering NumPy's global random state.
data_rng = np.random.RandomState(42)
X = data_rng.randn(N, D)

X[:n_class0, :] -= 2  # first class: Gaussian cloud centred at (-2, -2)
X[n_class0:, :] += 2  # second class: Gaussian cloud centred at (2, 2)

# Targets: 0 for the first class, 1 for the second.
T = np.array([0] * n_class0 + [1] * (N - n_class0))

# Column of ones so the bias can be learned as an ordinary weight.
ones = np.ones((N, 1), dtype=int)

Xb = np.concatenate((ones, X), axis=1)  # axis=1: stack column-wise -> shape (N, D + 1)

In [62]:
Xb[:3]  # peek at the first three rows: bias column followed by the two features

array([[ 1.        , -1.62826271, -2.19338468],
       [ 1.        , -1.72176812, -1.54939791],
       [ 1.        , -3.04954825, -0.99874205]])

In [63]:
# Start from random weights: one per feature plus one for the bias column.
w = np.random.randn(D + 1)

# Linear activation of every sample under the initial weights.
z = np.dot(Xb, w)

In [64]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Linear activations.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1] with the same shape as ``z``.

    Notes
    -----
    The naive form ``1 / (1 + np.exp(-z))`` overflows inside ``exp`` for large
    negative ``z``. Here the value is computed as ``exp(-log(1 + exp(-z)))``,
    where ``np.logaddexp(0, -z)`` evaluates ``log(1 + exp(-z))`` without ever
    exponentiating a large positive number.
    """
    return np.exp(-np.logaddexp(0, -np.asarray(z, dtype=float)))

In [65]:
# Predictions under the initial random weights: sigmoid of the linear activations z.
Y = sigmoid(z)

In [66]:
Y[:10]  # first ten predictions

array([ 0.66621626,  0.54868782,  0.3327221 ,  0.47955688,  0.74602761,
        0.42356914,  0.2713722 ,  0.37904611,  0.79092609,  0.38434824])

In [67]:
# Cross Entropy Error Function

def cross_entropy(T, Y):
    """Return the total binary cross-entropy between targets and predictions.

    Parameters
    ----------
    T : array-like
        Targets (the true class). An entry equal to 1 is treated as the
        positive class; any other value is treated as the negative class.
    Y : array-like
        Predictions, i.e. sigmoid outputs between 0 and 1, same shape as ``T``.

    Returns
    -------
    numpy.float64
        ``-sum(log(Y[i]) if T[i] == 1 else log(1 - Y[i]))`` over all entries.

    Notes
    -----
    The sum runs over the arrays that are passed in, not over a notebook-level
    sample count, so the function works for inputs of any length.
    Predictions are clipped to ``[eps, 1 - eps]`` (``eps`` = float64 machine
    epsilon) so a prediction that has saturated to exactly 0 or 1 yields a
    large finite error instead of ``inf``.
    """
    T = np.asarray(T)
    eps = np.finfo(float).eps
    Y = np.clip(np.asarray(Y, dtype=float), eps, 1 - eps)
    # Pick log(Y) for positive targets and log(1 - Y) for everything else.
    log_likelihood = np.where(T == 1, np.log(Y), np.log(1 - Y))
    return -np.sum(log_likelihood)

In [68]:
# Batch gradient descent on the cross-entropy error: 100 weight updates.
learning_rate = 0.1
for i in range(100):
    # Report the error on every 10th pass so we can watch it shrink.
    if not i % 10:
        print(cross_entropy(T, Y))

    # Descending the negative log-likelihood is the same as ascending the
    # log-likelihood:
    #   w = w - learning_rate * d(-log likelihood)/dw
    #     = w + learning_rate * d(log likelihood)/dw
    # and the log-likelihood gradient is Xb^T (T - Y).
    w += learning_rate * np.dot(Xb.T, T - Y)

    # Refresh the predictions using the updated weights.
    Y = sigmoid(np.dot(Xb, w))

119.827251683
0.001622955019
0.00161898818787
0.00161504137614
0.0016111144319
0.00160720720479
0.00160331954596
0.00159945130805
0.00159560234518
0.00159177251293


In [69]:
# We can see that the value of our error term decreases with each iteration of Gradient Descent.

In [70]:
# Show the weights reached after the last gradient-descent step
# (printed as [bias, weight for feature 1, weight for feature 2]).
print("Final w: ", w)

Final w:  [  0.43916758  12.7402527   14.26925868]


In [None]:
# So the first value (the bias) is close to 0, as in the closed-form solution, but the other two weights are very large. We will address that shortly.