In [25]:
# Onur Can 
# Project is done for Prof. Mehmet Gönen's DASC 521: Introduction to Machine Learning @ Koç University MSc Data Science Program
# Thanks Prof Mehmet for the dataset generation and instructions

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def safelog(x):
    """Natural logarithm that tolerates zeros.

    A tiny constant (1e-100) is added to ``x`` before ``np.log`` is applied,
    so an input of exactly 0 yields a large negative finite value instead
    of ``-inf``.
    """
    shifted = x + 1e-100
    return np.log(shifted)

## Parameters

In [26]:
# Fix the global NumPy seed (666) so the notebook is reproducible
np.random.seed(666)

# Mean vector (x1, x2) of each class, one row per class
class_means = np.array([
    [0.0, 2.5],
    [-2.5, -2.0],
    [2.5, -2.0],
])
print(class_means)

# 2x2 covariance matrix of each class, stacked along the first axis
class_covariances = np.array([
    [[3.2, 0.0], [0.0, 1.2]],
    [[1.2, 0.8], [0.8, 1.2]],
    [[1.2, -0.8], [-0.8, 1.2]],
])
print(class_covariances)

# Number of samples to draw for each class
class_sizes = np.array([120, 80, 100])
print(class_sizes)

## Data Generation

In [27]:
# Re-seed (666) right before sampling so the draws are reproducible even
# when this cell is re-run on its own
np.random.seed(666)

# Draw every class from its own bivariate Gaussian. The draws happen in
# class order (red, green, blue), which fixes the random stream.
points1, points2, points3 = [
    np.random.multivariate_normal(class_means[c], class_covariances[c], class_sizes[c])
    for c in range(3)
]
X = np.vstack((points1, points2, points3))

# Labels 1..3, each repeated as many times as its class has samples
y = np.repeat(np.array([1, 2, 3]), class_sizes).astype(int)
print(y,y.shape)
print(X.shape)

# Number of samples and number of classes
N = X.shape[0]
K = np.max(y)
print(N, K)

# One-of-K encoding: row i has a single 1 in column y[i] - 1
Y_truth = np.zeros((N, K), dtype=int)
Y_truth[np.arange(N), y - 1] = 1

## Plotting Training Data

In [28]:
# Scatter plot of the three generated point clouds, one colour per class
plt.figure(figsize=(6, 6))
for cloud, marker in ((points1, "r."), (points2, "g."), (points3, "b.")):
    plt.plot(cloud[:, 0], cloud[:, 1], marker, markersize=10)
plt.xlabel("x1")
plt.ylabel("x2")
plt.grid()
plt.show()

## Algorithm Parameters

In [29]:
# Hyper-parameters of the iterative training loop
eta = 1e-2      # step size applied to every parameter update
epsilon = 1e-3  # stop once the parameter change between iterations drops below this

## Parameter Initialization

In [30]:
# Start from small random weights in [-0.01, 0.01): one column per class,
# one row per input feature, plus a single row of biases
np.random.seed(666)
n_features = X.shape[1]
W = np.random.uniform(-0.01, +0.01, (n_features, K))
w0 = np.random.uniform(-0.01, +0.01, (1, K))

print(W,W.shape,w0,w0.shape)
# Note: stacking w0 under W and appending a column of ones to X would let the
# bias be handled by the same matrix product; the notebook keeps them separate.

## Sigmoid Function

$\textrm{sigmoid}(\boldsymbol{w}^{\top} \boldsymbol{x} + w_{0}) = \dfrac{1}{1 + \exp\left[-(\boldsymbol{w}^{\top} \boldsymbol{x} + w_{0})\right]}$

In [31]:
#define sigmoid function
def sigmoid(X, W, w0):
    """Element-wise logistic function of the linear scores ``X W + w0``.

    ``np.matmul(X, W)`` is computed first, ``w0`` is added via NumPy
    broadcasting, and the result is squashed with ``1 / (1 + exp(-z))``.
    The returned array has the same shape as ``np.matmul(X, W) + w0``.
    """
    linear_scores = np.matmul(X, W) + w0
    denominator = 1 + np.exp(-linear_scores)
    return 1 / denominator

## Estimated Gradient Functions


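For sigmoid outputs and the squared error $\textrm{Error} = \dfrac{1}{2}\sum\limits_{i = 1}^{N}\sum\limits_{c = 1}^{K} (y_{ic} - \widehat{y}_{ic})^{2}$, the gradients are given below. The functions in the next cell return the *negatives* of these expressions (the descent direction), which is why the update step adds them to the parameters.

\begin{align*}
\dfrac{\partial \textrm{Error}}{\partial \boldsymbol{w}_{c}} &= -\sum\limits_{i = 1}^{N} (y_{ic} - \widehat{y}_{ic})\,\widehat{y}_{ic}\,(1 - \widehat{y}_{ic})\,\boldsymbol{x}_{i} \\
\dfrac{\partial \textrm{Error}}{\partial w_{c0}} &= -\sum\limits_{i = 1}^{N} (y_{ic} - \widehat{y}_{ic})\,\widehat{y}_{ic}\,(1 - \widehat{y}_{ic})
\end{align*}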


In [32]:
# define the gradient functions
# Y TRUTH is a matrix
#
def gradient_W(X, Y_truth, Y_predicted):
    """Update direction for the weight matrix.

    Computes, for every class c, the sum over samples of
    ``(y_ic - yhat_ic) * yhat_ic * (1 - yhat_ic) * x_i``. The caller adds
    ``eta`` times this value to ``W``.

    The number of classes is taken from the columns of ``Y_truth`` rather
    than from a notebook-level global, so the function works for any number
    of classes and does not depend on cell execution order.

    Parameters
    ----------
    X : array-like, shape (N, D)
        Input samples, one per row.
    Y_truth : array-like, shape (N, K)
        One-of-K encoded true labels.
    Y_predicted : array-like, shape (N, K)
        Sigmoid outputs for every sample and class.

    Returns
    -------
    numpy.ndarray, shape (D, K)
        One column of weight updates per class.
    """
    X = np.asarray(X)
    Y_truth = np.asarray(Y_truth)
    Y_predicted = np.asarray(Y_predicted)

    # Per-sample, per-class error signal: residual times the sigmoid slope
    delta = (Y_truth - Y_predicted) * Y_predicted * (1 - Y_predicted)

    # (D, N) @ (N, K) sums over the samples for all classes at once
    return np.matmul(X.T, delta)

def gradient_w0(Y_truth, Y_predicted):
    """Update direction for the bias terms.

    For every class (column) this sums
    ``(y - yhat) * yhat * (1 - yhat)`` over all samples (rows) and returns
    one value per class.
    """
    residual = Y_truth - Y_predicted
    per_sample = residual * Y_predicted * (1 - Y_predicted)
    return np.sum(per_sample, axis=0)



In [33]:
# Learn W and w0 iteratively: at every step move the parameters by eta times
# the directions returned by gradient_W / gradient_w0, and stop once the
# Euclidean norm of the parameter change falls below epsilon.
iteration = 1
objective_values = []
while True:
    Y_predicted = sigmoid(X, W, w0)

    # NOTE(review): this records -sum(y * log(yhat)), while the update
    # directions above follow the squared-error formula in the markdown cell;
    # confirm which objective the convergence plot is meant to show.
    objective_values.append(-np.sum(Y_truth * safelog(Y_predicted)))

    W_old = W
    w0_old = w0

    # These expressions build new arrays, so W_old / w0_old keep the old values
    W = W + eta * gradient_W(X, Y_truth, Y_predicted)
    w0 = w0 + eta * gradient_w0(Y_truth, Y_predicted)

    # Sum of *squared* changes for both W and w0. Squaring the sum of the
    # bias changes instead would let positive and negative changes cancel
    # and could stop the loop too early.
    parameter_change = np.sqrt(np.sum((w0 - w0_old)**2) + np.sum((W - W_old)**2))
    if parameter_change < epsilon:
        break

    iteration = iteration + 1

# Collected in a list (cheap appends) and converted once at the end
objective_values = np.asarray(objective_values)
print(W)
print(w0)

## Convergence

In [34]:
# Objective value recorded at each training iteration
plt.figure(figsize=(10, 6))
iteration_axis = np.arange(1, iteration + 1)
plt.plot(iteration_axis, objective_values, "k-")
plt.xlabel("Iteration")
plt.ylabel("Error")
plt.show()

## Training Performance

In [35]:
# calculate confusion matrix
y_predicted = np.argmax(Y_predicted, axis = 1) + 1
confusion_matrix = pd.crosstab(y_predicted, y, rownames = ['y_pred'], colnames = ['y_truth'])
print(confusion_matrix)

## Visualization

In [36]:
# Evaluate the linear discriminant of every class on a regular grid
x1_interval = np.linspace(-8, +8, 1201)
x2_interval = np.linspace(-8, +8, 1201)
x1_grid, x2_grid = np.meshgrid(x1_interval, x2_interval)

# meshgrid (default indexing) returns arrays of shape (len(x2), len(x1)), so
# allocate from the grid's own shape instead of assuming both axes are equal
discriminant_values = np.zeros(x1_grid.shape + (K,))
for c in range(K):
    discriminant_values[:,:,c] = W[0, c] * x1_grid + W[1, c] * x2_grid + w0[0, c]

# A, B and C are views into discriminant_values, so the masking below edits
# discriminant_values in place. Each class score is blanked out wherever it is
# strictly the smallest of the three, so that every pairwise difference is
# only drawn where the two classes involved are the top two candidates.
A = discriminant_values[:,:,0]
B = discriminant_values[:,:,1]
C = discriminant_values[:,:,2]
A[(A < B) & (A < C)] = np.nan
B[(B < A) & (B < C)] = np.nan
C[(C < A) & (C < B)] = np.nan

plt.figure(figsize = (10, 10))
plt.plot(X[y == 1, 0], X[y == 1, 1], "r.", markersize = 10)
plt.plot(X[y == 2, 0], X[y == 2, 1], "g.", markersize = 10)
plt.plot(X[y == 3, 0], X[y == 3, 1], "b.", markersize = 10)
# Circle the training points whose predicted label differs from the truth
plt.plot(X[y_predicted != y, 0], X[y_predicted != y, 1], "ko", markersize = 12, fillstyle = "none")
# levels=[0] draws exactly the zero contour (the decision boundary); an
# integer `levels` would be interpreted as a number of automatic levels
plt.contour(x1_grid, x2_grid, A - B, levels = [0], colors = "k")
plt.contour(x1_grid, x2_grid, A - C, levels = [0], colors = "k")
plt.contour(x1_grid, x2_grid, B - C, levels = [0], colors = "k")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.show()