In [17]:
# Onur Can 
# Project is done for Prof. Mehmet Gönen's DASC 521: Introduction to Machine Learning @ Koç University MSc Data Science Program
# Thanks Prof Mehmet for the dataset generation and instructions

#importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import math as math
import pandas as pd


## Parameters for synthetic data

In [18]:
# Synthetic-data configuration.
# Seed the global NumPy RNG so every run draws the same samples.
np.random.seed(666)

# Mean vector of each class: one row per class, one column per feature.
class_means = np.array([
    [0.0, 2.5],
    [-2.5, -2.0],
    [2.5, -2.0],
])

# Covariance matrix of each class, stacked along the first axis.
class_covariance_matrix = np.array([
    [[3.2, 0.0], [0.0, 1.2]],
    [[1.2, 0.8], [0.8, 1.2]],
    [[1.2, -0.8], [-0.8, 1.2]],
])

# Number of samples to generate for each class.
class_size = np.array([120, 80, 100])

# Feature dimensionality, read off the mean vectors.
d = class_means.shape[1]

print(class_means, class_means.shape)
print(class_covariance_matrix, class_covariance_matrix.shape)
print(class_size)

## Data Generation

In [19]:
# Draw the samples class by class from a bivariate normal distribution.
# The draw order (class 1, then 2, then 3) is kept so the seeded random stream
# produces the same points as before.
point1, point2, point3 = [
    np.random.multivariate_normal(mean, cov, count)
    for mean, cov, count in zip(class_means, class_covariance_matrix, class_size)
]
points = np.vstack((point1, point2, point3))

# Label vector: 1, 2, 3, each repeated as many times as its class has samples.
y = np.repeat(np.arange(1, 4), class_size)

## Initial Plotting

In [20]:
# Scatter plot of the generated samples, one colour per class
# (red = class 1, green = class 2, blue = class 3).
plt.figure(figsize=(10, 6))
for class_points, marker in ((point1, "r."), (point2, "g."), (point3, "b.")):
    plt.plot(class_points[:, 0], class_points[:, 1], marker, markersize=10)
plt.xlabel("x_1")
plt.ylabel("x_2")
plt.grid()
plt.show()

## Parameter Estimation

$\widehat{\mu_{c}} = \dfrac{\sum\limits_{i = 1}^{N} x_{i} \mathbb{1}(y_{i} = c)}{\sum\limits_{i = 1}^{N} \mathbb{1}(y_{i} = c)}$

In [21]:
# Sample mean of every class, estimated feature by feature.
# Number of classes: labels run from 1 to K.
K = np.max(y)

# Feature columns of the full data set.
x1 = points[:, 0]
x2 = points[:, 1]

# Per-class mean of each feature, selected through the label vector.
class_labels = range(1, K + 1)
sample_means_x1_feature = [np.mean(x1[y == label]) for label in class_labels]
sample_means_x2_feature = [np.mean(x2[y == label]) for label in class_labels]

# Arrange as one row per class, one column per feature.
sample_mean = np.column_stack((sample_means_x1_feature, sample_means_x2_feature))
mean1, mean2, mean3 = sample_mean[0, :], sample_mean[1, :], sample_mean[2, :]
print(sample_mean)

In [22]:
# Covariance estimation.
# Two entries of the class-1 covariance matrix are first computed by hand from the
# labelled points (N - 1 in the denominator, matching np.cov's default), just to
# show an alternative to np.cov.
Label_1_Datapoints = points[y == 1]
class1_dof = Label_1_Datapoints.shape[0] - 1
class1_x1_centered = Label_1_Datapoints[:, 0] - sample_mean[0][0]
class1_x2_centered = Label_1_Datapoints[:, 1] - sample_mean[0][1]
# Sigma i=1, j=1: variance of x1 within class 1.
e1_i1j1 = np.sum(class1_x1_centered ** 2) / class1_dof
# Sigma i=1, j=2: covariance of x1 and x2 within class 1.
e1_i1j2 = np.sum(class1_x1_centered * class1_x2_centered) / class1_dof
# The second value is the x1-x2 cross term, so it is labelled as such
# (it was previously mislabelled "x1^2").
print("x1^2 = ", e1_i1j1, " x1*x2 = ", e1_i1j2, "\n")

# Full 2x2 covariance matrix of every class via np.cov, stacked one per class.
cov1 = np.cov(point1[:, 0], point1[:, 1])
cov2 = np.cov(point2[:, 0], point2[:, 1])
cov3 = np.cov(point3[:, 0], point3[:, 1])
sample_covariance_matrix = np.stack([cov1, cov2, cov3])
print(sample_covariance_matrix)

In [23]:
# Prior probability of each class: the fraction of labels equal to that class.
class_priors = [np.mean(y == label) for label in range(1, K + 1)]
print(class_priors)

## Parametric Classification

In [24]:
# Quadratic discriminant score of every sample for every class.
#
# For class c with estimated mean mu_c, covariance Sigma_c and prior P(c):
#   g_c(x) = -d/2 * log(2*pi) - 1/2 * log|Sigma_c|
#            - 1/2 * (x - mu_c)^T Sigma_c^(-1) (x - mu_c) + log P(c)
# which is the quadratic form g_c(x) = x^T W_c x + w_c^T x + w_c0 with
#   W_c  = -0.5 * Sigma_c^(-1)
#   w_c  = Sigma_c^(-1) mu_c
#   w_c0 = -0.5 * mu_c^T Sigma_c^(-1) mu_c - d/2 * log(2*pi)
#          - 0.5 * log|Sigma_c| + log P(c)
#
# score_values[i, c] holds g_(c+1)(points[i]); its shape follows the data
# (number of samples x number of classes) instead of being hard-coded.
n_samples = points.shape[0]
n_classes = sample_covariance_matrix.shape[0]
score_values = np.zeros((n_samples, n_classes))

for c in range(n_classes):
    # Per-class quantities do not depend on the sample, so compute them once.
    cov_inverse = np.linalg.inv(sample_covariance_matrix[c])
    log_det = np.log(np.linalg.det(sample_covariance_matrix[c]))
    class_constant = (-0.5 * d * np.log(2 * math.pi)) - 0.5 * log_det + np.log(class_priors[c])

    # Squared Mahalanobis distance of every sample to the class mean:
    # row-wise (x - mu)^T Sigma^(-1) (x - mu).
    centered = points - sample_mean[c]
    mahalanobis_sq = np.sum(np.matmul(centered, cov_inverse) * centered, axis=1)

    score_values[:, c] = class_constant - 0.5 * mahalanobis_sq

### Data Comparison for False Predictions

In [25]:
# Predicted label of every training sample: the class with the highest score.
# np.argmax returns the first maximum in each row, and labels are 1-based, hence +1.
# The result is kept as a float array of shape (1, N) because the following
# cells index it as predictions[0][...] / comparison[0][...].
predictions = (np.argmax(score_values, axis=1) + 1).astype(float).reshape(1, -1)

# Boolean mask of false predictions; broadcasting y (N,) against predictions (1, N)
# yields shape (1, N), True where the predicted label differs from the true one.
comparison = np.array(y != predictions)

## Confusion Matrix

In [26]:
# Confusion matrix: rows are predicted labels, columns are true labels.
# Flatten the predictions to 1-D integers; -1 lets NumPy infer the length
# instead of hard-coding the number of samples.
y_predicted = np.reshape(predictions, (-1,)).astype(int)
print(y_predicted.shape, y.shape)
confusion_matrix = pd.crosstab(y_predicted, y, rownames=['y_pred'], colnames=['y_truth'])    #PANDAS
print(confusion_matrix)

## Final Plotting

In [27]:
# Collect the misclassified samples for plotting.
# `comparison` is flattened to one boolean per sample so it can be used directly
# as a row mask on `points`.
misclassified_mask = np.asarray(comparison, dtype=bool).reshape(-1)
wrong_point_count = int(np.count_nonzero(misclassified_mask))
print("wrong point count ", wrong_point_count)

# Boolean-mask selection sizes the arrays to the actual number of wrong points.
# Fixed-size buffers of length 4 would overflow with more than 4 errors and
# would add spurious (0, 0) points to the plot with fewer than 4.
x1_false = points[misclassified_mask, 0]
x2_false = points[misclassified_mask, 1]
print(x1_false,x2_false)

# Final plot: all samples coloured by true class, with the misclassified ones circled.
plt.figure(figsize = (10,6))
plt.plot(point1[:,0],point1[:,1],"r.", markersize = 10)
plt.plot(point2[:,0],point2[:,1],"g.", markersize = 10)
plt.plot(point3[:,0],point3[:,1],"b.", markersize = 10)
plt.scatter(x1_false,x2_false, s=180, facecolors="none", edgecolors='r')
plt.title("Circled Values are miss-estimated by the Algorithm")
plt.xlabel("x_1")
plt.ylabel("x_2")
plt.grid()
plt.show()