# GDA Implementation.

Implement the Gaussian Discriminant Analysis (GDA) learning algorithm following the steps as discussed in class.

INSTRUCTION: Rename your notebook as: <br>
`firstName_LastName_Live_coding_GDA.ipynb`.

Notes: 
* Do not use any built-in functions to complete a task;
* Do not import additional libraries.

In [121]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [122]:
# Generate data
def generate_data():
    """Create the synthetic classification dataset used in this notebook.

    The dataset is produced by ``make_classification`` with a fixed
    ``random_state`` so that the same samples are generated on every call:
    1000 samples, 3 features (all informative, none redundant) and a single
    cluster per class.

    Returns:
        tuple: ``(x, y)`` exactly as returned by ``make_classification``;
        the feature matrix first, the label vector second.
    """
    features, labels = make_classification(
        n_samples=1000,
        n_features=3,
        n_informative=3,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=1,
    )
    return features, labels

# Build the dataset once and print the shapes of the feature matrix x and the label vector y.
x,y= generate_data()
print(x.shape, y.shape)

(1000, 3) (1000,)


In [123]:
# List the distinct labels in y to see how many classes the data contains.

np.unique(y)

# Outcome: array([0, 1]), i.e. there are two classes, labelled 0 and 1.

array([0, 1])

In [124]:
def split_data(x, y, train_size=0.8, random_state=None):
    """Shuffle the samples and split them into a training and a test set.

    Args:
        x: Array with one sample per row (first axis is the sample axis).
        y: Array of targets whose first axis has the same length as ``x``.
        train_size: Fraction of the samples, in ``[0, 1]``, that goes to the
            training set. The resulting count is truncated with ``int``.
        random_state: Optional seed. When given, the shuffle is driven by a
            dedicated ``np.random.default_rng(random_state)`` generator, which
            makes the split reproducible. When ``None`` (the default) the
            global NumPy random state is used.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``x`` and ``y`` disagree on the number of samples, or
            if ``train_size`` lies outside ``[0, 1]``.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    n = x.shape[0]
    if y.shape[0] != n:
        raise ValueError(
            f"x and y must have the same number of samples, got {n} and {y.shape[0]}"
        )
    # A negative fraction would turn into a negative slice bound and silently
    # produce a wrong split, so reject out-of-range values up front.
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(f"train_size must be in [0, 1], got {train_size}")

    # Shuffle an index vector rather than the data so x and y stay aligned.
    idx = np.arange(n)
    if random_state is None:
        np.random.shuffle(idx)
    else:
        np.random.default_rng(random_state).shuffle(idx)

    m = int(n * train_size)
    train_idx, test_idx = idx[:m], idx[m:]

    return x[train_idx], x[test_idx], y[train_idx], y[test_idx]


In [125]:
# 80/20 train/test split. No random seed is set in the cells shown, so the
# shuffle inside split_data() -- and every number derived from it below --
# can change from one run to the next.
X_train, X_test, y_train, y_test= split_data(x,y, train_size= 0.8) # returns x_train, x_test, y_train, y_test, in this order
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(800, 3) (800,) (200, 3) (200,)


In [126]:
def covariance(x, mu, ddof=1):
    """Compute the covariance matrix of the columns of ``x`` around ``mu``.

    Entry ``(i, j)`` is ``sum((x[:, i] - mu[i]) * (x[:, j] - mu[j])) / (N - ddof)``.
    All entries are obtained at once from a single matrix product of the
    centred data with itself, which also makes the symmetry of the result
    explicit. ``np.cov(x, rowvar=0)`` gives the same matrix when ``mu`` is the
    column-wise mean and can be used to cross-check the output, but it is
    deliberately not used here.

    Args:
        x: 2-D array of shape (N, D), one sample per row.
        mu: 1-D array of length D, the point the data is centred on.
        ddof: Delta degrees of freedom; the divisor is ``N - ddof``. The
            default of 1 gives the unbiased sample covariance.

    Returns:
        numpy.ndarray: The (D, D) covariance matrix.

    Raises:
        ValueError: If ``x`` is not 2-D, ``mu`` does not have D entries, or
            there are too few samples for the requested ``ddof``.
    """
    x = np.asarray(x, dtype=float)
    if x.ndim != 2:
        raise ValueError(f"x must be 2-D (N, D), got an array with {x.ndim} dimension(s)")

    n_samples, n_features = x.shape
    mu = np.asarray(mu, dtype=float).reshape(-1)
    if mu.shape[0] != n_features:
        raise ValueError(
            f"mu must have {n_features} entries to match x, got {mu.shape[0]}"
        )
    # Without this guard a single sample would divide 0 by 0 and quietly
    # return a matrix of NaNs.
    if n_samples - ddof <= 0:
        raise ValueError(
            f"need more than {ddof} sample(s) to estimate a covariance, got {n_samples}"
        )

    centered = x - mu
    return centered.T @ centered / (n_samples - ddof)
  
  

In [127]:
# Sanity check for the covariance() function:
# compute it on the full dataset so it can be compared with np.cov(x, rowvar=0) in the next cell.
covariance_x = covariance(x, x.mean(0))
covariance_x 

array([[1.84495325, 0.02790646, 1.00137533],
       [0.02790646, 1.00170721, 0.05539176],
       [1.00137533, 0.05539176, 1.74832   ]])

In [128]:
# Reference value from NumPy, used only to validate covariance();
# rowvar=0 tells np.cov that each column (not each row) is a variable.
cov_x=np.cov(x, rowvar=0)
cov_x


array([[1.84495325, 0.02790646, 1.00137533],
       [0.02790646, 1.00170721, 0.05539176],
       [1.00137533, 0.05539176, 1.74832   ]])

In [129]:
# The results are identical!  :)

In [130]:
class GDA:
    """Gaussian Discriminant Analysis with one full covariance matrix per class.

    After ``fit`` the model holds, for ``k`` classes and ``d`` features:

    * ``classes_`` -- the sorted distinct labels seen in ``y``, shape (k,)
    * ``phi``      -- class priors, shape (k,)
    * ``mu``       -- per-class means, shape (k, d)
    * ``sigma``    -- per-class covariance matrices, shape (k, d, d)

    Row ``j`` of ``phi``/``mu``/``sigma`` belongs to ``classes_[j]``.
    """

    def __init__(self):
        # Parameters are unknown until fit() is called.
        self.mu = None
        self.phi = None
        self.sigma = None
        self.classes_ = None

    def fit(self, x, y):
        """Estimate priors, means and covariances from labelled data.

        Args:
            x: Array of shape (m, d), one sample per row.
            y: Array of m class labels. Labels may be any values that
                ``np.unique`` can sort; they need not be ``0..k-1``.

        Raises:
            ValueError: If ``x`` and ``y`` disagree on the number of samples.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)

        m, d = x.shape  # number of examples, input dimension
        if y.shape[0] != m:
            raise ValueError(
                f"x and y must have the same number of samples, got {m} and {y.shape[0]}"
            )

        # Iterate over the labels that actually occur instead of assuming
        # they are exactly 0..k-1.
        self.classes_ = np.unique(y)
        k = len(self.classes_)

        self.mu = np.zeros((k, d))        # each row is one class mean
        self.sigma = np.zeros((k, d, d))  # each slice is one class covariance
        self.phi = np.zeros(k)            # class priors

        for idx, label in enumerate(self.classes_):
            mask = (y == label)
            self.phi[idx] = np.sum(mask) / m
            self.mu[idx] = np.mean(x[mask], axis=0)
            # Delegates to the notebook-level covariance(x, mu) helper.
            self.sigma[idx] = covariance(x[mask], self.mu[idx])

    def predict_proba(self, x):
        """Return the joint density ``p(x | y=j) * p(y=j)`` for every class.

        Despite the name, the values are not normalised: rows do not sum
        to 1. They are the Gaussian class-conditional densities weighted by
        the class priors, which is all ``predict`` needs for the arg-max.

        Args:
            x: Array of shape (n, d); a 1-D array of length d is treated as a
                single sample.

        Returns:
            numpy.ndarray: Array of shape (n, k); column ``j`` corresponds to
            row ``j`` of ``mu``/``sigma``/``phi``.
        """
        x = np.atleast_2d(np.asarray(x, dtype=float))
        n, d = x.shape
        k_class = self.mu.shape[0]

        prob = np.zeros((n, k_class))
        for lab in range(k_class):
            # The inverse and determinant depend only on the class, so they
            # are computed once per class rather than once per sample.
            sigma_inv = np.linalg.inv(self.sigma[lab])
            norm_const = 1 / ((2 * np.pi) ** (d / 2) * np.linalg.det(self.sigma[lab]) ** 0.5)

            diff = x - self.mu[lab]
            # Row-wise quadratic form (x - mu)^T Sigma^{-1} (x - mu).
            maha = np.sum((diff @ sigma_inv) * diff, axis=1)

            prob[:, lab] = norm_const * np.exp(-0.5 * maha) * self.phi[lab]

        return prob

    def predict(self, x):
        """Return, for each sample, the label of the class with the largest score.

        When the model parameters were assigned by hand and ``classes_`` is
        unset, the class index (row number in ``mu``) is returned instead.
        """
        best = np.argmax(self.predict_proba(x), axis=1)
        classes = getattr(self, "classes_", None)
        if classes is None:
            return best
        return classes[best]

    def accuracy(self, y, ypreds):
        """Return the percentage (0-100) of entries where ``y`` equals ``ypreds``."""
        # Converting to arrays keeps the comparison element-wise even when
        # plain Python lists are passed in.
        return (np.mean(np.asarray(y) == np.asarray(ypreds)) * 100)

In [131]:
# Fit GDA on the training split, then display the learned per-class covariance matrices (k x d x d).
model= GDA()
model.fit(X_train,y_train)
model.sigma

array([[[ 0.86658507, -0.36399935, -0.0563417 ],
        [-0.36399935,  1.73369787,  0.10004051],
        [-0.0563417 ,  0.10004051,  0.03888383]],

       [[ 0.80170472,  0.36519938,  0.10254372],
        [ 0.36519938,  0.34185052, -0.06574212],
        [ 0.10254372, -0.06574212,  1.58288571]]])

In [132]:
# Per-class scores for the test samples: column j holds p(x | y=j) * p(y=j) as
# computed in GDA.predict_proba, so the rows are not normalised to sum to 1.
yproba= model.predict_proba(X_test)
yproba

array([[6.95153922e-005, 4.96819640e-003],
       [7.44155967e-002, 3.32280264e-003],
       [2.86835340e-022, 6.06898624e-002],
       [4.55898676e-002, 1.20629550e-003],
       [7.13260411e-032, 3.57794480e-002],
       [1.07067857e-001, 4.35974086e-006],
       [5.70637700e-013, 4.05417512e-002],
       [7.45963415e-034, 3.16852834e-002],
       [5.26583338e-026, 5.28834831e-002],
       [5.92080154e-002, 4.74215981e-005],
       [2.01030950e-005, 1.13260260e-003],
       [5.71904756e-002, 3.54252734e-012],
       [7.06995984e-003, 8.77731075e-003],
       [3.65473726e-034, 3.98251222e-002],
       [6.72229489e-002, 5.85636271e-005],
       [4.10027810e-002, 2.02154051e-011],
       [1.45231830e-025, 4.98449696e-002],
       [7.31272830e-002, 5.10955435e-009],
       [2.75751233e-023, 3.72284924e-002],
       [2.56990782e-004, 2.17547739e-002],
       [9.78986152e-021, 1.18246653e-002],
       [3.74039443e-048, 7.13301926e-003],
       [4.05023331e-002, 1.17483149e-005],
       [5.0

In [133]:
# Hard class predictions: for each test sample, the class with the largest score.
ypreds= model.predict(X_test)
ypreds


array([1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0])

In [134]:
# Percentage of test samples whose predicted label matches the true label.
model.accuracy(y_test, ypreds)

98.0