<a href="https://colab.research.google.com/github/Romulan12/ML-algorithms/blob/master/Linear_Discriminant_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import math
import numpy as np
import sys

def read_training_file(fname):
    """This function reads the training file line by line and creats the required lists, that is,
the list of features for each class"""
    temp_listx1 = []
    temp_listx2 = []
    temp_listx3 = []
    total_len = c1 = c2 = c3 = 0    
    with open(fname) as fl:
        for line in fl:
          try:
            total_len+=1
            seg = line.strip().split(',')            
            if (seg[4] == 'Iris-setosa'):
                c1+=1
                temp_listx1.append([float(seg[0]),float(seg[1]),float(seg[2]),float(seg[3])])
            elif (seg[4] == 'Iris-versicolor'):
                c2+=1
                temp_listx2.append([float(seg[0]),float(seg[1]),float(seg[2]),float(seg[3])])
            else:
                c3+=1
                temp_listx3.append([float(seg[0]),float(seg[1]),float(seg[2]),float(seg[3])])
          except IndexError:
            pass
        return (temp_listx1,temp_listx2,temp_listx3,c1,c2,c3,total_len)

def read_testing_file(fname):
    """To read the validation file and return the set of lines in the file"""
    return [line.strip().split(',') for line in open(fname).readlines()]
     
def lda(l,x1,x2,x3,c1,c2,c3,total_len):
    """This function implements the lda"""
    t = [0,0,0,0]
    x = [x1,x2,x3]
    cm = 0    
    class_mean = [[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]]
    """Find the mean of each feature in the feature vector for each class"""
    for a in x:
        summ = [0.0,0.0,0.0,0.0]
        avg = [0.0,0.0,0.0,0.0]
        c = 0
        for j in a:
            c += 1
            for k in range(4):
                summ[k] = summ[k] + j[k]
        for i in range(4):
            avg[i] = summ[i]/float(c)
        class_mean[cm] = avg
        cm += 1
    mean1 = np.matrix(class_mean[0])
    mean2 = np.matrix(class_mean[1])
    mean3 = np.matrix(class_mean[2])
    
    
    """Mean of all the features using the mean of the features of each class"""
    for i in range(4):
        for j in range(3):
            t[i] += (class_mean[j][i])
        t[i] /= 3.0
    mean = np.matrix(t)

    """Computing difference of mean and each feature vector for class 1"""
    c1_diff = []
    for i in x1:
        t = [0,0,0,0]
        for j in range(4):
            t[j] = i[j] - mean1.item(j)
        c1_diff.append(t)
    c1_diff_mat = np.matrix(c1_diff)
    
    """Computing difference of mean and each feature vector for class 1"""
    c2_diff = []
    for i in x2:
        t = [0,0,0,0]
        for j in range(4):
            t[j] = i[j] - mean2.item(j)
        c2_diff.append(t)
    c2_diff_mat = np.matrix(c2_diff)
    
    """Computing difference of mean and each feature vector for class 1"""
    c3_diff = []
    for i in x3:
        t = [0,0,0,0]
        for j in range(4):
            t[j] = i[j] - mean3.item(j)
        c3_diff.append(t)
    c3_diff_mat = np.matrix(c3_diff)
    
    """Computing the individual covariance matrix"""
    cc1 = c1_diff_mat.T * c1_diff_mat * (1.0/c1)
    cc2 = c2_diff_mat.T * c2_diff_mat * (1.0/c2)
    cc3 = c3_diff_mat.T * c3_diff_mat * (1.0/c3)

    """Computing the final covariance matrix"""
    c = 0
    t = []
    cov = []
    for i in range(0,16):
        c += 1
        t.append(((float(c1)/total_len)*cc1.item(i)+(float(c2)/total_len)*cc2.item(i)+(float(c3)/total_len)*cc3.item(i)))
        if c == 4:
            cov.append(t)
            t = []
            c = 0
    cov_mat = np.matrix(cov)
    
    """Inverse of the covariance matrix"""
    cov_mat_inv = cov_mat.I
    
    """Create a matrix with the values of the feature vector to be classified"""
    ip = []
    ip.append([float(l[0]),float(l[1]),float(l[2]),float(l[3])])
    ip_mat = np.matrix(ip)

    """Now we use the discriminant function to calculate the probability of P(x|y)"""
    
    a1 = (ip_mat - mean1) * cov_mat_inv * ((ip_mat.T - mean1.T))
    a2 = (ip_mat - mean2) * cov_mat_inv * ((ip_mat.T - mean2.T))
    a3 = (ip_mat - mean3) * cov_mat_inv * ((ip_mat.T - mean3.T))
    
    cons1 = 1.0 / (math.pow(2*(22.0/7.0),c1/2) * math.pow(np.linalg.det(cov_mat),0.5))
    cons2 = 1.0 / (math.pow(2*(22.0/7.0),c2/2) * math.pow(np.linalg.det(cov_mat),0.5))
    cons3 = 1.0 / (math.pow(2*(22.0/7.0),c3/2) * math.pow(np.linalg.det(cov_mat),0.5))

    prob1 = (float(c1)/total_len) * cons1 * math.exp(-0.5 * a1.item(0))
    prob2 = (float(c2)/total_len) * cons2 * math.exp(-0.5 * a2.item(0))
    prob3 = (float(c3)/total_len) * cons3 * math.exp(-0.5 * a3.item(0))    
    
    print (str(math.log(prob1,2)) + "  " + str(math.log(prob2,2)) + "  " + str(math.log(prob3,2)))
    
    if prob1 > prob2 and prob1 > prob3:
        return 'Iris-setosa'
    if prob2 > prob1 and prob2 > prob3:
        return 'Iris-versicolor'
    else:
        return 'Iris-virginica'
    
def main():
        correct = 0
        training_file = '/content/iris.csv'
        (listx1,listx2,listx3,c1,c2,c3,total_len) = read_training_file(training_file)
        lines = read_testing_file(training_file)
        print ("The log to the base 2 probabilities of class 1, 2 and 3")
        for l in lines:
          if lda(l,listx1,listx2,listx3,c1,c2,c3,total_len) == l[4]:
            correct += 1

        print ("Classified %d correctly out of %d for a accuracy of %f" % (correct, len(lines), float(correct)/len(lines)))

if __name__ == '__main__':
	main()

The log to the base 2 probabilities of class 1, 2 and 3
-60.87040706699516  -133.7544278350542  -202.4897779922196
-62.07737237560326  -120.44671947027635  -185.65230552863667
-61.0298236064433  -125.16841074549481  -191.5712237038119
-62.13254700240819  -116.3560757306942  -179.54981677032922
-61.12086525159814  -135.30789566780913  -204.08702651420782
-62.127062960944734  -131.57153577049905  -196.11682020593602
-61.64127813833224  -122.78469258908787  -186.46485521420058
-60.72529714723885  -126.85748025260686  -193.49342401258048
-63.407855070638895  -113.61737071776622  -175.94129552748853
-61.63844218952955  -122.87115668727007  -189.6573642900222
-61.85502495176622  -139.8958624342162  -209.97400891452844
-61.687130135444065  -122.37048522627298  -186.9508032782881
-61.89553614670334  -122.70285447790177  -189.6089867264427
-62.345905845033904  -126.92886825429422  -194.79909660113537
-68.5408867601354  -167.98004086389076  -244.82938662901245
-68.10937027068853  -159.5377618830