Name: Ethan Paek

Date: 4/22/2020

Topic: COEN 140 Lab 4

Description: Use Linear Discriminant Analysis (LDA) and Quadratic Discriminant Analysis (QDA) over the provided dataset. The dataset can be downloaded at http://www.cse.scu.edu/~yfang/coen140/iris.data

In [1]:
import numpy as np
import pandas as pd
import math
import pprint
import time

### Step 1: Import data and break it up into training and testing subsets

In [2]:
# Load the iris dataset directly from the course website.
# The file has no header row, so pandas labels the columns with the
# integers 0-4 (the code below relies on column 4 holding the class label).
data = pd.read_csv(
    filepath_or_buffer='http://www.cse.scu.edu/~yfang/coen140/iris.data',
    header=None,
    sep=',',
)
print(data)
# Sanity check: show the type of the first measurement.
# A single .loc lookup replaces the chained data[0][0] indexing.
print(type(data.loc[0, 0]))

       0    1    2    3               4
0    5.1  3.5  1.4  0.2     Iris-setosa
1    4.9  3.0  1.4  0.2     Iris-setosa
2    4.7  3.2  1.3  0.2     Iris-setosa
3    4.6  3.1  1.5  0.2     Iris-setosa
4    5.0  3.6  1.4  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.7  3.0  5.2  2.3  Iris-virginica
146  6.3  2.5  5.0  1.9  Iris-virginica
147  6.5  3.0  5.2  2.0  Iris-virginica
148  6.2  3.4  5.4  2.3  Iris-virginica
149  5.9  3.0  5.1  1.8  Iris-virginica

[150 rows x 5 columns]
<class 'numpy.float64'>


In [3]:
# Split the dataset 80% / 20% into training and testing subsets:
# rows 0-39, 50-89 and 100-139 go to training, the remaining rows
# (40-49, 90-99, 140-149) go to testing.
# NOTE(review): this assumes the file is ordered by class in blocks of 50
# rows, which is what the printed table suggests.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the original row index is preserved exactly as before.
train = pd.concat([data[0:40], data[50:90], data[100:140]])
test = pd.concat([data[40:50], data[90:100], data[140:150]])
print(train.shape)
print(test.shape)

(120, 5)
(30, 5)


In [4]:
# Build one training table per flower class by filtering on the label
# stored in column 4, then show the last rows of each table.
species = train[4]
setosa = train[species == 'Iris-setosa']
versicolor = train[species == 'Iris-versicolor']
virginica = train[species == 'Iris-virginica']

print(setosa.tail(),'\n')
print(versicolor.tail(),'\n')
print(virginica.tail())

      0    1    2    3            4
35  5.0  3.2  1.2  0.2  Iris-setosa
36  5.5  3.5  1.3  0.2  Iris-setosa
37  4.9  3.1  1.5  0.1  Iris-setosa
38  4.4  3.0  1.3  0.2  Iris-setosa
39  5.1  3.4  1.5  0.2  Iris-setosa 

      0    1    2    3                4
85  6.0  3.4  4.5  1.6  Iris-versicolor
86  6.7  3.1  4.7  1.5  Iris-versicolor
87  6.3  2.3  4.4  1.3  Iris-versicolor
88  5.6  3.0  4.1  1.3  Iris-versicolor
89  5.5  2.5  4.0  1.3  Iris-versicolor 

       0    1    2    3               4
135  7.7  3.0  6.1  2.3  Iris-virginica
136  6.3  3.4  5.6  2.4  Iris-virginica
137  6.4  3.1  5.5  1.8  Iris-virginica
138  6.0  3.0  4.8  1.8  Iris-virginica
139  6.9  3.1  5.4  2.1  Iris-virginica


In [5]:
# Remove the label column (column 4) so that each training table holds
# only the numerical measurements.
label_column = 4
train_setosa = setosa.drop(columns=label_column)
train_versicolor = versicolor.drop(columns=label_column)
train_virginica = virginica.drop(columns=label_column)

# Preview the first rows of every table, in the same order as before.
for title, table in (
    ('setosa:', train_setosa),
    ('versicolor:', train_versicolor),
    ('virginica:', train_virginica),
):
    print(title)
    print(table.head(),'\n')

setosa:
     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2 

versicolor:
      0    1    2    3
50  7.0  3.2  4.7  1.4
51  6.4  3.2  4.5  1.5
52  6.9  3.1  4.9  1.5
53  5.5  2.3  4.0  1.3
54  6.5  2.8  4.6  1.5 

virginica:
       0    1    2    3
100  6.3  3.3  6.0  2.5
101  5.8  2.7  5.1  1.9
102  7.1  3.0  5.9  2.1
103  6.3  2.9  5.6  1.8
104  6.5  3.0  5.8  2.2 



In [23]:
# define the probability density function (as noted from Dr. Fang's lectures)
def P(x,mean,covariance):
    """Evaluate the multivariate Gaussian density at a single point.

    Parameters:
        x: 1-D sequence/array of length k, the point to evaluate.
        mean: 1-D array of length k, the mean of the distribution.
        covariance: k x k covariance matrix (must be invertible).

    Returns:
        The density value as a Python float.
    """
    # Force a float dtype: rows pulled out of a mixed-type DataFrame arrive
    # as object arrays, which should not be fed into the linear algebra.
    x = np.asarray(x, dtype=float)
    mean = np.asarray(mean, dtype=float)
    k = x.shape[0]
    deviation = x - mean

    # Normalisation constant 1 / sqrt((2*pi)^k * |covariance|).
    first = 1/math.sqrt(((2.0*math.pi)**k) * np.linalg.det(covariance))

    # deviation^T * covariance^-1 * deviation. Both operands of the outer
    # dot product are 1-D, so the result is a true scalar rather than a
    # 1-element array (math.exp on an array relies on a deprecated
    # array-to-scalar conversion).
    quadratic_form = float(np.dot(np.dot(deviation, np.linalg.inv(covariance)), deviation))
    second = math.exp(-0.5*quadratic_form)
    return first*second

In [24]:
# in order to use the function from above, we have to calculate the mean for each of the flowers' attributes
def my_means(matrix):
    """Return the mean of every column of a DataFrame.

    Parameters:
        matrix: DataFrame whose columns are the attributes to average.

    Returns:
        1-D numpy array with one mean per column, in column order.
    """
    n_rows = float(matrix.shape[0])
    # Transposing the value array lets us walk over it column by column.
    columns = matrix.values.T
    return np.array([column.sum()/n_rows for column in columns])

In [25]:
# we also need to calculate the covariances
def my_covs(matrix):
    """Return the covariance matrix of the columns of a DataFrame.

    The sum of outer products of the mean-centred rows is divided by the
    number of rows (N, not N-1).

    Parameters:
        matrix: DataFrame with one sample per row and one attribute per column.

    Returns:
        Square numpy array with one row/column per attribute.
    """
    centre = my_means(matrix)
    n_rows, n_features = matrix.shape
    scatter = np.zeros((n_features, n_features))
    # Accumulate the outer product of each sample's deviation from the mean.
    for sample in matrix.values:
        deviation = sample - centre
        scatter += np.outer(deviation, deviation)
    return scatter/float(n_rows)

In [26]:
# Training tables keyed by the short class name that LDA/QDA look up.
class_tables = {
    'setosa': train_setosa,
    'versicolor': train_versicolor,
    'virginica': train_virginica,
}

# mean vector of every class
means = {name: my_means(table) for name, table in class_tables.items()}

# covariance matrix of every class
covs = {name: my_covs(table) for name, table in class_tables.items()}

# LDA assumes all classes share one covariance matrix, so use the
# average of the three per-class matrices for it
cov_avg = (covs['setosa'] + covs['versicolor'] + covs['virginica'])/3.0

# pprint gives a readable layout for the dictionaries
print("Means:")
pprint.pprint(means)
print("\nCovariances:")
pprint.pprint(covs)

Means:
{'setosa': array([5.0375, 3.44  , 1.4625, 0.2325]),
 'versicolor': array([6.01  , 2.78  , 4.3175, 1.35  ]),
 'virginica': array([6.6225, 2.96  , 5.6075, 1.99  ])}

Covariances:
{'setosa': array([[0.12784375, 0.0965    , 0.01265625, 0.01328125],
       [0.0965    , 0.1294    , 0.002     , 0.0142    ],
       [0.01265625, 0.002     , 0.02884375, 0.00446875],
       [0.01328125, 0.0142    , 0.00446875, 0.00969375]]),
 'versicolor': array([[0.2669    , 0.08445   , 0.167825  , 0.051     ],
       [0.08445   , 0.1081    , 0.07885   , 0.04425   ],
       [0.167825  , 0.07885   , 0.19844375, 0.071875  ],
       [0.051     , 0.04425   , 0.071875  , 0.042     ]]),
 'virginica': array([[0.45624375, 0.10765   , 0.34883125, 0.049975  ],
       [0.10765   , 0.1104    , 0.07905   , 0.0451    ],
       [0.34883125, 0.07905   , 0.33669375, 0.057825  ],
       [0.049975  , 0.0451    , 0.057825  , 0.0724    ]])}


### Step 2: Build an LDA classifier based on the training data. Report the training and test errors for your classifier

In [27]:
def LDA(x, mean, avg_cov):
    """Classify one sample with Linear Discriminant Analysis.

    Every class density is evaluated with the same covariance matrix and
    the label of the class with the highest density is returned.

    Parameters:
        x: feature vector of the sample.
        mean: dict with the mean vectors under the keys 'setosa',
            'versicolor' and 'virginica'.
        avg_cov: covariance matrix shared by all classes.

    Returns:
        The predicted label: 'Iris-setosa', 'Iris-versicolor' or
        'Iris-virginica'.
    """
    # (label as it appears in the dataset, key used in the mean dictionary)
    classes = (
        ('Iris-setosa', 'setosa'),
        ('Iris-versicolor', 'versicolor'),
        ('Iris-virginica', 'virginica'),
    )
    density = {label: P(x, mean[key], avg_cov) for label, key in classes}

    # pick the label whose density value is largest (compare values, not keys)
    return max(density, key=density.get)

In [28]:
# Calculate the accuracy by comparing LDA/QDA prediction vs actual value
def calculate_accuracy(classifier, subset):
    """Return the classification error rate on a subset, as a string.

    Despite its name, the function returns the error percentage
    (100 minus the accuracy percentage) converted with str().

    Parameters:
        classifier: 1 for LDA (shared covariance cov_avg), 2 for QDA
            (per-class covariances covs), 3 for QDA with independent
            features (diagonal covariances indep_covs).
        subset: DataFrame whose first four columns are the features and
            whose column labelled 4 is the true class label.

    Returns:
        The error rate in percent, as a string.

    Raises:
        ValueError: if classifier is not 1, 2 or 3, or if subset has no rows.
    """
    # Validate once, before any work is done, so a bad id is reported even
    # when the subset is empty.
    if classifier not in (1, 2, 3):
        raise ValueError("Classifier unknown. Please try again")
    if len(subset) == 0:
        raise ValueError("Cannot compute an error rate for an empty subset")

    n_correct = 0
    for _, record in subset.iterrows():
        # A row of a mixed-type frame is an object Series; cast the four
        # measurements to float before handing them to the classifier.
        x = np.asarray(record.iloc[0:4], dtype=float)
        actual = record[4]
        # The globals are looked up only in the branch that is taken, so
        # QDA/indep_covs need not exist while only LDA is being evaluated.
        if classifier == 1:
            predicted = LDA(x, means, cov_avg)
        elif classifier == 2:
            predicted = QDA(x, means, covs)
        else:
            predicted = QDA(x, means, indep_covs)
        if predicted == actual:
            n_correct += 1
    accuracy = (n_correct/float(len(subset)) * 100)
    error = 100 - accuracy
    return str(error)
# Report the LDA error rate on both subsets (classifier id 1 = LDA).
lda_train_error = calculate_accuracy(1, train)
print("Error rate for LDA on training subset: " + lda_train_error + "%")
lda_test_error = calculate_accuracy(1, test)
print("Error rate for LDA on testing subset: " + lda_test_error + "%")

Error rate for LDA on training subset: 2.5%
Error rate for LDA on testing subset: 0.0%


### Step 3: Build a QDA classifier based on the training data. Report the training and test errors for your classifier

In [29]:
def QDA(x, mean, covs):
    """Classify one sample with Quadratic Discriminant Analysis.

    Each class density is evaluated with that class's own covariance
    matrix and the label of the class with the highest density is returned.

    Parameters:
        x: feature vector of the sample.
        mean: dict with the mean vectors under the keys 'setosa',
            'versicolor' and 'virginica'.
        covs: dict with the covariance matrices under the same keys.

    Returns:
        The predicted label: 'Iris-setosa', 'Iris-versicolor' or
        'Iris-virginica'.
    """
    # (label as it appears in the dataset, key used in the dictionaries)
    classes = (
        ('Iris-setosa', 'setosa'),
        ('Iris-versicolor', 'versicolor'),
        ('Iris-virginica', 'virginica'),
    )
    density = {}
    for label, key in classes:
        density[label] = P(x, mean[key], covs[key])

    # pick the label whose density value is largest (compare values, not keys)
    return max(density, key=density.get)

In [30]:
# Report the QDA error rate on both subsets (classifier id 2 = QDA).
qda_train_error = calculate_accuracy(2, train)
print("Error rate for QDA on training subset: " + qda_train_error + "%")
qda_test_error = calculate_accuracy(2, test)
print("Error rate for QDA on testing subset: " + qda_test_error + "%")

Error rate for QDA on training subset: 1.6666666666666714%
Error rate for QDA on testing subset: 0.0%


### Step 4: Is there any class linearly separable from other classes? Explain your answer based on your experiments

In [31]:
# Evaluate on the whole dataset (training and testing rows together),
# one flower class at a time.
categories = ['Iris-setosa','Iris-versicolor','Iris-virginica']

for category in categories:
    # all samples whose true label is this category
    flower_class = data[data[4] == category]
    # a sample counts as a success when the LDA prediction equals its label
    n_correct = sum(
        LDA(np.array(sample[0:4]), means, cov_avg) == sample[4]
        for _, sample in flower_class.iterrows()
    )
    accuracy = (n_correct/float(len(flower_class)) * 100)
    error = 100 - accuracy
    print(category, "error rate:", error, "%")

Iris-setosa error rate: 0.0 %
Iris-versicolor error rate: 4.0 %
Iris-virginica error rate: 2.0 %


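From these results, 'Iris-setosa' appears to be linearly separable from the other two classes: LDA, which draws linear decision boundaries, classifies every setosa sample correctly, whereas it misclassifies some 'Iris-versicolor' and 'Iris-virginica' samples, so those two classes are not perfectly separated by it.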

### Step 5:  Assume the features are independent, i.e., ∑ is a diagonal matrix. Repeat Step 3, and report your results. Also, please report the training time of this method and the original QDA that you implemented in Step 3.

In [32]:
# Convert the covariance matrices to diagonal ones (off-diagonal entries
# set to 0), which corresponds to assuming the features are independent.
indep_covs = {}
for category, cov in covs.items():
    # The inner np.diag extracts the diagonal as a vector; the outer one
    # rebuilds a square matrix with that diagonal and zeros elsewhere.
    indep_covs[category] = np.diag(np.diag(cov))
    print(indep_covs[category])

[[0.12784375 0.         0.         0.        ]
 [0.         0.1294     0.         0.        ]
 [0.         0.         0.02884375 0.        ]
 [0.         0.         0.         0.00969375]]
[[0.2669     0.         0.         0.        ]
 [0.         0.1081     0.         0.        ]
 [0.         0.         0.19844375 0.        ]
 [0.         0.         0.         0.042     ]]
[[0.45624375 0.         0.         0.        ]
 [0.         0.1104     0.         0.        ]
 [0.         0.         0.33669375 0.        ]
 [0.         0.         0.         0.0724    ]]


In [33]:
# Time the original QDA (as in Step 3) and report its error rates.
# NOTE: the measured interval covers classifying the training and testing
# subsets (including the print calls), not the estimation of the means and
# covariances. time.perf_counter() is used because it is a monotonic,
# high-resolution clock meant for measuring intervals; values are seconds.
qda_start_time = time.perf_counter()
print("Error rate for QDA on training subset: " + calculate_accuracy(2, train) + "%")
print("Error rate for QDA on testing subset: " + calculate_accuracy(2, test) + "%")
# This line reports the ORIGINAL QDA; it was previously mislabelled as
# "QDA with independent features".
print("Time taken for QDA:", (time.perf_counter()-qda_start_time)*1000, "ms\n")

# Same measurement for QDA with independent features (diagonal covariances).
indep_start_time = time.perf_counter()
print("Error rate for QDA on training subset with independent features : " + calculate_accuracy(3, train) + "%")
print("Error rate for QDA on testing subset with independent features : " + calculate_accuracy(3, test) + "%")
print("Time taken for QDA with independent features:", (time.perf_counter()-indep_start_time)*1000, "ms")

Error rate for QDA on training subset: 1.6666666666666714%
Error rate for QDA on testing subset: 0.0%
Time taken for QDA with independent features: 61.92822265625 ms

Error rate for QDA on training subset with independent features : 4.166666666666657%
Error rate for QDA on testing subset with independent features : 0.0%
Time taken for QDA with independent features: 56.9677734375 ms


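From these results, assuming the features are independent makes the computation slightly faster (about 57 ms versus about 62 ms for the original QDA in this run), at the cost of a higher training error (about 4.17% versus about 1.67%); the testing error is 0.0% in both cases.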