# PCA and LDA techniques on the Iris Dataset

In [7]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.7.1-cp311-cp311-win_amd64.whl (7.6 MB)
     ---------------------------------------- 0.0/7.6 MB ? eta -:--:--
     - -------------------------------------- 0.2/7.6 MB 7.6 MB/s eta 0:00:01
     ------ --------------------------------- 1.1/7.6 MB 14.6 MB/s eta 0:00:01
     ------------ --------------------------- 2.3/7.6 MB 21.2 MB/s eta 0:00:01
     ------------------------- -------------- 4.9/7.6 MB 28.2 MB/s eta 0:00:01
     --------------------------------- ------ 6.4/7.6 MB 31.4 MB/s eta 0:00:01
     ------------------------------------- -- 7.1/7.6 MB 26.5 MB/s eta 0:00:01
     ---------------------------------------  7.6/7.6 MB 25.6 MB/s eta 0:00:01
     ---------------------------------------- 7.6/7.6 MB 22.1 MB/s eta 0:00:00
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.7-cp311-cp311-win_amd64.whl (162 kB)
     ---------------------------------------- 0.0/163.0 kB ? eta -:--:--
     -------------------------------------- 

In [10]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
     ---------------------------------------- 0.0/293.3 kB ? eta -:--:--
     ------------------------------ ------- 235.5/293.3 kB 7.3 MB/s eta 0:00:01
     -------------------------------------- 293.3/293.3 kB 4.5 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install plotly

Collecting plotly
  Downloading plotly-5.14.0-py2.py3-none-any.whl (15.3 MB)
     ---------------------------------------- 0.0/15.3 MB ? eta -:--:--
     - -------------------------------------- 0.4/15.3 MB 13.4 MB/s eta 0:00:02
     ---- ----------------------------------- 1.7/15.3 MB 17.9 MB/s eta 0:00:01
     ---------- ----------------------------- 3.9/15.3 MB 27.9 MB/s eta 0:00:01
     ----------------- ---------------------- 6.8/15.3 MB 35.9 MB/s eta 0:00:01
     ------------------------ --------------- 9.2/15.3 MB 42.1 MB/s eta 0:00:01
     ------------------------------ -------- 12.0/15.3 MB 59.5 MB/s eta 0:00:01
     ------------------------------------- - 15.0/15.3 MB 59.5 MB/s eta 0:00:01
     --------------------------------------  15.3/15.3 MB 59.5 MB/s eta 0:00:01
     --------------------------------------- 15.3/15.3 MB 40.9 MB/s eta 0:00:00
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Su

In [13]:
#import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [14]:
#load IRIS dataset

# Fetch over HTTPS rather than plain HTTP so the download cannot be altered in transit.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
IRIS_COLUMNS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# header=None states explicitly that the first line of the file is data, not column names;
# the names are supplied through IRIS_COLUMNS instead.
iris = pd.read_csv(IRIS_URL, header=None, names=IRIS_COLUMNS)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In the iris dataset, there are 4 attributes and 1 label. The attributes are sepal length, sepal width, petal length, and petal width. The label is the species of the iris flower.

## PCA Analysis on the Iris Dataset

In [15]:
#1. Standardize the data: subtract each feature's mean and divide by its standard deviation,
#   so that all features are brought to the same scale.

def standardize_column(column):
    """Return ``column`` centered on its mean and divided by its standard deviation."""
    return (column - column.mean()) / column.std()

# Only the first four columns are standardized; the fifth column holds the species label.
std_iris = iris.iloc[:, 0:4].apply(standardize_column)
std_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.897674,1.028611,-1.336794,-1.308593
1,-1.1392,-0.12454,-1.336794,-1.308593
2,-1.380727,0.33672,-1.39347,-1.308593
3,-1.50149,0.10609,-1.280118,-1.308593
4,-1.018437,1.259242,-1.336794,-1.308593


In [16]:
#2. Calculate the covariance matrix, which holds the covariance between each pair of features.

# np.cov treats every row as one variable, so the (samples x features) frame is transposed first.
features_as_rows = std_iris.T
covariance_matrix = np.cov(features_as_rows)

In [17]:
#3. Calculate the eigenvalues and eigenvectors of the covariance matrix.

# Column i of eigen_vectors is the eigenvector that belongs to eigen_values[i].
decomposition = np.linalg.eig(covariance_matrix)
eigen_values, eigen_vectors = decomposition

In [18]:
#4. Pair every eigenvalue with its eigenvector and sort the pairs by eigenvalue, largest first.

eigen_pairs = [
    (np.abs(eigen_values[i]), eigen_vectors[:, i])
    for i in range(len(eigen_values))
]
# Sort on the eigenvalue only. Comparing whole (value, vector) tuples falls through to comparing
# the NumPy vectors whenever two eigenvalues are equal, and that comparison raises ValueError.
eigen_pairs.sort(reverse=True, key=lambda pair: pair[0])

In [19]:
#5. Keep only the k eigenpairs with the largest eigenvalues.

k = 2
# eigen_pairs is already sorted in descending order, so the first k entries are the top k.
eigen_pairs = eigen_pairs[:k]
eigen_pairs

[(2.9108180837520528,
  array([ 0.52237162, -0.26335492,  0.58125401,  0.56561105])),
 (0.9212209307072246,
  array([-0.37231836, -0.92555649, -0.02109478, -0.06541577]))]

In [20]:
#6. Create the projection matrix: one column per retained eigenvector.

first_component = eigen_pairs[0][1]
second_component = eigen_pairs[1][1]
projection_matrix = np.column_stack((first_component, second_component))
projection_matrix

array([[ 0.52237162, -0.37231836],
       [-0.26335492, -0.92555649],
       [ 0.58125401, -0.02109478],
       [ 0.56561105, -0.06541577]])

In [22]:
#7. Transform the data into the new subspace.

# (samples x 4) standardized features times the (4 x 2) projection matrix gives (samples x 2) scores.
transformed_data = std_iris @ projection_matrix
transformed_data.head()

Unnamed: 0,0,1
0,-2.256981,-0.504015
1,-2.079459,0.653216
2,-2.360044,0.317414
3,-2.296504,0.573447
4,-2.380802,-0.672514


In [23]:

#attach the species label to every projected sample and name the component columns
transformed_data = (
    transformed_data
    .assign(species=iris['species'])
    .set_axis(['PC1', 'PC2', 'species'], axis=1)
)
transformed_data.head()

Unnamed: 0,PC1,PC2,species
0,-2.256981,-0.504015,Iris-setosa
1,-2.079459,0.653216,Iris-setosa
2,-2.360044,0.317414,Iris-setosa
3,-2.296504,0.573447,Iris-setosa
4,-2.380802,-0.672514,Iris-setosa


In [24]:
#8. Plot the data in the new subspace, one color per species.

fig = px.scatter(
    transformed_data,
    x="PC1",
    y="PC2",
    color="species",
    title="PCA on IRIS dataset",
)
fig.show()


In [26]:
def PCA(iris,n_components):
    """Project the first four columns of ``iris`` onto their top principal components.

    The first four columns are standardized (mean subtracted, divided by the
    standard deviation), the covariance matrix of the standardized features is
    eigen-decomposed, and the data is projected onto the ``n_components``
    eigenvectors with the largest absolute eigenvalues.

    Parameters
    ----------
    iris : pandas.DataFrame
        Frame whose first four columns are numeric features and which has a
        ``species`` column.
    n_components : int
        Number of principal components to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1`` .. ``PC<n_components>`` followed by ``species``.
    """
    std_iris = iris.iloc[:,0:4].apply(lambda x: (x - x.mean()) / x.std())
    # np.cov expects one variable per row, hence the transpose.
    covariance_matrix = np.cov(std_iris.T)
    eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
    # Column i of eigen_vectors belongs to eigen_values[i].
    eigen_pairs = [
        (np.abs(eigen_values[i]), eigen_vectors[:, i])
        for i in range(len(eigen_values))
    ]
    # Sort on the eigenvalue only: comparing whole (value, vector) tuples falls
    # through to comparing the NumPy vectors when two eigenvalues are equal,
    # which raises ValueError.
    eigen_pairs.sort(reverse=True, key=lambda pair: pair[0])
    eigen_pairs = eigen_pairs[0:n_components]
    # One column per retained eigenvector; column_stack avoids hard-coding the
    # number of features in a reshape.
    projection_matrix = np.column_stack([eigen_pairs[i][1] for i in range(n_components)])
    transformed_data = std_iris.dot(projection_matrix)
    transformed_data['species'] = iris['species']
    transformed_data.columns = ['PC'+str(i+1) for i in range(n_components)] + ['species']
    return transformed_data

In [27]:
#for 2 components: 2-D scatter of the first two principal components
transformed_data = PCA(iris, 2)
fig = px.scatter(
    transformed_data,
    x="PC1",
    y="PC2",
    color="species",
    title="PCA on IRIS dataset",
)
fig.show()

In [28]:
#for 3 components: 3-D scatter of the first three principal components
transformed_data = PCA(iris, 3)
fig = px.scatter_3d(
    transformed_data,
    x="PC1",
    y="PC2",
    z="PC3",
    color="species",
    title="PCA on IRIS dataset",
)
fig.show()

In [29]:
#for 4 components: pairwise scatter matrix of all four principal components
transformed_data = PCA(iris, 4)
component_columns = ["PC1", "PC2", "PC3", "PC4"]
fig = px.scatter_matrix(
    transformed_data,
    dimensions=component_columns,
    color="species",
    title="PCA on IRIS dataset",
)
fig.show()

## LDA Analysis on the Iris Dataset

In [30]:
#split the frame into the four measurement columns (features) and the fifth column (labels)
features, labels = iris.iloc[:, 0:4], iris.iloc[:, 4]

In [31]:
#standardize every feature column: subtract its mean, then divide by its standard deviation
std_features = features.apply(
    lambda column: column.sub(column.mean()).div(column.std())
)
std_features.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.897674,1.028611,-1.336794,-1.308593
1,-1.1392,-0.12454,-1.336794,-1.308593
2,-1.380727,0.33672,-1.39347,-1.308593
3,-1.50149,0.10609,-1.280118,-1.308593
4,-1.018437,1.259242,-1.336794,-1.308593


In [32]:
# Per-feature mean over all samples; used below as the reference point for the
# between-class scatter.
mean_overall = std_features.mean(axis=0)
mean_overall

sepal_length   -5.684342e-16
sepal_width    -6.158037e-16
petal_length    1.894781e-16
petal_width    -1.894781e-16
dtype: float64

In [34]:
#calculate the within class scatter matrix (SW) and between class scatter matrix (SB)
# NOTE(review): np.cov divides by (class size - 1), so SW below is the sum of the class
# covariance matrices rather than the raw scatter. If all classes have the same size this
# only rescales the eigenvalues; with unequal class sizes it changes the directions - confirm.
SW = np.zeros((4, 4))
SB = np.zeros((4, 4))
for species_name in labels.unique():
    class_rows = std_features[labels == species_name]
    mean_class = class_rows.mean()
    # Offset of this class mean from the overall mean, as a plain 1-D array.
    mean_shift = (mean_class - mean_overall).values
    SW += np.cov(class_rows.T)
    SB += len(class_rows) * np.outer(mean_shift, mean_shift)

In [35]:
#calculate the eigenvalues and eigenvectors of inv(SW) @ SB
sw_inverse = np.linalg.inv(SW)
eigen_values, eigen_vectors = np.linalg.eig(sw_inverse @ SB)

#pair each |eigenvalue| with its eigenvector (column i of eigen_vectors), largest eigenvalue first
eigen_pairs = sorted(
    ((np.abs(eigen_values[i]), eigen_vectors[:, i]) for i in range(len(eigen_values))),
    key=lambda pair: pair[0],
    reverse=True,
)

In [36]:
#keep the k eigenpairs with the largest eigenvalues (eigen_pairs is already sorted descending)

k = 2
eigen_pairs = eigen_pairs[:k]
eigen_pairs

[(1581.325932186763,
  array([-0.1497757 +0.j, -0.14817298+0.j,  0.85112189+0.j,  0.48083628+0.j])),
 (13.600776328162258,
  array([ 0.0095293 +0.j,  0.32719336+0.j, -0.57482034+0.j,  0.74995684+0.j]))]

In [37]:
#projection matrix for k components: one column per retained eigenvector

selected_vectors = [eigen_pairs[i][1] for i in range(k)]
# The eigenvectors come back with a complex dtype; .real keeps only the real part.
projection_matrix = np.column_stack(selected_vectors).real
projection_matrix

array([[-0.1497757 ,  0.0095293 ],
       [-0.14817298,  0.32719336],
       [ 0.85112189, -0.57482034],
       [ 0.48083628,  0.74995684]])

In [38]:
#transform the data into the new subspace

# (samples x 4) standardized features times the (4 x k) projection matrix gives (samples x k) scores.
transformed_data = std_features @ projection_matrix
transformed_data.head()


Unnamed: 0,0,1
0,-1.784956,0.115029
1,-1.577915,-0.264576
2,-1.658325,-0.083378
3,-1.509588,-0.225146
4,-1.801042,0.189339


In [39]:
#attach the species label to every projected sample and name the discriminant columns
transformed_data = (
    transformed_data
    .assign(species=iris['species'])
    .set_axis(['LD1', 'LD2', 'species'], axis=1)
)
transformed_data.head()

Unnamed: 0,LD1,LD2,species
0,-1.784956,0.115029,Iris-setosa
1,-1.577915,-0.264576,Iris-setosa
2,-1.658325,-0.083378,Iris-setosa
3,-1.509588,-0.225146,Iris-setosa
4,-1.801042,0.189339,Iris-setosa


In [41]:
#plot the data in the new subspace, one color per species

fig = px.scatter(
    transformed_data,
    x="LD1",
    y="LD2",
    color="species",
    title="LDA on IRIS dataset",
)
fig.show()

In [44]:
def LDA(data,n_components=2):
    """Project the first four columns of ``data`` onto linear discriminant directions.

    The first four columns are standardized, the within-class scatter ``SW``
    and between-class scatter ``SB`` are accumulated per class (classes are the
    distinct values of the fifth column), and the data is projected onto the
    ``n_components`` eigenvectors of ``inv(SW) @ SB`` with the largest absolute
    eigenvalues.

    ``SB`` is a sum of one rank-one term per class, so at most
    (number of classes - 1) directions carry class information; any further
    components correspond to eigenvalues that are zero up to rounding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first four columns are numeric features and whose fifth
        column holds the class label.
    n_components : int, default 2
        Number of discriminant directions to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``LD1`` .. ``LD<n_components>`` followed by ``species``.
    """
    features = data.iloc[:,0:4]
    labels = data.iloc[:,4]
    std_features = features.apply(lambda x: (x - x.mean()) / x.std())
    mean_overall = std_features.mean()
    n_features = std_features.shape[1]
    SW = np.zeros((n_features, n_features))
    SB = np.zeros((n_features, n_features))
    for class_label in labels.unique():
        class_rows = std_features[labels == class_label]
        mean_class = class_rows.mean()
        # True within-class scatter: sum of outer products of the deviations
        # from the class mean. Summing np.cov per class instead would divide
        # each class by (size - 1), which mis-weights classes of unequal size
        # and yields NaN for a class with a single sample.
        centered = (class_rows - mean_class).values
        SW += centered.T.dot(centered)
        mean_shift = (mean_class - mean_overall).values
        SB += len(class_rows) * np.outer(mean_shift, mean_shift)
    eigen_values, eigen_vectors = np.linalg.eig(np.linalg.inv(SW).dot(SB))
    # Column i of eigen_vectors belongs to eigen_values[i].
    eigen_pairs = [
        (np.abs(eigen_values[i]), eigen_vectors[:, i])
        for i in range(len(eigen_values))
    ]
    eigen_pairs.sort(reverse=True, key=lambda x: x[0])
    eigen_pairs = eigen_pairs[0:n_components]
    # eig may return a complex dtype for this non-symmetric matrix; keep the real part.
    projection_matrix = np.column_stack([eigen_pairs[i][1] for i in range(n_components)]).real
    transformed_data = std_features.dot(projection_matrix)
    # Use the same label column that defined the classes above.
    transformed_data['species'] = labels
    transformed_data.columns = ['LD'+str(i+1) for i in range(n_components)] + ['species']
    return transformed_data

In [45]:
#for 2 components: 2-D scatter of the first two discriminant directions
transformed_data = LDA(iris, 2)
fig = px.scatter(
    transformed_data,
    x="LD1",
    y="LD2",
    color="species",
    title="LDA on IRIS dataset",
)
fig.show()

In [46]:
#for 3 components: 3-D scatter of the first three discriminant directions
# NOTE(review): LDA has at most (number of classes - 1) informative directions. If the
# dataset has three species, LD3 belongs to a near-zero eigenvalue and is numerical noise -
# confirm this plot is intended.
transformed_data = LDA(iris, 3)
fig = px.scatter_3d(
    transformed_data,
    x="LD1",
    y="LD2",
    z="LD3",
    color="species",
    title="LDA on IRIS dataset",
)
fig.show()

In [48]:
#for 4 components: pairwise scatter matrix of all four discriminant directions
# NOTE(review): LDA has at most (number of classes - 1) informative directions. If the
# dataset has three species, LD3 and LD4 belong to near-zero eigenvalues and are numerical
# noise - confirm this plot is intended.
transformed_data = LDA(iris, 4)
discriminant_columns = ["LD1", "LD2", "LD3", "LD4"]
fig = px.scatter_matrix(
    transformed_data,
    dimensions=discriminant_columns,
    color="species",
    title="LDA on IRIS dataset",
)
fig.show()