In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Read the iris measurements from the CSV in the current working directory
dataset = pd.read_csv('./iris.csv')
# Preview the first five rows as a quick check of the columns and values
dataset.head(5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Overview of Dataset Characteristics
# describe() summarises each numeric column (count, mean, std, min, quartiles, max);
# the non-numeric `species` column does not appear in the summary.
dataset.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
# Dependencies for splitting, resampling, scaling, modelling and evaluation
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Feature matrix (every column except the label) and the label vector
x = dataset.drop(columns='species').values
Y = dataset['species'].values

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits
x_train, x_test, y_train, y_test = train_test_split(
    x, Y, test_size=0.2, stratify=Y, random_state=100
)

# Oversample the training split only, so the test split is left untouched
smt = SMOTE(random_state=100)
x_train_smt, y_train_smt = smt.fit_resample(x_train, y_train)

# Standardise the features: the scaler learns its statistics from the
# resampled training data and is then reused for the test split
sc = StandardScaler()
X_train = sc.fit_transform(x_train_smt)
X_test = sc.transform(x_test)

# The whole dataset pushed through the same fitted scaler
x_2 = sc.transform(x)

In [6]:
# Determine Number of Components for DR
from sklearn.decomposition import PCA


# Fit an unrestricted PCA (all components kept) to see how much variance each one explains.
# It is fitted on the scaled *training* split only: using the full dataset here would let
# the held-out test rows influence how many components the model is given, and would not
# match the data the final PCA is fitted on.
pca_none = PCA(n_components=None, random_state=100)
# fit() hands back the fitted estimator itself, so X_pca refers to the same object as
# pca_none rather than to projected data. PCA is unsupervised, so no labels are passed.
X_pca = pca_none.fit(X_train)
pca_var_ratios = pca_none.explained_variance_ratio_

# Create a function
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count how many leading components are needed to reach a variance target.

    Walks through ``var_ratio`` in order, keeping a running total, and stops
    at the first component where the total is at least ``goal_var``.

    Parameters
    ----------
    var_ratio : iterable of float
        Explained-variance ratio of each component, in the order they should
        be accumulated.
    goal_var : float
        Cumulative explained variance to reach.

    Returns
    -------
    int
        Number of components consumed when the target was first reached.
        If the target is never reached, this is the total number of
        components; for an empty ``var_ratio`` it is 0.
    """
    cumulative = 0.0
    # Stays 0 when var_ratio is empty and the loop body never runs
    count = 0

    for count, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        if cumulative >= goal_var:
            # Target reached: `count` components were enough
            return count

    # Target never reached: every component was consumed
    return count

# Number of leading components needed for the cumulative explained variance to reach 95%
n_comppca = select_n_components(pca_var_ratios, goal_var=0.95)

In [7]:
# Reduce the data to the selected number of principal components

# Fit the projection on the scaled training data, then apply the same
# projection to the scaled test data
pca = PCA(n_components=n_comppca, random_state=100)
X_trainpca = pca.fit_transform(X_train, y_train_smt)
X_testpca = pca.transform(X_test)

# Share of variance captured by each retained component, rounded to 2 decimals
explained_ratios = pca.explained_variance_ratio_.round(2)
print('Explained Variance-PCA:', explained_ratios)

Explained Variance-PCA: [0.73 0.23]


In [8]:
# Random Forest trained on the PCA-projected training data and used to
# predict the PCA-projected test data
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=100,
)
classifier.fit(X_trainpca, y_train_smt)
y_pred = classifier.predict(X_testpca)

# Confusion matrix and per-class precision / recall / F1 on the test split
target_names = ['setosa', 'versicolor', 'virginica']
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))
print('\n Number of Components Used:', n_comppca)

[[10  0  0]
 [ 0 10  0]
 [ 0  3  7]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.77      1.00      0.87        10
   virginica       1.00      0.70      0.82        10

    accuracy                           0.90        30
   macro avg       0.92      0.90      0.90        30
weighted avg       0.92      0.90      0.90        30


 Number of Components Used: 2


In [9]:
# Feature Distribution - PCA (using the selected number of components)
#
# Each column of the table holds the weights of the linear combination of the
# original features that produces the corresponding principal component.
index = dataset.drop(columns='species').columns
loadings = pca.components_.T.round(2)

# One row per original feature, one column per component
FD = pd.DataFrame(loadings, index=index)
# Number the components from 1 instead of 0
FD.columns = FD.columns + 1
# Display with PC_1, PC_2, ... as column headers
FD.add_prefix('PC_')

Unnamed: 0,PC_1,PC_2
SepalLengthCm,0.52,0.37
SepalWidthCm,-0.27,0.93
PetalLengthCm,0.58,0.03
PetalWidthCm,0.56,0.07


### 2D PCA Scatter Plot

In [13]:
import plotly.express as px
from sklearn.decomposition import PCA

# Plotly's bundled copy of the iris data and its four measurement columns
df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

# Project the raw measurements onto the first two principal components.
# NOTE: this rebinds `pca`; any PCA model stored under that name by an earlier
# cell is replaced, so re-run that cell before reusing it.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Percentage of the total variance kept by the 2-D projection (shown in the title)
total_var = pca.explained_variance_ratio_.sum() * 100

# `components` is a plain array, so its columns are addressed by position (0, 1);
# `labels` maps those positional names to readable axis titles
fig = px.scatter(
    components, x=0, y=1, color=df['species'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2'}
)
fig.show()

### 3D PCA Scatter Plot

In [14]:
import plotly.express as px
from sklearn.decomposition import PCA

# Reload plotly's iris sample and pick out the four measurement columns
df = px.data.iris()
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_cols]

# Three-component projection of the raw measurements
# (this rebinds `pca` and `components` for this cell)
pca = PCA(n_components=3)
components = pca.fit_transform(X)

# Percentage of the total variance retained by the three components
total_var = 100 * pca.explained_variance_ratio_.sum()

# Columns of the array are addressed by position; `labels` gives them axis titles
fig = px.scatter_3d(
    components,
    x=0,
    y=1,
    z=2,
    color=df['species'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
)
fig.show()