In [661]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


### Importing libraries

In [662]:
import numpy as np
import pandas as pd
import warnings

In [663]:
warnings.filterwarnings(action='ignore')

### Importing dataset
#### We are going to use sklearn and the iris dataset in this example.

In [664]:
from sklearn.datasets import load_iris

In [665]:
# Wrap the feature matrix of sklearn's bundled iris data in a DataFrame.
iris = pd.DataFrame(data=load_iris().data)

In [666]:
# Load the dataset once and reuse it, instead of calling load_iris() again for every attribute.
iris_bunch = load_iris()
# Name the feature columns after the measurements they hold.
iris.columns = iris_bunch.feature_names
# Append the integer-encoded target as the label column.
iris['Class'] = iris_bunch.target

In [667]:
# Preview the first five rows of the sklearn-sourced frame.
iris.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


#### Alternate way to load the dataset

In [668]:
# Same data, fetched as raw CSV from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are supplied here, so the first line of the file is read as data rather than
# as a header row.
names = [
    'Sepal_length',
    'Sepal_width',
    'Petal_length',
    'Petal_width',
    'Class',
]
dataset = pd.read_csv(url, header=None, names=names)

In [669]:
# Preview the first five rows of the CSV-sourced frame.
dataset.head(n=5)

Unnamed: 0,Sepal_length,Sepal_width,Petal_length,Petal_width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Applying PCA

In [670]:
# Feature matrix: every column except the label.
# `axis` is passed by keyword because pandas 2.0 removed the positional form; there
# dataset.drop('Class', 1) raises TypeError.
X = dataset.drop('Class', axis=1)
# Target vector: the class label of each row.
y = dataset['Class']

In [671]:
# Splitting the dataset into training and testing sets

In [672]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [673]:
# PCA assumes that the data is centered around the mean and is sensitive to the scale of each feature.
# That is why we standardize the features (fitting the scaler on the training set only) before running PCA.

In [674]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted transform to the
# test split so no test-set statistics leak into preprocessing.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [675]:
from sklearn.decomposition import PCA

# Fit a full PCA (all components kept) purely to inspect how the variance is distributed.
# X_train / X_test are deliberately left untouched: overwriting them with the PCA scores
# makes the later PCA(n_components=1) cell fit on already-projected data instead of on the
# standardized features.
pca = PCA().fit(X_train)

In [676]:
# Fraction of the total variance captured by each principal component.
pca.explained_variance_ratio_  

array([0.72226528, 0.23974795, 0.03338117, 0.0046056 ])

In [677]:
# pca.explained_variance_ratio_ tells us the fraction of variance preserved by each principal component.
# In this case PC1 and PC2 together preserve about 96% of the variance (0.72 + 0.24).

### Training Algorithm with different combinations of principal components

In [678]:
# We will use different numbers of principal components (PCs) and train a classification algorithm to see their effect.

#### Training with 1 PC

In [679]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Rebuild the standardized split from X / y so this section does not depend on whatever an
# earlier cell left in X_train / X_test (the same pattern the 2 PC section uses). This keeps
# the PCA below fitted on the standardized features and makes the cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Keep only the leading principal component; the projection is learned on the training split
# and then applied unchanged to the test split.
pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [680]:
from sklearn.ensemble import RandomForestClassifier

In [681]:
# from xgboost import XGBClassifier

In [682]:
# n_estimators is pinned to 10, the value the recorded run used (see the estimator repr in the
# fit output). scikit-learn 0.22 raised the default to 100, so relying on the default makes the
# recorded metrics irreproducible on newer versions.
model = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=42)

In [683]:
# Train on the single-component projection of the training split.
model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [684]:
# Predict class labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [685]:
from sklearn.metrics import classification_report

In [686]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      0.77      0.87        13
 Iris-virginica       0.67      1.00      0.80         6

      micro avg       0.90      0.90      0.90        30
      macro avg       0.89      0.92      0.89        30
   weighted avg       0.93      0.90      0.90        30



In [687]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Overall fraction of test rows classified correctly.
print('Accuracy {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))

[[11  0  0]
 [ 0 10  3]
 [ 0  0  6]]
Accuracy 0.9


#### Training with 2 PC

In [688]:
from sklearn.model_selection import train_test_split

# Start again from the untransformed features: same 80/20 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [689]:
from sklearn.preprocessing import StandardScaler

# Standardize with a scaler fitted on the training split and reused for the test split.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [690]:
from sklearn.decomposition import PCA

# Keep the two leading principal components; the projection is learned from the training
# split (left operand, evaluated first) and then applied unchanged to the test split.
pca = PCA(n_components=2)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [691]:
from sklearn.ensemble import RandomForestClassifier

In [692]:
# n_estimators is pinned to 10, the value the recorded run used (see the estimator repr in the
# fit output). scikit-learn 0.22 raised the default to 100, so relying on the default makes the
# recorded metrics irreproducible on newer versions.
model = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=42)

In [693]:
# Train on the two-component projection of the training split.
model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [694]:
# Predict class labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [695]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      0.77      0.87        13
 Iris-virginica       0.67      1.00      0.80         6

      micro avg       0.90      0.90      0.90        30
      macro avg       0.89      0.92      0.89        30
   weighted avg       0.93      0.90      0.90        30



In [696]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Overall fraction of test rows classified correctly.
print('Accuracy {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))

[[11  0  0]
 [ 0 10  3]
 [ 0  0  6]]
Accuracy 0.9


### Training on original dataset

In [697]:
from sklearn.model_selection import train_test_split

# Baseline run: same 80/20 split and seed, so the scores are comparable with the PCA runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [698]:
from sklearn.preprocessing import StandardScaler

# Standardize all four original features; no dimensionality reduction in this section.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [699]:
from sklearn.ensemble import RandomForestClassifier

In [700]:
# n_estimators is pinned to 10, the value the recorded run used (see the estimator repr in the
# fit output). scikit-learn 0.22 raised the default to 100, so relying on the default makes the
# recorded metrics irreproducible on newer versions.
model = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=42)

In [701]:
# Train on the full standardized feature set.
model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [702]:
# Predict class labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [703]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       0.93      1.00      0.96        13
 Iris-virginica       1.00      0.83      0.91         6

      micro avg       0.97      0.97      0.97        30
      macro avg       0.98      0.94      0.96        30
   weighted avg       0.97      0.97      0.97        30



In [704]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Overall fraction of test rows classified correctly.
print('Accuracy {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))

[[11  0  0]
 [ 0 13  0]
 [ 0  1  5]]
Accuracy 0.9666666666666667


In [705]:
# We are able to achieve 90% accuracy by using only one PC, which is not bad compared to the ~96.7% accuracy obtained from the whole
# dataset