In [235]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


### Importing libraries

In [236]:
import numpy as np
import pandas as pd

### Importing dataset
#### We are going to use scikit-learn and the Iris dataset in this example.

In [237]:
from sklearn.datasets import load_iris

In [238]:
# Wrap the raw feature array returned by load_iris() in a DataFrame.
iris = pd.DataFrame(data=load_iris().data)

In [239]:
# Label the feature columns with the names shipped in the sklearn bunch,
# then append the numeric target as an extra "Class" column.
iris.columns = list(load_iris().feature_names)
iris["Class"] = load_iris().target

In [240]:
# Preview the first five rows of the sklearn-based frame.
iris.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


#### Alternate way to load the dataset

In [241]:
# Read the Iris CSV from the UCI repository. Column names are supplied
# explicitly through `names`, so no row of the file is used as a header.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = [
    'Sepal_length',
    'Sepal_width',
    'Petal_length',
    'Petal_width',
    'Class',
]
dataset = pd.read_csv(url, header=None, names=names)

In [242]:
# Preview the first five rows of the CSV-based frame.
dataset.head(n=5)

Unnamed: 0,Sepal_length,Sepal_width,Petal_length,Petal_width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Applying PCA

In [243]:
# Feature matrix: every column except the label. `columns=` replaces the
# positional axis argument (`drop('Class', 1)`), which pandas deprecated in
# 1.3 and rejects with a TypeError from 2.0 onwards.
X = dataset.drop(columns='Class')
# Target vector: the class label of each row.
y = dataset['Class']

In [244]:
# Splitting the dataset into training and test sets

In [245]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [249]:
# PCA assumes that the data is centered around the mean. That is why we standardize the features
# with the following lines of code before running PCA.

In [250]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [251]:
from sklearn.decomposition import PCA
# Fit a PCA that keeps every component, purely to inspect how much variance
# each component explains. X_train / X_test are deliberately NOT overwritten
# here: replacing them with PCA scores would make the next section fit its
# PCA on already-projected data instead of on the standardized features,
# unlike the later sections, which start again from freshly scaled data.
pca = PCA().fit(X_train)

In [252]:
# Display the explained-variance ratio of each component of the fitted PCA.
pca.explained_variance_ratio_  

array([0.25, 0.25, 0.25, 0.25])

In [253]:
# pca.explained_variance_ratio_ tells us the fraction of variance preserved by each principal component.
# In this case PC1 & PC2 preserve the maximum amount of variance.

### Training Algorithm with different combinations of principal components

In [254]:
# We will use different combinations of PCs & train a classification algorithm to see the effect of the principal components.

#### Training with 1 PC

In [255]:
from sklearn.decomposition import PCA
# Keep only the first principal component. The projection is fitted on the
# training split and then applied unchanged to the test split.
pca = PCA(n_components=1)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [256]:
from xgboost import XGBClassifier

In [257]:
# Gradient-boosted classifier with a fixed seed; this same `model` object is
# re-fitted in each of the sections below.
model = XGBClassifier(random_state=0)

In [258]:
# Train on the 1-component training data; the fitted estimator is the cell's
# displayed value.
# NOTE(review): y_train holds string class labels here; recent xgboost
# releases appear to require integer-encoded labels - confirm against the
# installed version.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [259]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X_test) 

In [260]:
from sklearn.metrics import classification_report

In [261]:
# Per-class precision / recall / F1 for the 1-component model.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       0.56      0.45      0.50        11
Iris-versicolor       0.27      0.23      0.25        13
 Iris-virginica       0.20      0.33      0.25         6

      micro avg       0.33      0.33      0.33        30
      macro avg       0.34      0.34      0.33        30
   weighted avg       0.36      0.33      0.34        30



#### Training with 2 PC

In [262]:
from sklearn.model_selection import train_test_split
# Start again from the untouched X / y so this section does not depend on
# the transformed arrays left behind by the previous one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [263]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split, then standardize both splits with it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [264]:
from sklearn.decomposition import PCA
# Project both splits onto the first two principal components; the
# projection is fitted on the training split only.
pca = PCA(n_components=2)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [265]:
# Re-fit the same `model` object on the 2-component training data.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [266]:
# Predict class labels for the 2-component test split.
y_pred = model.predict(X_test) 

In [267]:
# Per-class precision / recall / F1 for the 2-component model.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       0.85      0.85      0.85        13
 Iris-virginica       0.67      0.67      0.67         6

      micro avg       0.87      0.87      0.87        30
      macro avg       0.84      0.84      0.84        30
   weighted avg       0.87      0.87      0.87        30



#### Training with 3 PC

In [268]:
from sklearn.model_selection import train_test_split
# Start again from the untouched X / y so this section does not depend on
# the transformed arrays left behind by the previous one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [269]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split, then standardize both splits with it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [270]:
from sklearn.decomposition import PCA
# Project both splits onto the first three principal components; the
# projection is fitted on the training split only.
pca = PCA(n_components=3)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [271]:
# Re-fit the same `model` object on the 3-component training data.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [272]:
# Predict class labels for the 3-component test split.
y_pred = model.predict(X_test) 

In [273]:
# Per-class precision / recall / F1 for the 3-component model.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      1.00      1.00        13
 Iris-virginica       1.00      1.00      1.00         6

      micro avg       1.00      1.00      1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30



### Training on original dataset

In [274]:
from sklearn.model_selection import train_test_split
# Start again from the untouched X / y; no PCA is applied in this section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [275]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split, then standardize both splits with it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [276]:
# Re-fit the same `model` object on the standardized features, without PCA.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [277]:
# Predict class labels for the standardized (non-PCA) test split.
y_pred = model.predict(X_test) 

In [278]:
# Per-class precision / recall / F1 for the model trained without PCA.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      1.00      1.00        13
 Iris-virginica       1.00      1.00      1.00         6

      micro avg       1.00      1.00      1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30

