# Dimensionality Reduction with PCA (Principal Component Analysis)

In [1]:
# Principal Component Analysis is an unsupervised dimensionality reduction technique that
# does not depend on labels of a dataset.

In [2]:
# Principal Component Analysis prioritizes directions in feature space (the principal components,
# i.e. linear combinations of the features) by how much of the data's variance each one captures

# Advantages
# 1. Correlated features can be detected and removed using PCA
# 2. Reduces overfitting because of a reduction in the number of features
# 3. Model training can be expedited

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load the iris example dataset through seaborn.
# NOTE: seaborn fetches its example datasets from an online repository, so
# this call may need network access the first time it runs.
iris_df = sns.load_dataset("iris")

# Preview the first rows; the cell output shows four numeric measurement
# columns plus the `species` label column.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Separate predictors from the target: every column except `species` forms
# the feature matrix, and `species` alone is the label vector.
X = iris_df.drop(columns="species")
Y = iris_df.loc[:, "species"]

In [5]:
from sklearn import preprocessing

# Encode the species names as integer class ids for the classifier.
# The encoder is fitted on the original `species` column instead of on `Y`
# itself: `Y` is overwritten below, so fitting on it would, on a re-run of
# this cell, re-encode the already-encoded integers and replace the species
# names stored in `le.classes_` with 0/1/2. Reading from `iris_df` keeps the
# cell idempotent and keeps `le.inverse_transform` meaningful.
le = preprocessing.LabelEncoder()

Y = le.fit_transform(iris_df['species'])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply that
# same transformation to both splits so no test-set statistics are used.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [8]:
from sklearn.decomposition import PCA

# Fit a full-rank PCA purely to inspect how much variance each principal
# component explains (read from `pca.explained_variance_ratio_`).
#
# Only `fit` is called here: the standardised X_train / X_test are left
# untouched on purpose. Overwriting them with the full rotation would make
# any PCA fitted afterwards operate on already-projected data (a PCA of a
# PCA) and would make this cell's effect depend on how often it was run.
# The explained-variance ratios are the same either way.
pca = PCA().fit(X_train)

In [9]:
# Print the fraction of total variance captured by each principal component
# of the `pca` object currently in scope (the printed values sum to 1).

variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)

[0.72229951 0.2397406  0.03335483 0.00460506]


In [10]:
from sklearn.decomposition import PCA

# Keep only the two leading principal components; the variance ratios
# printed earlier show they account for roughly 96% of the variance.
# The right-hand side is evaluated left to right, so the PCA is fitted on the
# training split before the test split is projected.
pca = PCA(n_components=2)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier with default settings on the
# reduced training features.
lg = LogisticRegression().fit(X_train, Y_train)

# Predict the held-out split and report the fraction labelled correctly.
Y_pred = lg.predict(X_test)
print(accuracy_score(Y_test, Y_pred))

0.8666666666666667
