# Principal Component Analysis (PCA)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler #This is for standardizing data
from sklearn.decomposition import PCA #This is for Principal Component Analysis (PCA)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the Boston dataset from the workshop's local folder and preview the first rows.
boston_csv = "D:\\Workshops\\Python for Data Science Comprehensive Workshop\\Part 04 - Machine Learning\\Datasets\\Boston.CSV"
data = pd.read_csv(boston_csv)
data.head()

In [None]:
# Keep the first 12 columns by position and preview them.
# NOTE(review): presumably these are the independent variables - confirm against the CSV layout.
datx=data.iloc[:,:12]
datx.head()

# Checking the correlation among variables

In [None]:
datx.corr() #This returns the correlation matrix for the independent variables

In [None]:
# Annotated heatmap of the pairwise correlations, colour scale fixed to [-1, 1].
corr_matrix = datx.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1)
plt.show()

# Standardizing data including independent & response data

# Creating standardizing object

In [None]:
# Create an unfitted StandardScaler and display it.
stand=StandardScaler()
stand

# Training the standardizing object with data

In [None]:
# Fit the scaler on the whole data frame, i.e. on every column of data.
stand.fit(data)

# Standardizing data

In [None]:
# Standardize with the scaler that was fitted in the previous step.
# transform() reuses that fit; fit_transform() would only re-estimate the
# same per-column statistics a second time.
sdata = stand.transform(data)
sdata

# Splitting data into independent & response

In [None]:
# Columns 0-11 of the standardized array become the independent data (x),
# column 12 becomes the response (y).
x, y = sdata[:, :12], sdata[:, 12]

# First we have to find how many PCs to keep, so we start by extracting all of the PCs

In [None]:
# Ask for 12 components - as many as x has columns - so nothing is discarded yet.
pc=PCA(n_components=12)

# Training the PC with independent data

In [None]:
# Fit the PCA on the standardized independent data.
pc.fit(x)

# How these PCs explain the variance

In [None]:
pc.explained_variance_ #How much variance each component captures individually

In [None]:
pc.explained_variance_ratio_.cumsum() #How much variance is captured together (running total of the ratios)

In [None]:
# Scree plot: variance captured by each principal component, in component order.
# x and y are passed by keyword because current seaborn releases no longer
# accept the data vectors as positional arguments to lineplot().
component_numbers = np.arange(1, pc.explained_variance_.shape[0] + 1)
sns.lineplot(x=component_numbers, y=pc.explained_variance_)
plt.show()

# Select the number of PCs which capture most of the variance

In [None]:
# Rebind pc to a new PCA object that keeps only 7 components.
pc=PCA(n_components=7)

# Training the PCs with independent data

In [None]:
# Fit the reduced PCA on the standardized independent data.
pc.fit(x)

# Get the new independent data expressed in terms of the PCs

In [None]:
# Project the independent data onto the components fitted in the previous step.
# transform() reuses that fit instead of estimating the components again.
# NOTE: x is overwritten with the reduced data, so run this cell once per fit.
x = pc.transform(x)

In [None]:
x #This independent data (the principal component scores) can be used for model fitting

In [None]:
x.shape #Now the number of dimensions has been reduced

# Factor Analysis (FA)

# Importing the required library

In [None]:
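# factor_analyzer is a third-party package. If it is not installed yet, install it
# from a notebook cell with:  %pip install factor_analyzer
# (the older pip.main(["install","factor_analyzer"]) call no longer works with pip 10 or later)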

In [None]:
from factor_analyzer import FactorAnalyzer #This is for Factor Analysis
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity #For testing the factorability
from factor_analyzer.factor_analyzer import calculate_kmo #For testing the factorability

In [None]:
# Read the bfi dataset from the workshop's local folder and preview the first rows.
bfi_csv = "D:\\Workshops\\Python for Data Science Comprehensive Workshop\\Part 04 - Machine Learning\\Datasets\\bfi.CSV"
data = pd.read_csv(bfi_csv)
data.head()

In [None]:
# (rows, columns) of the loaded data.
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Remove every row that contains a missing value.
data = data.dropna()

In [None]:
# Preview the first rows of the data again.
data.head()

# Testing the factorability

# Bartlett’s Test

In [None]:
# Run Bartlett's test on the data and show the statistic together with its p-value.
chi_square_value,p_value=calculate_bartlett_sphericity(data)
chi_square_value, p_value    #A p-value below 0.05 supports factorability

# Kaiser-Meyer-Olkin (KMO) Test

In [None]:
# calculate_kmo returns two results; only the second one (kmo_model) is inspected here.
kmo_all,kmo_model=calculate_kmo(data)
kmo_model   #Above 0.6 means better

# Creating the factor analyzer object

In [None]:
# Unrotated model requesting 25 factors.
# NOTE(review): 25 is hard-coded - presumably the number of observed variables; confirm against data.shape[1].
fa = FactorAnalyzer(n_factors=25,rotation=None)

# Selecting all the factors without rotations

In [None]:
# Fit the unrotated model on the cleaned data.
fa.fit(data)

# Eigenvalues of the correlation matrix and of the common-factor solution

In [None]:
# get_eigenvalues() returns two arrays of eigenvalues: ev from the original
# correlation matrix and v from the common-factor solution (not eigenvectors).
ev, v = fa.get_eigenvalues()

In [None]:
# Show the eigenvalues that are plotted in the scree plot.
ev

# Scree plot

In [None]:
# Scree plot: eigenvalue against factor number, drawn as a line with point markers.
factor_numbers = range(1, data.shape[1] + 1)
plt.plot(factor_numbers, ev)
plt.scatter(factor_numbers, ev)
plt.show()

# Selecting suitable number of factors and refit

In [None]:
# Refit with 6 factors and no rotation.
fa = FactorAnalyzer(n_factors=6,rotation=None)
fa.fit(data)

# Factor loadings

In [None]:
# Loadings of the unrotated 6-factor model.
fa.loadings_

# SS Loadings, Proportion Var and Cumulative Var of factors

In [None]:
# SS loadings, proportion of variance and cumulative variance of the factors.
fa.get_factor_variance()

# Communalities

In [None]:
# Communalities of the fitted model.
fa.get_communalities()

# With factor rotation

In [None]:
# Same number of factors (6), now with varimax rotation.
fa = FactorAnalyzer(n_factors=6,rotation="varimax")
fa.fit(data)

In [None]:
np.set_printoptions(suppress=True) #Print NumPy floats without scientific notation

In [None]:
# Loadings of the most recently fitted model.
fa.loadings_

In [None]:
# Put the loadings in a labelled table: one row per column of data, one
# column per factor (numbered from 1).
# The factor count is read from the loadings matrix itself, so the labels stay
# correct if n_factors is changed.
ind = data.columns
n_extracted = fa.loadings_.shape[1]
df = pd.DataFrame(fa.loadings_, columns=range(1, n_extracted + 1), index=ind)
df

In [None]:
# SS loadings, proportion of variance and cumulative variance of the factors.
fa.get_factor_variance()

In [None]:
# Communalities of the fitted model.
fa.get_communalities()

# Select more suitable factors

In [None]:
# Final model: 5 factors with varimax rotation.
fa = FactorAnalyzer(n_factors=5,rotation="varimax")
fa.fit(data)

In [None]:
# Factor scores for every observation, computed with the model fitted in the
# previous step. transform() reuses that fit instead of fitting a second time.
x_latent = fa.transform(data)
x_latent