# Principal Component Analysis

Author: Prof. Sandro Camargo <github.com/sandrocamargo>

Data Mining Course https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213

This script demonstrates the basic concepts of Principal Component Analysis (PCA).

This script uses the Iris dataset: https://archive.ics.uci.edu/dataset/53/iris

To open this script in your Google Colab environment, [click here](https://colab.research.google.com/github/Sandrocamargo/data-mining/blob/main/Python/md07_clustering.ipynb).

In [None]:
# ------------------------------------------------------------------------------
# Import libraries
# ------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# ------------------------------------------------------------------------------
# Load the Iris dataset shipped with scikit-learn
# ------------------------------------------------------------------------------
iris = load_iris()

# Feature matrix and target labels, taken from the loaded dataset object.
X, y = iris.data, iris.target

# Names of the features and of the target classes (used later for plot labels).
feature_names, target_names = iris.feature_names, iris.target_names

In [None]:
# ------------------------------------------------------------------------------
# Standardize the data (important for PCA!)
# ------------------------------------------------------------------------------
# Fit the scaler on X, then apply it to the same data. `scaler` remains bound
# to the fitted object so it can be reused afterwards.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

In [None]:
# ------------------------------------------------------------------------------
# Fit a 2-component PCA on the standardized data and project the samples
# ------------------------------------------------------------------------------
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)

# Header line followed by the per-component explained variance ratios.
print("Explained variance ratio:", pca.explained_variance_ratio_, sep="\n")

In [None]:
# ------------------------------------------------------------------------------
# Scatter plot of the projected samples, one color per class
# ------------------------------------------------------------------------------
plt.figure()
colors = ['r', 'g', 'b']

for class_id, class_name in enumerate(target_names):
    # Keep only the projected rows whose label equals this class index.
    mask = (y == class_id)
    class_points = X_pca[mask]
    plt.scatter(class_points[:, 0], class_points[:, 1],
                color=colors[class_id], label=class_name)

# Explained variance expressed as a percentage (shown in the axis labels)
explained_var = 100 * pca.explained_variance_ratio_

plt.title('PCA of Iris Dataset')
plt.xlabel(f'PC 1 ({explained_var[0]:.2f}%)')
plt.ylabel(f'PC 2 ({explained_var[1]:.2f}%)')
plt.legend()

# ------------------------------------------------------------------------------
# Display the plot
# ------------------------------------------------------------------------------
plt.show()


In [None]:
# ------------------------------------------------------------------------------
# Plotting the components to see contributions of original variables
# ------------------------------------------------------------------------------
# One bar chart per principal component; each bar is the weight that the
# component assigns to one of the original features.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

for i, panel in enumerate(ax):
    panel.bar(feature_names, pca.components_[i], color='skyblue')
    panel.set_title(f'PC {i+1}')
    panel.set_ylabel('Weight')
    # Style the tick labels that bar() already created instead of calling
    # set_xticklabels(): Matplotlib discourages that method unless the tick
    # positions were fixed first, and it can emit a UserWarning otherwise.
    plt.setp(panel.get_xticklabels(), rotation=45, ha='right')

fig.suptitle('Feature Loading on Principal Components')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Reload Iris (rebinds iris, X and y)
iris = load_iris()
X, y = iris.data, iris.target

# Fit a 2-component PCA and project the samples.
# NOTE(review): this PCA is fit on the raw X, not on standardized features as
# in the earlier cell -- confirm that the difference is intentional.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Scatter plot of the projection, one color per class index
plt.figure()
for class_id, marker_color in enumerate(['r', 'g', 'b']):
    mask = (y == class_id)
    class_points = X_pca[mask]
    plt.scatter(class_points[:, 0], class_points[:, 1],
                color=marker_color, label=iris.target_names[class_id])

plt.title('Iris projected into first two principal components')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.show()


In [None]:
import numpy as np

# Bar chart of the explained variance ratio of each fitted component, in
# percent, with components numbered from 1 on the x axis.
variance_pct = pca.explained_variance_ratio_ * 100
component_numbers = range(1, len(pca.explained_variance_ratio_) + 1)

plt.figure()
plt.bar(component_numbers, variance_pct)
plt.title('Explained variance by principal components')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance (%)')
plt.show()


In [None]:
import seaborn as sns

# Annotated heatmap of the PCA component matrix: rows are labelled PC1/PC2 and
# columns are labelled with the Iris feature names.
loadings = pca.components_

fig, ax = plt.subplots()
sns.heatmap(
    loadings,
    annot=True,
    cmap='coolwarm',
    xticklabels=iris.feature_names,
    yticklabels=['PC1', 'PC2'],
    ax=ax,  # the axes just created, i.e. the current axes
)
ax.set_title('Feature Loading on Principal Components')
plt.show()
