# Assignment 13: Principal Component Analysis (PCA)

## Dataset: Wine Quality

**Topics Covered:**
- Dimensionality Reduction
- Variance Explained
- Visualization in Reduced Dimensions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Read the wine dataset from a CSV file in the current working directory
df = pd.read_csv('wine.csv')
dataset_shape = df.shape
print("Dataset loaded! Shape:", dataset_shape)
# Trailing expression: in a notebook cell this renders a preview of the first rows
df.head()

In [None]:
# Build the feature matrix from every numeric column of the DataFrame.
# NOTE(review): no target column is removed here, so a numeric label column
# (if the CSV has one) is passed on to scaling/PCA as well — confirm intended.
X = df.select_dtypes(include=np.number)

# Standardize the selected columns: learn the scaling from X, then apply it to X
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print("Features scaled! Shape:", X_scaled.shape)

In [None]:
# Fit a PCA with default settings (no n_components given) on the scaled data
pca_full = PCA().fit(X_scaled)

# Per-component explained-variance ratio and its running total
explained_var = pca_full.explained_variance_ratio_
cumulative_var = explained_var.cumsum()

print("=== Explained Variance ===")
# Report at most the first 10 components, numbered from 1
leading_components = zip(explained_var[:10], cumulative_var[:10])
for pc_number, (ratio, running_total) in enumerate(leading_components, start=1):
    print("PC" + str(pc_number) + ":", round(ratio*100, 2), "%  (Cumulative:", round(running_total*100, 2), "%)")

In [None]:
# Scree plot and cumulative-variance plot, side by side in a single figure
component_numbers = range(1, len(explained_var) + 1)
fig, (scree_ax, cumulative_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: explained-variance ratio of each component as bars
scree_ax.bar(component_numbers, explained_var, color='steelblue')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Explained Variance Ratio')
scree_ax.set_title('Scree Plot')

# Right panel: running total, with a dashed reference line at 0.95
cumulative_ax.plot(component_numbers, cumulative_var, marker='o')
cumulative_ax.axhline(y=0.95, color='r', linestyle='--', label='95% variance')
cumulative_ax.set_xlabel('Number of Components')
cumulative_ax.set_ylabel('Cumulative Explained Variance')
cumulative_ax.set_title('Cumulative Variance')
cumulative_ax.legend()

# Save before showing so the written image contains the drawn figure
fig.tight_layout()
fig.savefig('pca_variance.png')
plt.show()

In [None]:
# Reduce the scaled data to two principal components so it can be drawn on a plane
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_scaled)

# Column counts before and after the projection, and the share of variance kept
n_dims_before = X_scaled.shape[1]
n_dims_after = X_pca.shape[1]
variance_retained_pct = round(sum(pca_2d.explained_variance_ratio_)*100, 2)

print("Original dimensions:", n_dims_before)
print("Reduced dimensions:", n_dims_after)
print("Variance retained:", variance_retained_pct, "%")

In [None]:
# Scatter plot of the samples in the plane of the two principal components
pc1_scores, pc2_scores = X_pca[:, 0], X_pca[:, 1]

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pc1_scores, pc2_scores, alpha=0.7, c='steelblue')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA - 2D Visualization')
ax.grid(True)

# Write the image to disk first, then display the figure
fig.savefig('pca_2d.png')
plt.show()

In [None]:
# Group the 2D PCA projection into three clusters with K-Means
# (random_state is fixed so repeated runs use the same seed)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)

# Same 2D scatter as before, now colored by the assigned cluster label
fig, ax = plt.subplots(figsize=(10, 8))
cluster_points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.7)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('K-Means Clustering on PCA Data')
fig.colorbar(cluster_points, ax=ax, label='Cluster')

# Write the image to disk first, then display the figure
fig.savefig('pca_clustering.png')
plt.show()

## Summary

**Key Findings:**
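- PCA reduced the standardized numeric features to 2 principal components for visualization
- The first few components capture most of the variance (see the cumulative-variance plot and its 95% reference line)
- The 2D projection, together with K-Means clustering (k = 3) on it, gives a view of the data's structure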