# Dimensionality Reduction using Principal Component Analysis (PCA)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# load the digits data set: 8x8 grayscale images, flattened to 64 features each
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Load the handwritten-digits dataset and keep its three views.
digits = load_digits()
X, y, images = digits.data, digits.target, digits.images
# X:      (n_samples, 64)   flattened pixel features
# y:      labels 0..9
# images: (n_samples, 8, 8) pixel grids, convenient for plotting

print("X shape:", X.shape)
print("Each image has 8x8 = 64 pixels/features")


#### Plot the first 12 digits with their labels to see what the data looks like. These are the images the model will be trained on.

In [None]:
# Preview the first 12 digits (2 rows x 6 columns), each titled with its label.
n_rows, n_cols = 2, 6
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 4))
for ax, image, label in zip(axes.flat, images, y):
    ax.imshow(image, cmap="gray")
    ax.set_title(f"Label: {label}")
    ax.axis("off")
fig.tight_layout()
plt.show()


#### These are tiny 8×8 pictures. The computer sees them as 64 numbers.

### Why scaling matters for PCA

#### PCA is based on variance. Scaling ensures features are comparable.

In [None]:
# Standardize every pixel feature: first learn the per-feature statistics,
# then apply them to the same data (equivalent to one fit_transform call).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


#### PCA works best when features are standardized (mean 0, std 1), especially when feature scales differ.

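##### PCA measures variance around the mean, so it needs the data to be centered (mean = 0). scikit-learn's `PCA` centers the data automatically.
##### `StandardScaler` additionally rescales every feature to unit variance, so that no single feature dominates the components just because of its scale.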

In [None]:

# Project the standardized data onto its first two principal components.
pca_2 = PCA(n_components=2, random_state=42)
X_2d = pca_2.fit_transform(X_scaled)

# Share of the total variance captured by each of the two components.
variance_ratio_2d = pca_2.explained_variance_ratio_
print("Explained variance by PC1 and PC2:", variance_ratio_2d)
print("Total explained variance (2D):", variance_ratio_2d.sum())


In [None]:
# Scatter the 2D projection, colouring each point by its digit label.
# The labels are categories, not a continuous quantity, so a qualitative
# 10-colour map with one colour band per digit is used instead of the
# default continuous colormap.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    X_2d[:, 0],
    X_2d[:, 1],
    c=y,
    cmap="tab10",
    vmin=-0.5,  # limits chosen so that digit k falls in the centre of band k
    vmax=9.5,
    s=15,
    alpha=0.8,
)
ax.set_xlabel("PC1 (most important summary direction)")
ax.set_ylabel("PC2 (second most important summary direction)")
ax.set_title("Digits projected from 64D → 2D using PCA")

# One integer tick per digit rather than a continuous scale.
cbar = fig.colorbar(scatter, ax=ax, ticks=np.arange(10))
cbar.set_label("Digit label")
plt.show()


In [None]:
# Fit PCA with all components to see how variance accumulates.
pca_full = PCA().fit(X_scaled)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)

# cum_var[j] is the variance kept by the first j + 1 components, so the
# x-axis must start at 1 to match the "number of components kept" label.
n_components_kept = np.arange(1, cum_var.size + 1)

plt.figure(figsize=(8, 5))
plt.plot(n_components_kept, cum_var, marker="o", markersize=3)
plt.xlabel("Number of PCA components kept")
plt.ylabel("Cumulative explained variance")
plt.title("How much information is kept as we add more components?")
plt.grid(True)
plt.show()

# Common checkpoints:
for k in [2, 10, 20, 30, 40]:
    # k components correspond to index k - 1 of the cumulative array.
    print(f"{k} components keep about {cum_var[k-1]*100:.1f}% of variance")


In [None]:
def pca_reconstruct(X_scaled, n_components):
    """Compress data with PCA and map it back to the input feature space.

    Parameters
    ----------
    X_scaled : array-like
        Data matrix the PCA model is fitted on and then reconstructed.
    n_components : int
        Forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    tuple
        ``(X_reduced, X_recon, pca)``: the projected data, its
        reconstruction via ``inverse_transform``, and the fitted PCA model.
    """
    model = PCA(n_components=n_components, random_state=42)
    projected = model.fit_transform(X_scaled)
    restored = model.inverse_transform(projected)
    return projected, restored, model

# Component counts to compare, and a place to keep each result:
# recons[k] -> (reconstructed data, fitted PCA model).
components_list = [2, 10, 20, 40]
recons = {}

for k in components_list:
    # The reduced coordinates themselves are not needed here.
    _, X_recon, pca_k = pca_reconstruct(X_scaled, k)
    recons[k] = (X_recon, pca_k)
    variance_kept_pct = pca_k.explained_variance_ratio_.sum() * 100
    print(f"{k} components keep {variance_kept_pct:.1f}% variance")


In [None]:
n_show = 8  # number of digits to display

fig, axes = plt.subplots(len(components_list) + 1, n_show, figsize=(12, 6))

# Every panel is drawn in pixel space on one shared grey scale so that the
# rows can be compared directly with each other.
pixel_min, pixel_max = X.min(), X.max()


def _show_digit(ax, flat_pixels):
    """Draw one flattened 8x8 digit, hiding ticks and frame but not labels.

    ``ax.axis("off")`` is deliberately avoided: it also hides the y-label,
    which is what names each row of the grid.
    """
    ax.imshow(flat_pixels.reshape(8, 8), cmap="gray", vmin=pixel_min, vmax=pixel_max)
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)


# Row 1: original (unscaled pixels, not the standardized values)
for i in range(n_show):
    _show_digit(axes[0, i], X[i])
    axes[0, i].set_title(f"Orig: {y[i]}")
axes[0, 0].set_ylabel("Original", rotation=90, labelpad=10)

# Rows: reconstructions
for row, k in enumerate(components_list, start=1):
    X_recon, _ = recons[k]
    # Reconstructions live in standardized space; undo the scaling so they
    # are shown in the same pixel units as the originals.
    X_recon_pixels = scaler.inverse_transform(X_recon[:n_show])
    for i in range(n_show):
        _show_digit(axes[row, i], X_recon_pixels[i])
    axes[row, 0].set_ylabel(f"{k} PCs", rotation=90, labelpad=10)

plt.suptitle("What gets lost when we reduce dimensions? (Reconstruction)", y=1.02)
plt.tight_layout()
plt.show()


##### The first row shows the original images, while rows 2–5 show the same images reconstructed after compressing each one into 2, 10, 20 and 40 PCA components, revealing how much visual information is lost when dimensionality is reduced too aggressively.

#### With 2 components, digits look blurry and lose detail

#### With 10–20 components, digits become recognizable

#### With 40 components, digits are close to the original

#### This visually explains: fewer dimensions = more compression = more information loss

## Let's see how this dimensionality reduction affects classification

In [None]:
from sklearn.model_selection import train_test_split

# Split the RAW features first, then learn the scaling from the training
# rows only. Splitting the already-standardized X_scaled would leak test-set
# statistics (mean/std computed over all rows) into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# A separate scaler is used so the full-data `scaler` fitted earlier for the
# visualizations is left untouched.
clf_scaler = StandardScaler().fit(X_train_raw)
X_train = clf_scaler.transform(X_train_raw)
X_test = clf_scaler.transform(X_test_raw)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import time

# Baseline: logistic regression on all 64 standardized pixel features.
# `multi_class` is not passed: "auto" is its default, and the parameter is
# deprecated in recent scikit-learn releases.
lr_original = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
)

# perf_counter is a monotonic, high-resolution clock intended for timing
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
lr_original.fit(X_train, y_train)
train_time_original = time.perf_counter() - start

y_pred_orig = lr_original.predict(X_test)

print("Logistic Regression on Original Features (64D)")
print("Accuracy:", accuracy_score(y_test, y_pred_orig))
print("Training time (sec):", round(train_time_original, 3))
print(classification_report(y_test, y_pred_orig))


### Note from the support column that the classes are roughly balanced, and recall is also very good for every digit.

### Now let's increase the number of PCA components to 20 for a fair classification comparison. Two would be far too few.

In [None]:
# Reduce 64 features to 20 principal components. The PCA model is fitted on
# the training split only and then applied unchanged to the test split.
pca_20 = PCA(n_components=20, random_state=42)

X_train_pca = pca_20.fit_transform(X_train)
X_test_pca = pca_20.transform(X_test)

variance_retained_20 = pca_20.explained_variance_ratio_.sum()
print("Original train shape:", X_train.shape)
print("PCA-reduced train shape:", X_train_pca.shape)
print("Variance retained:", round(variance_retained_20, 3))


In [None]:
# Same classifier as the 64D baseline, trained on the 20 PCA features.
# `multi_class` is not passed: "auto" is its default, and the parameter is
# deprecated in recent scikit-learn releases.
lr_pca = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
)

# perf_counter is a monotonic, high-resolution clock intended for timing
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
lr_pca.fit(X_train_pca, y_train)
train_time_pca = time.perf_counter() - start

y_pred_pca = lr_pca.predict(X_test_pca)

print("Logistic Regression on PCA-reduced Features (20D)")
print("Accuracy:", accuracy_score(y_test, y_pred_pca))
print("Training time (sec):", round(train_time_pca, 3))
print(classification_report(y_test, y_pred_pca))


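### This illustrates a key advantage of PCA: with fewer features, training time is usually lower. Compare the two timings above, keeping in mind that on a dataset this small a single timing can be within measurement noise.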