In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Location of the CSV file read by this cell.
DATA_PATH = "/content/mnistdataset.csv"
data = pd.read_csv(DATA_PATH)

# Every column except the last is used as a feature (presumably pixel
# intensities — confirm against the CSV header); the last column is the label.
feature_frame = data.iloc[:, :-1]
label_column = data.iloc[:, -1]

X = feature_frame.values
y = label_column.values



In [3]:
# Learn per-column scaling statistics from all rows of X, then apply them.
# `scaler` stays bound to the fitted transformer for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
components_list = [2, 10, 30, 50]

# Fit PCA a single time at the largest requested size instead of refitting
# from scratch for every entry. `explained_variance_ratio_` is ordered by
# decreasing variance and each entry is a fraction of the total variance, so
# the sum of its first n entries is the variance retained by n components.
# `random_state` makes the result repeatable when the randomized SVD solver
# is selected automatically.
pca = PCA(n_components=max(components_list), random_state=42)
pca.fit(X_scaled)
variance_ratios = pca.explained_variance_ratio_

# Map "number of components" -> total explained variance ratio.
explained_variances = {}
for n in components_list:
    explained_variances[n] = np.sum(variance_ratios[:n])

# Print explained variance
for k, v in explained_variances.items():
    print(f"PCA components = {k}, Explained Variance = {v:.4f}")

In [None]:
# Fit PCA with all components kept so the full variance curve is available.
pca_full = PCA().fit(X_scaled)

cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
# Entry i of the cumulative sum covers i + 1 components. Plot against explicit
# 1-based counts; the default x values start at 0 and would shift the curve
# one component to the left of the axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance)
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Cumulative Explained Variance")
plt.show()

In [None]:
best_n = 50

# Train-test split
# Split exactly once so the original-feature and PCA-feature experiments are
# guaranteed to share the same rows and labels, rather than relying on two
# separate calls happening to use matching arguments.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Fit PCA on the training rows only, then project the held-out rows with the
# already-fitted transform. Fitting on the full dataset before splitting lets
# information from the test rows shape the components and inflates the
# reported test accuracy.
# NOTE(review): X_scaled itself was standardized with statistics computed
# from every row of X, so a milder form of the same leak remains upstream.
pca = PCA(n_components=best_n, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Projection of the full dataset with the training-fitted PCA, kept so any
# later cell that refers to X_pca continues to work.
X_pca = pca.transform(X_scaled)

In [None]:


# Baseline: logistic regression on the standardized original features.
lr_original = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_original = lr_original.predict(X_test)

# Same classifier configuration, trained on the PCA-reduced features.
lr_pca = LogisticRegression(max_iter=1000).fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)


In [None]:
# Score both sets of test predictions against the same held-out labels.
acc_pca = accuracy_score(y_test, y_pred_pca)
acc_original = accuracy_score(y_test, y_pred_original)

# Report the two accuracies, one per line.
print(
    f"Accuracy (Original Data): {acc_original:.4f}",
    f"Accuracy (PCA Reduced Data): {acc_pca:.4f}",
    sep="\n",
)


In [None]:
# Project the standardized data onto its first two principal components.
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_scaled)

# Scatter the projection, colouring each point by its label value.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap="tab10", s=5)

# One legend entry per distinct colour value, anchored via bbox_to_anchor.
legend_handles, legend_labels = scatter.legend_elements()
ax.legend(legend_handles, legend_labels, title="Digits", bbox_to_anchor=(1.05, 1))

ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_title("PCA 2D Visualization of MNIST")
plt.show()