In [None]:
import pandas as pd

# Read both header-less CSV files, stack them into a single frame with a
# fresh 0..n-1 index, then separate the last column (kept as `y`) from all
# preceding columns (kept as `X`).
train, test = (
    pd.read_csv(path, header=None)
    for path in ('optdigits.tra', 'optdigits.tes')
)
df = pd.concat((train, test), ignore_index=True)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

: 

In [None]:
# Preview only the first rows; displaying the whole frame adds noise to the
# notebook without telling the reader anything more about its layout.
X.head()

In [None]:
# (number of rows, number of feature columns) of the feature frame.
X.shape

In [None]:
# How many rows carry each distinct value of the label column.
y.value_counts()

In [None]:
# Count of missing entries in each feature column.
X.isna().sum()

In [None]:
# Per-column summary statistics of the feature frame.
X.describe()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Draw ten distinct rows of X as 8x8 images, titled with their label from y.
# A locally seeded generator is used so the same ten samples are shown on
# every Restart & Run All, without touching NumPy's global random state.
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X), size=10, replace=False)

fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for ax, idx in zip(axes.flatten(), sample_idx):
    # Each row holds 64 values, laid out here as an 8x8 grid.
    ax.imshow(X.iloc[idx].to_numpy().reshape(8, 8), cmap='gray_r')
    ax.set_title(f"Label: {y.iloc[idx]}")
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column centering/scaling parameters from X, then apply them
# to X itself. The fitted `scaler` is kept so the transform can be reused.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto the first two principal components.
# random_state is fixed because, depending on the scikit-learn version, PCA
# may pick a randomized SVD solver when only a few components are requested,
# which would otherwise make the projection vary between runs.
pca2 = PCA(n_components=2, random_state=42)
X_pca2 = pca2.fit_transform(X_scaled)

print("Explained variance ratio:", pca2.explained_variance_ratio_)
print("Total variance retained:", pca2.explained_variance_ratio_.sum())


In [None]:
import seaborn as sns

# Scatter of the two PCA coordinates, one colour per value of y.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca2[:, 0],
    y=X_pca2[:, 1],
    hue=y,
    palette='tab10',
    s=40,
    alpha=0.8,
    ax=ax,
)
ax.set_title("PCA (2D) of Handwritten Digits")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()

In [None]:
# Fit PCA with all components and plot how much variance the first k
# components explain together.
pca = PCA().fit(X_scaled)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Use explicit 1-based x values: entry 0 of the cumulative sum is the variance
# explained by ONE component, so plotting against the default 0-based index
# would shift the whole curve left by one under this axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid()
plt.show()

In [None]:
# Reduce the standardized features to 20 principal components.
# random_state is fixed because, depending on the scikit-learn version, PCA
# may pick a randomized SVD solver for a reduced component count, which would
# otherwise make the projection (and everything built on it) vary between runs.
pca20 = PCA(n_components=20, random_state=42)
X_pca20 = pca20.fit_transform(X_scaled)
print(f"Variance retained: {pca20.explained_variance_ratio_.sum():.2f}")

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Cluster the 20-component PCA representation into 10 groups and compare the
# cluster assignment against the labels in y.
# n_init is given explicitly because its default differs between scikit-learn
# releases; pinning it keeps the scores comparable across environments.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca20)
print("ARI:", adjusted_rand_score(y, clusters))
print("NMI:", normalized_mutual_info_score(y, clusters))

In [None]:
# Map every K-Means centroid from the 20-component PCA space back to the
# original feature space: first undo the PCA projection, then undo the
# standardization, so each row of `centroids` is on the same scale as a row
# of X and can be drawn as an 8x8 image.
centroids = scaler.inverse_transform(
    pca20.inverse_transform(kmeans.cluster_centers_)
)

fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for i, (ax, centroid) in enumerate(zip(axes.flatten(), centroids)):
    ax.imshow(centroid.reshape(8, 8), cmap='gray_r')
    ax.set_title(f"C{i}")
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Split the raw features BEFORE any preprocessing is fitted, so the scaler and
# PCA never see the held-out rows. Fitting them on the full data and splitting
# afterwards would leak test-set statistics into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize and project to 20 components using parameters learned from the
# training rows only, then apply the same fitted transform to the test rows.
preprocess = make_pipeline(
    StandardScaler(),
    PCA(n_components=20, random_state=42),
)
X_train = preprocess.fit_transform(X_train_raw)
X_test = preprocess.transform(X_test_raw)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))