In [1]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML, then pull the
# feature matrix and the label vector out of the returned object.
mnist = fetch_openml(name='mnist_784', version=1)
X = mnist.data
y = mnist.target


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
from sklearn.ensemble import RandomForestClassifier
import time

# Baseline: a 100-tree Random Forest trained on the original (un-reduced)
# features. The fixed seed makes the fitted forest reproducible.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Time only the fit() call. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals; time.time() follows the
# system wall clock and can jump if that clock is adjusted mid-run.
start_time = time.perf_counter()
rf_clf.fit(X_train, y_train)
end_time = time.perf_counter()

# Elapsed training time, in seconds
training_time_original = end_time - start_time
print(f"Training time (original): {training_time_original:.2f} seconds")

Training time (original): 34.63 seconds


In [4]:
from sklearn.metrics import accuracy_score

# Evaluate the baseline forest on the held-out split.
y_pred = rf_clf.predict(X_test)

# Fraction of test labels that were predicted correctly.
accuracy_original = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy (original): {accuracy_original:.4f}")

Accuracy (original): 0.9673


In [8]:
from sklearn.decomposition import PCA

# Reduce the data to 100 principal components. The projection is fitted on
# the training split only and then reused as-is for the test split, so the
# test data does not influence the learned components.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Report the feature count before and after the projection.
print(f"Original number of features: {X_train.shape[1]}")
print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")

Original number of features: 784
Reduced number of features after PCA: 100


In [9]:
# Same Random Forest configuration as the baseline (100 trees, same seed),
# so the only difference between the two runs is the input representation.
rf_clf_pca = RandomForestClassifier(n_estimators=100, random_state=42)

# Time only the fit() call on the PCA-reduced training data. perf_counter()
# is a monotonic, high-resolution clock intended for measuring elapsed
# intervals; time.time() follows the system wall clock and can jump if that
# clock is adjusted mid-run.
start_time = time.perf_counter()
rf_clf_pca.fit(X_train_pca, y_train)
end_time = time.perf_counter()

# Elapsed training time, in seconds
training_time_pca = end_time - start_time
print(f"Training time (PCA): {training_time_pca:.2f} seconds")

Training time (PCA): 91.22 seconds


In [10]:
# Evaluate the PCA-trained forest on the PCA-projected test split.
y_pred_pca = rf_clf_pca.predict(X_test_pca)

# Fraction of test labels that were predicted correctly.
accuracy_pca = accuracy_score(y_true=y_test, y_pred=y_pred_pca)
print(f"Accuracy (PCA): {accuracy_pca:.4f}")

Accuracy (PCA): 0.9487


# Conclusion

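   - In this run, PCA did not reduce the training time: the Random Forest took 91.22 seconds to train on the 100 PCA components versus 34.63 seconds on the original 784 features, even though the dimensionality was reduced almost eightfold.

   - The performance (accuracy) of the Random Forest classifier is slightly lower after PCA (0.9487 versus 0.9673 on the original features).

   - PCA can be a useful technique for speeding up training when working with high-dimensional datasets like MNIST, but the benefit depends on the model: for the Random Forest used here it made training slower and slightly less accurate, so the effect should be measured rather than assumed.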