In [1]:
import numpy as np
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:

# Download the MNIST digits from OpenML as a (features, labels) pair.
X, y = fetch_openml(
    'mnist_784',
    version=1,
    return_X_y=True,
    parser='auto',
)

# Store the pixel values as 32-bit floats before any arithmetic is applied.
X = X.astype('float32')

In [3]:
# Divide every pixel value by 255.
# NOTE: this overwrites X, so re-running the cell on its own divides the data
# again; re-run the loading cell first if the cell has to be repeated.
X = X / 255.0

In [4]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Sanity check: report the size of each split and the feature count per image.
print(
    f"Data loaded and split: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples",
    f"Each image is represented by {X_train.shape[1]} features (pixels)",
    sep="\n",
)

Data loaded and split: 56000 training samples, 14000 test samples
Each image is represented by 784 features (pixels)


In [6]:
# Baseline model: a 100-tree random forest trained on the raw pixel features.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,  # fixed seed so the forest is reproducible
    n_jobs=-1,        # -1 requests all available cores (see scikit-learn docs)
)

In [7]:
# Time the baseline fit on the full-dimensional training data.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring intervals, so the result cannot
# be skewed by system clock adjustments happening during the fit.
start_time = time.perf_counter()
rf_clf.fit(X_train, y_train)
original_training_time = time.perf_counter() - start_time  # elapsed seconds

In [8]:
# Report the elapsed time measured around the baseline forest's fit call.
print(f"Training time: {original_training_time:.2f} seconds")

Training time: 4.89 seconds


In [9]:
# Time inference of the baseline forest on the held-out test split.
# time.perf_counter() is used instead of time.time() because this interval is
# well under a second: perf_counter() is monotonic and has the highest
# available resolution, whereas the wall clock may be coarse or adjusted.
start_time = time.perf_counter()
y_pred = rf_clf.predict(X_test)
original_prediction_time = time.perf_counter() - start_time  # elapsed seconds

In [10]:
# Score the baseline predictions, then print timing, overall accuracy and the
# per-class precision/recall breakdown in a single call.
original_accuracy = accuracy_score(y_test, y_pred)
print(
    f"Prediction time: {original_prediction_time:.2f} seconds",
    f"Accuracy: {original_accuracy:.4f}",
    "\nClassification Report (original data):",
    classification_report(y_test, y_pred),
    sep="\n",
)

Prediction time: 0.14 seconds
Accuracy: 0.9675

Classification Report (original data):
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1343
           1       0.98      0.98      0.98      1600
           2       0.95      0.97      0.96      1380
           3       0.96      0.95      0.96      1433
           4       0.96      0.97      0.97      1295
           5       0.97      0.96      0.97      1273
           6       0.98      0.98      0.98      1396
           7       0.97      0.97      0.97      1503
           8       0.96      0.95      0.96      1357
           9       0.96      0.95      0.95      1420

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97     14000



In [11]:
# Target dimensionality: keep 100 principal components.
n_components = 100

# The PCA is only configured here; it is fitted in a later cell.
pca = PCA(
    n_components=n_components,
    random_state=42,
)

In [12]:
# Learn the principal components from the training split only, then project
# both splits with the same fitted transform.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short intervals.
start_time = time.perf_counter()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Elapsed seconds for the fit plus BOTH projections (train and test).
pca_time = time.perf_counter() - start_time

In [13]:
# Fraction of the total variance captured by the retained components.
variance_ratio = pca.explained_variance_ratio_.sum()

print(
    f"PCA transformation time: {pca_time:.2f} seconds",
    f"Number of components: {n_components}",
    f"Explained variance: {variance_ratio:.4f}",
    sep="\n",
)


PCA transformation time: 0.31 seconds
Number of components: 100
Explained variance: 0.9151


In [14]:
# Second forest for the PCA-reduced features. The hyperparameters match the
# baseline forest (100 trees, seed 42, n_jobs=-1) so only the input differs.
rf_clf_pca = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

In [15]:
# Time the forest fit on the PCA-reduced training data. The PCA fit/transform
# cost is NOT included here; it is tracked separately in pca_time.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring intervals.
start_time = time.perf_counter()
rf_clf_pca.fit(X_train_pca, y_train)
pca_training_time = time.perf_counter() - start_time  # elapsed seconds

In [16]:
# Report the elapsed time of the forest fit on the PCA-reduced features
# (this figure does not include the PCA transformation itself).
print(f"Training time with PCA: {pca_training_time:.2f} seconds")

Training time with PCA: 11.30 seconds


In [17]:
# Time inference on the already-projected test split (projection cost is part
# of pca_time, not of this measurement).
# time.perf_counter() is used instead of time.time() because this interval is
# very short: perf_counter() is monotonic and has the highest available
# resolution, whereas the wall clock may be coarse or adjusted.
start_time = time.perf_counter()
y_pred_pca = rf_clf_pca.predict(X_test_pca)
pca_prediction_time = time.perf_counter() - start_time  # elapsed seconds

In [18]:
# Score the PCA-pipeline predictions, then print timing, overall accuracy and
# the per-class precision/recall breakdown in a single call.
pca_accuracy = accuracy_score(y_test, y_pred_pca)
print(
    f"Prediction time with PCA: {pca_prediction_time:.2f} seconds",
    f"Accuracy with PCA: {pca_accuracy:.4f}",
    "\nClassification Report (PCA):",
    classification_report(y_test, y_pred_pca),
    sep="\n",
)

Prediction time with PCA: 0.07 seconds
Accuracy with PCA: 0.9506

Classification Report (PCA):
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1343
           1       0.97      0.98      0.98      1600
           2       0.94      0.95      0.95      1380
           3       0.92      0.93      0.93      1433
           4       0.94      0.96      0.95      1295
           5       0.94      0.94      0.94      1273
           6       0.97      0.97      0.97      1396
           7       0.96      0.96      0.96      1503
           8       0.94      0.90      0.92      1357
           9       0.94      0.92      0.93      1420

    accuracy                           0.95     14000
   macro avg       0.95      0.95      0.95     14000
weighted avg       0.95      0.95      0.95     14000



In [19]:
# End-to-end cost of each pipeline: the PCA total also counts the time spent
# fitting the PCA and projecting both splits.
original_total_time = original_training_time + original_prediction_time
pca_total_time = pca_time + pca_training_time + pca_prediction_time

# Side-by-side timings, baseline first and PCA pipeline second.
print(
    "\n--- Comparison ---",
    f"Original training time: {original_training_time:.2f} seconds",
    f"PCA + training time: {pca_time + pca_training_time:.2f} seconds",
    f"Original prediction time: {original_prediction_time:.2f} seconds",
    f"PCA prediction time: {pca_prediction_time:.2f} seconds",
    sep="\n",
)


--- Comparison ---
Original training time: 4.89 seconds
PCA + training time: 11.61 seconds
Original prediction time: 0.14 seconds
PCA prediction time: 0.07 seconds


In [20]:
# Every ratio is "baseline time / PCA time": a value above 1.0 means the PCA
# pipeline was faster, a value below 1.0 means it was slower.
# The training and prediction ratios compare the forests alone; only the total
# ratio accounts for the PCA transformation time.
training_speedup = original_training_time / pca_training_time
prediction_speedup = original_prediction_time / pca_prediction_time
total_speedup = original_total_time / pca_total_time

print(
    f"\nTraining speedup with PCA: {training_speedup:.2f}x",
    f"Prediction speedup with PCA: {prediction_speedup:.2f}x",
    f"Total time without PCA: {original_total_time:.2f} seconds",
    f"Total time with PCA (including transformation): {pca_total_time:.2f} seconds",
    f"Total speedup: {total_speedup:.2f}x",
    sep="\n",
)

# A negative difference means accuracy dropped after the PCA reduction.
print(
    f"\nOriginal accuracy: {original_accuracy:.4f}",
    f"PCA accuracy: {pca_accuracy:.4f}",
    f"Accuracy difference: {(pca_accuracy - original_accuracy):.4f}",
    sep="\n",
)


Training speedup with PCA: 0.43x
Prediction speedup with PCA: 1.99x
Total time without PCA: 5.03 seconds
Total time with PCA (including transformation): 11.68 seconds
Total speedup: 0.43x

Original accuracy: 0.9675
PCA accuracy: 0.9506
Accuracy difference: -0.0169
