In [1]:
import time
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Benchmark: compare a random forest trained on the raw MNIST pixels with one
# trained on a 50-component PCA projection, recording the fit time and the
# test-set accuracy of each.


def _timed_fit(estimator, features, labels):
    """Fit ``estimator`` in place and return the elapsed fit time in seconds.

    ``time.perf_counter`` is used rather than ``time.time`` because it is a
    monotonic, high-resolution clock intended for measuring intervals, so the
    result cannot be skewed by system clock adjustments during a long fit.
    """
    started = time.perf_counter()
    estimator.fit(features, labels)
    return time.perf_counter() - started


# Load the MNIST dataset
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist.data, mnist.target

# Split into training and test sets (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train RandomForestClassifier without PCA
rf = RandomForestClassifier(n_estimators=100, random_state=42)
train_time_without_pca = _timed_fit(rf, X_train, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test)
accuracy_without_pca = accuracy_score(y_test, y_pred)

# Apply PCA (reduce to 50 components). The projection is fitted on the
# training split only and then reused for the test split, so no test data
# influences the fitted components.
pca = PCA(n_components=50, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train RandomForestClassifier with PCA.
# NOTE: the measured time covers only the forest fit; the PCA fit/transform
# above is deliberately outside the timed region.
rf_pca = RandomForestClassifier(n_estimators=100, random_state=42)
train_time_with_pca = _timed_fit(rf_pca, X_train_pca, y_train)

# Predict and evaluate
y_pred_pca = rf_pca.predict(X_test_pca)
accuracy_with_pca = accuracy_score(y_test, y_pred_pca)

# Final expression of the cell: shown as
# (fit seconds without PCA, accuracy without PCA,
#  fit seconds with PCA, accuracy with PCA).
train_time_without_pca, accuracy_without_pca, train_time_with_pca, accuracy_with_pca


(57.7534441947937, 0.9672857142857143, 91.24002575874329, 0.9527142857142857)