The RandomForestClassifier in PySpark is a powerful ensemble learning method used for classification tasks. It operates by constructing multiple decision trees during training and combining their outputs: for classification, the predicted class is the one favored by the trees (the mode of their votes), whereas random forests used for regression return the mean of the individual trees' predictions. This approach leverages the wisdom of the crowd, combining the predictions of individual trees to make more accurate and robust classifications. Random Forests are known for their ability to handle high-dimensional data with numerous features, and they often achieve excellent performance without extensive hyperparameter tuning.

About the dataset:


*   Source: The digits dataset is a built-in dataset in the scikit-learn library. It is loaded with `from sklearn.datasets import load_digits` followed by `digits = load_digits()`.
*   Content: It's a collection of handwritten digits (0-9) represented as 8x8 grayscale images. Each image is flattened into a 64-dimensional feature vector.
*   Size: The dataset contains 1797 samples.
*   Task: It's typically used for classification tasks, where the goal is to predict the digit in an image based on its pixel values.





In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd

# Start Spark session
spark = SparkSession.builder.appName("DigitsClassification").getOrCreate()

# Load the digits dataset with scikit-learn: one "pixel_<i>" column per
# flattened pixel plus the integer "label" column.
digits = load_digits()
pixel_names = [f"pixel_{i}" for i in range(digits.data.shape[1])]
data_pd = pd.DataFrame(data=digits.data, columns=pixel_names)
data_pd["label"] = digits.target

# Convert to Spark DataFrame
data = spark.createDataFrame(data_pd)

# Assemble all pixel columns into the single vector column Spark ML expects
feature_cols = [name for name in data.columns if name.startswith("pixel_")]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", "label")

# Split into training and test sets (seeded so the split is repeatable)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Train Random Forest Classifier.
# The seed is fixed as well: without it the bootstrap/feature sampling differs
# between runs, so the reported accuracy would not be reproducible even though
# the train/test split is.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    seed=42,
)
model = rf.fit(train_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate model
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"✅ Accuracy: {accuracy * 100:.2f}%")

# Optional: Show confusion matrix
# Convert to pandas for plotting
preds_pd = predictions.select("label", "prediction").toPandas()

# Pass the full list of classes explicitly so the matrix always has one
# row/column per digit and matches display_labels, even if some digit is
# missing from the test split or is never predicted. Spark returns the
# prediction as a double, so it is cast to int to match the label dtype.
cm = confusion_matrix(
    preds_pd["label"],
    preds_pd["prediction"].astype(int),
    labels=digits.target_names,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=digits.target_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Digits Classification")
plt.show()

Visualization

In [None]:
import numpy as np
import seaborn as sns

# Per-digit accuracy: correctly classified samples of a digit divided by all
# test samples of that digit. Digits with no correct prediction at all drop
# out of the numerator, yield NaN in the division, and are reported as 0.
is_correct = preds_pd["label"] == preds_pd["prediction"]
correct = preds_pd[is_correct]
class_counts = preds_pd["label"].value_counts().sort_index()
correct_counts = correct["label"].value_counts().sort_index()
accuracy_per_class = correct_counts.div(class_counts).fillna(0)

# One bar per digit, with the y-axis pinned to the full [0, 1] accuracy range
ax = sns.barplot(x=accuracy_per_class.index, y=accuracy_per_class.values)
ax.set_xlabel("Digit")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy per Digit Class")
ax.set_ylim(0, 1)
ax.grid(axis='y')
plt.show()

In [None]:
import numpy as np

# Show a handful of test images together with their predicted and true digit.
#
# The image is rebuilt from the "features" vector of the very row whose
# label/prediction is shown. Looking the image up in digits.images by the
# pandas row index would be wrong: that index is only the row's position in
# the test-set predictions, not its index in the original dataset, so the
# picture would not correspond to the title.
rows_pd = predictions.select("features", "label", "prediction").toPandas()
sample = rows_pd.sample(min(10, len(rows_pd)), random_state=42)
image_shape = digits.images.shape[1:]  # per-image shape of the digits dataset

fig, axs = plt.subplots(2, 5, figsize=(10, 5))

# Hide the axes on every panel, including any left empty when fewer than
# ten test rows are available.
for ax in axs.flat:
    ax.axis("off")

for ax, row in zip(axs.flat, sample.itertuples(index=False)):
    # toArray() works for both dense and sparse Spark ML vectors.
    img = np.reshape(row.features.toArray(), image_shape)
    ax.imshow(img, cmap="gray")
    ax.set_title(f"Pred: {int(row.prediction)}, True: {int(row.label)}", fontsize=10)

plt.suptitle("Sample Predictions")
plt.tight_layout()
plt.show()