In [5]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession

In [6]:
# Start (or reuse) the Spark session that backs this notebook
spark = (
    SparkSession.builder
    .appName("PimaRandomForestDecisionTree")
    .getOrCreate()
)

# Load the CSV: the first row supplies the column names and Spark infers each column's type
df = spark.read.options(header=True, inferSchema=True).csv("/content/pima.csv")

# Preview the first five rows, then the inferred schema
df.show(5)
df.printSchema()

+----+----+----+----+----+----+-----+---+-----+
|preg|plas|pres|skin|test|mass| pedi|age|class|
+----+----+----+----+----+----+-----+---+-----+
|   6| 148|  72|  35|   0|33.6|0.627| 50|    1|
|   1|  85|  66|  29|   0|26.6|0.351| 31|    0|
|   8| 183|  64|   0|   0|23.3|0.672| 32|    1|
|   1|  89|  66|  23|  94|28.1|0.167| 21|    0|
|   0| 137|  40|  35| 168|43.1|2.288| 33|    1|
+----+----+----+----+----+----+-----+---+-----+
only showing top 5 rows

root
 |-- preg: integer (nullable = true)
 |-- plas: integer (nullable = true)
 |-- pres: integer (nullable = true)
 |-- skin: integer (nullable = true)
 |-- test: integer (nullable = true)
 |-- mass: double (nullable = true)
 |-- pedi: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- class: integer (nullable = true)



In [7]:
# Convention used here: the final column holds the target, every column
# before it is an input feature.
columns = df.columns
feature_columns, target_column = columns[:-1], columns[-1]
print("Target column:", target_column)

# Spark ML estimators below are configured with labelCol="label",
# so give the target column that name.
df = df.withColumnRenamed(target_column, "label")

# Pack the individual feature columns into one vector column named 'features'
# and keep only the two columns the models need.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
data = assembler.transform(df).select("features", "label")

# 70% of rows for training, 30% for testing; the seed fixes the random split
train, test = data.randomSplit(weights=[0.7, 0.3], seed=42)

Target column: class


In [8]:
##############################
# Random Forest Classifier
##############################

# Fit a 100-tree random forest on the training split; the fixed seed keeps
# the forest's random sampling reproducible between runs.
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, seed=42)
rf_model = rf.fit(train)

# Score the held-out test split. The predictions are evaluated four times
# below (AUC, precision, recall, accuracy), so cache them instead of
# re-running the forest for every metric.
rf_predictions = rf_model.transform(test).cache()

# Area under the ROC curve (the evaluator reads its default rawPrediction column).
# NOTE: evaluator_auc is reused by the Decision Tree cell further down.
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
rf_auc = evaluator_auc.evaluate(rf_predictions)
print("Random Forest AUC:", rf_auc)

# Precision and recall for the positive class (label 1.0) plus overall accuracy.
# These are computed directly on the predictions DataFrame with the
# DataFrame-based evaluator, rather than converting every row to an RDD of
# (prediction, label) pairs for the RDD-based MulticlassMetrics API.
rf_class_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricLabel=1.0
)
rf_precision = rf_class_evaluator.evaluate(
    rf_predictions, {rf_class_evaluator.metricName: "precisionByLabel"}
)
rf_recall = rf_class_evaluator.evaluate(
    rf_predictions, {rf_class_evaluator.metricName: "recallByLabel"}
)
rf_accuracy = rf_class_evaluator.evaluate(
    rf_predictions, {rf_class_evaluator.metricName: "accuracy"}
)
print("Random Forest Precision:", rf_precision)
print("Random Forest Recall:", rf_recall)
print("Random Forest Accuracy:", rf_accuracy)

# Feature importances are reported in the same order as feature_columns,
# because that is the order the VectorAssembler packed them in.
print("Random Forest Feature Importances:")
for col, imp in zip(feature_columns, rf_model.featureImportances):
    print(f"{col}: {imp}")


Random Forest AUC: 0.8238573021181713
Random Forest Precision: 0.6666666666666666
Random Forest Recall: 0.5797101449275363
Random Forest Accuracy: 0.7537688442211056
Random Forest Feature Importances:
preg: 0.07423774049802578
plas: 0.33818285120085156
pres: 0.048528078434346214
skin: 0.03609356770004231
test: 0.051652411396612724
mass: 0.16982738365242017
pedi: 0.0810134543900141
age: 0.2004645127276872


In [9]:
##############################
# Decision Tree Classifier
##############################

# Fit a single decision tree on the same training split used for the forest
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", seed=42)
dt_model = dt.fit(train)

# Score the held-out test split. The predictions are evaluated four times
# below (AUC, precision, recall, accuracy), so cache them to avoid
# recomputing the transform for every metric.
dt_predictions = dt_model.transform(test).cache()

# Area under the ROC curve, using the evaluator created in the Random Forest cell
dt_auc = evaluator_auc.evaluate(dt_predictions)
print("Decision Tree AUC:", dt_auc)

# Precision and recall for the positive class (label 1.0) plus overall accuracy.
# These are computed directly on the predictions DataFrame with the
# DataFrame-based evaluator, rather than converting every row to an RDD of
# (prediction, label) pairs for the RDD-based MulticlassMetrics API.
dt_class_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricLabel=1.0
)
dt_precision = dt_class_evaluator.evaluate(
    dt_predictions, {dt_class_evaluator.metricName: "precisionByLabel"}
)
dt_recall = dt_class_evaluator.evaluate(
    dt_predictions, {dt_class_evaluator.metricName: "recallByLabel"}
)
dt_accuracy = dt_class_evaluator.evaluate(
    dt_predictions, {dt_class_evaluator.metricName: "accuracy"}
)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree Accuracy:", dt_accuracy)

Decision Tree AUC: 0.6408026755852843
Decision Tree Precision: 0.6724137931034483
Decision Tree Recall: 0.5652173913043478
Decision Tree Accuracy: 0.7537688442211056


In [10]:
# Dump the learned tree's if/else rules (useful for debugging and interpretation).
# Features appear by index, in the order they were assembled into the vector.
print("Decision Tree Model Structure:", dt_model.toDebugString, sep="\n")

# All work is done: shut the Spark session down
spark.stop()

Decision Tree Model Structure:
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9d1a8900dc3a, depth=5, numNodes=37, numClasses=2, numFeatures=8
  If (feature 1 <= 137.5)
   If (feature 7 <= 30.5)
    If (feature 0 <= 6.5)
     Predict: 0.0
    Else (feature 0 > 6.5)
     If (feature 2 <= 15.0)
      If (feature 7 <= 29.5)
       Predict: 0.0
      Else (feature 7 > 29.5)
       Predict: 1.0
     Else (feature 2 > 15.0)
      Predict: 1.0
   Else (feature 7 > 30.5)
    If (feature 5 <= 27.35)
     If (feature 2 <= 93.0)
      Predict: 0.0
     Else (feature 2 > 93.0)
      Predict: 1.0
    Else (feature 5 > 27.35)
     If (feature 1 <= 94.5)
      If (feature 6 <= 0.764)
       Predict: 0.0
      Else (feature 6 > 0.764)
       Predict: 1.0
     Else (feature 1 > 94.5)
      If (feature 6 <= 0.558)
       Predict: 0.0
      Else (feature 6 > 0.558)
       Predict: 1.0
  Else (feature 1 > 137.5)
   If (feature 1 <= 155.5)
    If (feature 2 <= 89.0)
     If (feature 2 <= 57.0)
