 Prediction with Decision Trees

Stage 1: Data Preparation and Feature Engineering

In [None]:
import pyspark
import os
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Point both the PySpark workers and the driver at the interpreter that is
# running this notebook, so they cannot pick up a different Python.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Create (or reuse) the session, requesting 16g of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .appName('chapter_4')
    .getOrCreate()
)

# The CSV is read without a header row; column types are inferred by Spark.
data_without_header = (
    spark.read
    .option("inferSchema", True)
    .option("header", False)
    .csv("data/covtype.data")
)

from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

# Names for the positional CSV columns, in file order: ten named features,
# 4 wilderness-area and 40 soil-type indicator columns, then the label.
colnames = (
    ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
     "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
     "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
     "Horizontal_Distance_To_Fire_Points"]
    + [f"Wilderness_Area_{i}" for i in range(4)]
    + [f"Soil_Type_{i}" for i in range(40)]
    + ["Cover_Type"]
)

# Attach the names and cast the label column to double.
data = data_without_header.toDF(*colnames).withColumn(
    "Cover_Type", col("Cover_Type").cast(DoubleType())
)

# Seed the 90/10 split so the train/test partition -- and therefore every
# metric computed later -- is repeatable, matching the seed=1234 used by the
# estimators in this notebook.
train_data, test_data = data.randomSplit([0.9, 0.1], seed=1234)

# Both splits are reused many times below; keep them cached.
train_data.cache()
test_data.cache()

Stage 2: Feature Transformation and Model Building

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Every column except the trailing label is a model input.
input_cols = colnames[:-1]

# Pack the input columns into the single vector column the classifier reads.
vector_assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="featureVector",
)
assembled_train_data = vector_assembler.transform(train_data)

# Decision tree with library-default hyperparameters and a fixed seed.
classifier = DecisionTreeClassifier(
    seed=1234,
    labelCol="Cover_Type",
    featuresCol="featureVector",
    predictionCol="prediction",
)
model = classifier.fit(assembled_train_data)

import pandas as pd

# Feature importances labelled by input column, most important first.
# (As a bare expression this is displayed only when run as a notebook cell.)
pd.DataFrame(
    {"importance": model.featureImportances.toArray()},
    index=input_cols,
).sort_values(by="importance", ascending=False)

# NOTE: these predictions are made on the same rows the tree was fitted on,
# so metrics computed from them describe training fit, not generalisation.
predictions = model.transform(assembled_train_data)

Stage 3: Model Evaluation and Metrics Calculation

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator is reused for both metrics; only its metric name is switched.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
)

evaluator.setMetricName("accuracy")
accuracy = evaluator.evaluate(predictions)

evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(predictions)

# Confusion matrix: one row per actual Cover_Type, one column per predicted
# value 1..7. Label/prediction pairs that never occur are filled with 0.
confusion_matrix = (
    predictions
    .groupBy("Cover_Type")
    .pivot("prediction", range(1, 8))
    .count()
    .na.fill(0.0)
    .orderBy("Cover_Type")
)
confusion_matrix.show()

Stage 4: Pipeline Creation and Hyperparameter Tuning

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Chain feature assembly and the decision tree so they are tuned as one unit.
pipeline = Pipeline().setStages([vector_assembler, classifier])

# 2 x 2 x 2 x 2 = 16 hyperparameter combinations to try.
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ["gini", "entropy"])
    .addGrid(classifier.maxDepth, [1, 20])
    .addGrid(classifier.maxBins, [40, 300])
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .build()
)

# Candidates are ranked by accuracy.
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy",
)

# Each candidate is fitted on 90% of the rows passed to fit() and scored on
# the remaining 10%.
validator = TrainValidationSplit(
    seed=1234,
    estimator=pipeline,
    evaluator=multiclassEval,
    estimatorParamMaps=paramGrid,
    trainRatio=0.9,
)

validator_model = validator.fit(train_data)

Stage 5: Model Selection and Testing

In [None]:
# The winning pipeline; stage 1 is the fitted decision tree (stage 0 is the
# vector assembler), so its param map shows the selected hyperparameters.
best_model = validator_model.bestModel
best_model.stages[1].extractParamMap()

# Pair each validation metric with the parameter map that produced it.
# `metrics` is a copy so that sorting it below does not reorder the list
# stored on validator_model.
metrics = list(validator_model.validationMetrics)
params = validator_model.getEstimatorParamMaps()
metrics_and_params = list(zip(metrics, params))

# Best-scoring combinations first.
metrics_and_params.sort(key=lambda x: x[0], reverse=True)
metrics.sort(reverse=True)
print(metrics[0])

# Accuracy of the selected pipeline on the held-out test split.
multiclassEval.evaluate(best_model.transform(test_data))

Stage 6: Reversing the One-Hot Encoding of Wilderness Area and Soil Type

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def unencode_one_hot(data):
    """Collapse the one-hot indicator columns into two categorical columns.

    The four ``Wilderness_Area_{i}`` columns are replaced by one integer
    column ``wilderness`` and the forty ``Soil_Type_{i}`` columns by one
    integer column ``soil``. Each new column holds the index ``i`` of the
    indicator that was set to 1 in that row.

    Args:
        data: DataFrame containing the ``Wilderness_Area_0..3`` and
            ``Soil_Type_0..39`` columns.

    Returns:
        A new DataFrame with the 44 indicator columns dropped and the
        ``wilderness`` and ``soil`` columns added. All other columns are
        passed through unchanged.
    """
    def _hot_index(vector):
        # toArray() handles both dense and sparse vectors. A row with no
        # indicator set yields null instead of failing the whole job.
        values = vector.toArray().tolist()
        return values.index(1.0) if 1.0 in values else None

    unhot_udf = udf(_hot_index, IntegerType())

    def _collapse(frame, prefix, count, output_col):
        # Assemble the indicators into one vector, drop the originals, then
        # replace the vector by the position of its hot entry.
        one_hot_cols = [f"{prefix}_{i}" for i in range(count)]
        assembler = VectorAssembler(inputCols=one_hot_cols, outputCol=output_col)
        return (
            assembler.transform(frame)
            .drop(*one_hot_cols)
            .withColumn(output_col, unhot_udf(col(output_col)))
        )

    with_wilderness = _collapse(data, "Wilderness_Area", 4, "wilderness")
    return _collapse(with_wilderness, "Soil_Type", 40, "soil")

# Apply the transformation to the training split and inspect the result:
# first the resulting schema, then the row count per 'wilderness' value.
unenc_train_data = unencode_one_hot(train_data)
unenc_train_data.printSchema()
(
    unenc_train_data
    .groupBy('wilderness')
    .count()
    .show()
)

Stage 7: Random Forest Classifier with Pipeline and Hyperparameter Tuning

In [None]:
from pprint import pprint

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer

# Every column of the un-encoded data except the label is a model input.
rf_input_cols = [c for c in unenc_train_data.columns if c != "Cover_Type"]

assembler = VectorAssembler(inputCols=rf_input_cols, outputCol="featureVector")

# Mark features with at most 40 distinct values as categorical. 40 is the
# number of soil types, the largest categorical feature in this data.
indexer = VectorIndexer(maxCategories=40,
                        inputCol="featureVector", outputCol="indexedVector")

classifier_rf = RandomForestClassifier(seed=1234, labelCol="Cover_Type",
                                       featuresCol="indexedVector", predictionCol="prediction")

pipeline_rf = Pipeline().setStages([assembler, indexer, classifier_rf])

# The Stage 4 grid and validator are bound to the decision-tree pipeline, so
# the forest needs its own. The candidate values mirror the Stage 4 grid.
# maxBins must be at least the largest category count (40).
param_grid_rf = (
    ParamGridBuilder()
    .addGrid(classifier_rf.impurity, ["gini", "entropy"])
    .addGrid(classifier_rf.maxDepth, [1, 20])
    .addGrid(classifier_rf.maxBins, [40, 300])
    .addGrid(classifier_rf.minInfoGain, [0.0, 0.05])
    .build()
)

validator_rf = TrainValidationSplit(seed=1234, estimator=pipeline_rf, evaluator=multiclassEval,
                                    estimatorParamMaps=param_grid_rf, trainRatio=0.9)

validator_model_rf = validator_rf.fit(unenc_train_data)

# Stage 2 of [assembler, indexer, classifier_rf] is the fitted forest.
best_model_rf = validator_model_rf.bestModel.stages[2]

# Importance of each input column, most important first.
feature_importance_list = list(zip(rf_input_cols, best_model_rf.featureImportances.toArray()))
feature_importance_list.sort(key=lambda x: x[1], reverse=True)
pprint(feature_importance_list)

# Predict with the whole fitted pipeline: the bare forest stage needs the
# "indexedVector" column that only the assembler and indexer stages produce.
unenc_test_data = unencode_one_hot(test_data)
validator_model_rf.bestModel.transform(unenc_test_data.drop("Cover_Type")).select("prediction").show(1)