In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("TaxiDecisionTree")
    .getOrCreate()
)

# Read the CSV under /tmp: the first line supplies the column names and
# the column types are inferred from the data
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("/tmp/2019-04.csv")

# Preview the first five rows
df.show(n=5)


+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       1| 2019-04-01 00:04:09|  2019-04-01 00:06:35|              1|          0.5|         1|                 N|         239|         239|           1|        4.0|  3.0|    0.5|       1.0|         0.0|                  0.3

In [2]:
# Project the columns used for modelling, casting each one to double so they
# can be assembled into a numeric feature vector later on. The pickup and
# drop-off location ID columns are renamed to "pickup" / "dropoff"
# (cast first, then alias, so the alias is the outermost expression).
model_input = df.select(
    col("passenger_count").cast("double"),
    col("PULocationID").cast("double").alias("pickup"),
    col("DOLocationID").cast("double").alias("dropoff"),
    col("total_amount").cast("double"),
)

# Split the dataset into training (80%) and testing (20%) sets using a fixed seed
trainDF, testDF = model_input.randomSplit([0.8, 0.2], seed=42)

# DataFrames are lazy: without caching, every action on a split (the two
# counts below, and any later fit / transform / evaluate) would re-read and
# re-parse the CSV and redo the sampling. Mark both splits for caching so the
# first count materialises them once and later actions reuse the result.
trainDF.cache()
testDF.cache()

# Show confirmation (these counts also populate the caches)
print(f"Training size: {trainDF.count()}")
print(f"Testing size: {testDF.count()}")


Training size: 5946633
Testing size: 1486506


In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Columns that are packed into the model's feature vector.
# NOTE(review): "pickup" / "dropoff" look like location IDs that are being fed
# in as continuous numbers — confirm whether they should be treated as
# categorical features instead.
feature_columns = ["passenger_count", "pickup", "dropoff"]

# Stage 1: combine the feature columns into a single vector column
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Stage 2: decision-tree regressor that reads the assembled vector and
# learns to predict total_amount
dtr = DecisionTreeRegressor(
    labelCol="total_amount",
    featuresCol=assembler.getOutputCol(),
)

# Chain both stages so they are fitted and applied together
pipeline = Pipeline(stages=[assembler, dtr])


In [4]:
# Fit every pipeline stage on the training split; the result is the fitted
# pipeline model that is used for prediction afterwards
model = pipeline.fit(dataset=trainDF)


In [5]:
# Apply the fitted pipeline to the held-out split; this adds a "prediction" column
predictions = model.transform(testDF)

# Preview 10 rows: the input features, the actual total_amount and the
# predicted value side by side
preview_columns = ["passenger_count", "pickup", "dropoff", "total_amount", "prediction"]
predictions.select(*preview_columns).show(n=10)


+---------------+------+-------+------------+------------------+
|passenger_count|pickup|dropoff|total_amount|        prediction|
+---------------+------+-------+------------+------------------+
|            0.0|   1.0|    1.0|       103.3|27.742775399212995|
|            0.0|   4.0|    4.0|         6.8|27.742775399212995|
|            0.0|   4.0|   33.0|       31.55|15.891463655567376|
|            0.0|   4.0|   79.0|         7.8|15.891463655567376|
|            0.0|   4.0|  107.0|        11.8|15.891463655567376|
|            0.0|   4.0|  144.0|        11.3|17.810129891664225|
|            0.0|   4.0|  234.0|        11.0|17.810129891664225|
|            0.0|   7.0|  121.0|        28.8|17.810129891664225|
|            0.0|   7.0|  223.0|         6.8|17.810129891664225|
|            0.0|   7.0|  223.0|         8.3|17.810129891664225|
+---------------+------+-------+------------+------------------+
only showing top 10 rows



In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the "prediction" column against the true
# "total_amount" label using root mean squared error
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="total_amount",
)

# Compute the RMSE over the test-set predictions and report it to two decimals
rmse = evaluator.evaluate(predictions)
rmse_report = f"Root Mean Squared Error (RMSE): {rmse:.2f}"
print(rmse_report)


Root Mean Squared Error (RMSE): 324.89
