# Decision Tree Regressor with Pyspark's Structured API

Importing necessary packages

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, udf
from math import radians, sin, cos, sqrt, atan2
from pyspark.sql.types import DoubleType

## 1. Preparation

Let's initialize a Spark session:

In [2]:
#pyspark init
builder = SparkSession.builder\
            .appName('taxi_duration_highAPI')\
            .config("spark.driver.memory", "4g")\
            .config("spark.executor.memory", "4g")
spark = builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
25/04/11 23:18:38 WARN Utils: Your hostname, DESKTOP-0H87CFM resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/11 23:18:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 23:18:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Read input datasets for training and testing:

In [3]:
#Read input files
raw_train_data = spark.read.csv('train.csv', header=True, inferSchema=True)
raw_test_data = spark.read.csv('test.csv', header=True, inferSchema=True)

                                                                                

## 2. Data preprocessing

Parse timestamp features in the dataset:

In [4]:
#Cast the pickup string values of training data into timestamps.
casted_train_data = raw_train_data.withColumns({
                        'pickup_datetime' : raw_train_data['pickup_datetime'].cast('timestamp'),
                    })

#Cast the pickup string values of testing data into timestamps.
casted_test_data = raw_test_data.withColumns({
                        'pickup_datetime' : raw_test_data['pickup_datetime'].cast('timestamp'),
                    })

User-Defined Function (UDF)

**Tính Khoảng Cách Haversine**

The `haversine` formula is used to calculate the **geographical distance** between two points on the Earth's surface based on their longitude and latitude coordinates. This feature is added to the taxi trip duration prediction problem (`trip_duration`), as the travel time often depends on the actual distance between the pickup and drop-off locations.

**Haversine Formula**

The Haversine formula determines the distance between two points on a sphere.

Assumed that the Earth is a sphere with the mean radius $ R = 6371 \, \text{km} $, the distance between 2 locations (points) can be calculated as:

$$
a = \sin^2\left(\frac{\Delta \text{lat}}{2}\right) + \cos(\text{lat}_1) \cdot \cos(\text{lat}_2) \cdot \sin^2\left(\frac{\Delta \text{lon}}{2}\right)
$$ 

$$
\theta = 2 \cdot \text{atan2}\left(\sqrt{a}, \sqrt{1-a}\right)
$$

$$
d = R \cdot \theta
$$

Trong đó:
- $ \text{lon}_1, \text{lat}_1 $: Longitude and latitude of the pickup point.
- $ \text{lon}_2, \text{lat}_2 $: Longitude and latitude of the dropoff point.
- $ \Delta \text{lon} = \text{lon}_2 - \text{lon}_1 $: Longitude difference.
- $ \Delta \text{lat} = \text{lat}_2 - \text{lat}_1 $: Latitude difference.
- $ R = 6371 \, \text{km} $: Earth's radius.
- $ d $: Distance between pickup and dropoff points (in kilometer).
- $ a $: The "haversine" representation of the central angle between two points.
- $ \theta $: The central angle (in radian) between two points on the sphere. 
- $ d $: Multiply the central angle with Earth's radius to get the distance between pickup and dropoff points.

In [5]:
# Haversine distance function
def haversine(lon1, lat1, lon2, lat2):
    if None in (lon1, lat1, lon2, lat2) or not all(isinstance(x, (int, float)) for x in [lon1, lat1, lon2, lat2]):
        return 0.0
    R = 6371  # Earth's radius (km)
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

haversine_udf = udf(haversine, DoubleType())

Extract usable features:

In [6]:
#Get usable columns from the dataframe
#Also convert timestamps into time elements and encode `store_and_fwd_flag` feature into binary values
extracted_train_df = casted_train_data.selectExpr(
    'id',
    'vendor_id',
    'YEAR(pickup_datetime) AS pickup_year',
    'MONTH(pickup_datetime) AS pickup_month',
    'DAY(pickup_datetime) AS pickup_day',
    'HOUR(pickup_datetime) AS pickup_hour',
    'MINUTE(pickup_datetime) AS pickup_min',
    'SECOND(pickup_datetime) AS pickup_sec',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'CASE WHEN store_and_fwd_flag = "Y" THEN 1 ELSE 0 END AS store_and_fwd_flag',
    'trip_duration'
).withColumn('distance_km', haversine_udf(col('pickup_longitude'), col('pickup_latitude'), col('dropoff_longitude'), col('dropoff_latitude')))

extracted_test_df = casted_test_data.selectExpr(
    'id',
    'vendor_id',
    'YEAR(pickup_datetime) AS pickup_year',
    'MONTH(pickup_datetime) AS pickup_month',
    'DAY(pickup_datetime) AS pickup_day',
    'HOUR(pickup_datetime) AS pickup_hour',
    'MINUTE(pickup_datetime) AS pickup_min',
    'SECOND(pickup_datetime) AS pickup_sec',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'CASE WHEN store_and_fwd_flag = "Y" THEN 1 ELSE 0 END AS store_and_fwd_flag'
).withColumn('distance_km', haversine_udf(col('pickup_longitude'), col('pickup_latitude'), col('dropoff_longitude'), col('dropoff_latitude')))

In [7]:
# Removing outliers 
extracted_train_df = extracted_train_df.filter(
    (col('trip_duration') > 0) & 
    (col('trip_duration') < 36000) &  # Limited to 10 hours
    (col('passenger_count') >= 1) & 
    (col('passenger_count') <= 6) &  # Passenger limited
    (col('distance_km') > 0) & 
    (col('distance_km') < 100)  # Distance limited to 100km
)

# null value checking
print("Null value checking in extracted_train_df:")
for column in extracted_train_df.columns:
    null_count = extracted_train_df.filter(col(column).isNull()).count()
    print(f"{column}: {null_count} null value(s)")

extracted_train_df = extracted_train_df.na.drop()

Null value checking in extracted_train_df:


                                                                                

id: 0 null value(s)


                                                                                

vendor_id: 0 null value(s)


                                                                                

pickup_year: 0 null value(s)


                                                                                

pickup_month: 0 null value(s)


                                                                                

pickup_day: 0 null value(s)


                                                                                

pickup_hour: 0 null value(s)


                                                                                

pickup_min: 0 null value(s)


                                                                                

pickup_sec: 0 null value(s)
passenger_count: 0 null value(s)
pickup_longitude: 0 null value(s)
pickup_latitude: 0 null value(s)


                                                                                

dropoff_longitude: 0 null value(s)


                                                                                

dropoff_latitude: 0 null value(s)
store_and_fwd_flag: 0 null value(s)


                                                                                

trip_duration: 0 null value(s)




distance_km: 0 null value(s)


                                                                                

Encode the features into vector of values:

In [8]:
#Extract training features name

# List of features (excluding 'id' and 'trip_duration')
feature_cols = ['vendor_id', 'pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour',
                'pickup_min', 'pickup_sec', 'passenger_count', 'pickup_longitude',
                'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 
                'store_and_fwd_flag', 'distance_km']

#Use VectorAssembler to transform feature columns into a single vector
extracted_train_df.printSchema()
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Convert training data
train_data = assembler.transform(extracted_train_df).select('id', 'features', 'trip_duration')

root
 |-- id: string (nullable = true)
 |-- vendor_id: integer (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_day: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_min: integer (nullable = true)
 |-- pickup_sec: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: integer (nullable = false)
 |-- trip_duration: integer (nullable = true)
 |-- distance_km: double (nullable = true)



## 3. Model training

Split data

In [9]:
#Split data into training and testing sets

#Training set proportion parameter:
train_size = 0.8

train, validation = train_data.randomSplit([train_size, 1 - train_size], seed=24) #Fixed with seed for reproductivity

Fine-tuning

In [10]:
# Fine-tuning
param_grid = {
    'maxDepth': [5, 10, len(feature_cols) ] # try max_depth = 5, 10, and the number of features used (14)
}

best_rmse = float('inf')
best_params = None
best_model = None

estimator = DecisionTreeRegressor(
        featuresCol='features',
        labelCol='trip_duration',
    )

for maxDepth in param_grid['maxDepth']:
    estimator = estimator.setMaxDepth(maxDepth)
    model = estimator.fit(train)
    
    predictions = model.transform(validation)
    evaluator_rmse = RegressionEvaluator(labelCol='trip_duration', predictionCol='prediction', metricName='rmse')
    rmse = evaluator_rmse.evaluate(predictions)
    print(f"Params: maxDepth={maxDepth}, RMSE={rmse}")
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = {'maxDepth': maxDepth}
        best_model = model

                                                                                

Params: maxDepth=5, RMSE=422.2581908402881


                                                                                

Params: maxDepth=10, RMSE=394.636938084777


25/04/11 23:19:41 WARN DAGScheduler: Broadcasting large task binary with size 1192.7 KiB
25/04/11 23:19:42 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
25/04/11 23:19:43 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB

Params: maxDepth=14, RMSE=416.2793441799063


25/04/11 23:19:45 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
                                                                                

## 4. Model evaluation (hold-out)

In [11]:
# Perform evaluation on the best parameters configuration found
predictions = best_model.transform(validation)
evaluator_rmse = RegressionEvaluator(labelCol='trip_duration', predictionCol='prediction', metricName='rmse')
evaluator_r2 = RegressionEvaluator(labelCol='trip_duration', predictionCol='prediction', metricName='r2')
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"Best Params: {best_params}")
print(f"Best RMSE on validation set: {rmse}")
print(f"Best R2 on validation set: {r2}")
print("Feature Importances:", best_model.featureImportances)
print("Tree structure:")
print(best_model.toDebugString)



Best Params: {'maxDepth': 10}
Best RMSE on validation set: 394.636938084777
Best R2 on validation set: 0.6683495988907258
Feature Importances: (14,[0,2,3,4,5,6,7,8,9,10,11,12,13],[0.00010897221014034543,0.006201867806923184,0.0006728950126734288,0.08531017405527125,0.0009732026940546361,0.0003900861862662318,1.202626011866105e-05,0.008086778422525951,0.008828652137905672,0.010361986830373173,0.022172552985344044,6.296936981486235e-06,0.856874508461422])
Tree structure:
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_ab3058b8ff7c, depth=10, numNodes=2001, numFeatures=14
  If (feature 13 <= 4.274704389596034)
   If (feature 13 <= 1.8601739559427597)
    If (feature 13 <= 1.1556561246742878)
     If (feature 13 <= 0.8848332239994345)
      If (feature 11 <= 40.76746368408203)
       If (feature 4 <= 7.5)
        If (feature 13 <= 0.6680815430163569)
         If (feature 10 <= -73.98264694213867)
          If (feature 4 <= 1.5)
           If (feature 6 <= 7.5)
            Predict: 3

                                                                                

## 5. Prediction (test file)

In [12]:
# Perform prediction on testing set
test_data = assembler.transform(extracted_test_df).select('id', 'features')
test_predictions = model.transform(test_data).select('id', 'prediction').withColumnRenamed('prediction', 'trip_duration')

# Write file
test_predictions.coalesce(1).write.csv('prediction_highAPI.csv', header=True, mode='overwrite')

# Display some records of prediction
test_predictions.show(5)

# Close SparkSession
spark.stop()

25/04/11 23:19:51 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/04/11 23:19:51 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/04/11 23:19:54 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


+---------+------------------+
|       id|     trip_duration|
+---------+------------------+
|id3004672| 775.1091424521616|
|id3505355| 647.6641221374045|
|id1217141|481.30968410150183|
|id2150126|1142.1054545454544|
|id1598245| 389.5066948555321|
+---------+------------------+
only showing top 5 rows

