# Decision Tree Regressor using Pyspark MLLib's

Importing necessary packages

In [1]:
from pyspark.sql import SparkSession
from pyspark.mllib.feature import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql.functions import col, udf
from math import radians, sin, cos, sqrt, atan2
from pyspark.sql.types import DoubleType

## 1. Preparation

Let's initialize a Spark session:

In [2]:
builder = SparkSession.Builder().appName('taxi_duration_mllib')
spark = builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
25/04/11 20:12:29 WARN Utils: Your hostname, HP-Envy resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/11 20:12:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 20:12:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Read input datasets for training and testing:

In [3]:
#Read input files
raw_train_data = spark.read.csv('train.csv', header=True, inferSchema=True)
raw_test_data = spark.read.csv('test.csv', header=True, inferSchema=True)

                                                                                

## 2.Data preprocessing

Parse timestamp features in the dataset:

In [4]:
#Cast the pickup string values of training data into timestamps.
casted_train_data = raw_train_data.withColumns({
                        'pickup_datetime' : raw_train_data['pickup_datetime'].cast('timestamp'),
                    })

#Cast the pickup string values of testing data into timestamps.
casted_test_data = raw_test_data.withColumns({
                        'pickup_datetime' : raw_test_data['pickup_datetime'].cast('timestamp')
                    })

User-Defined Function (UDF)

In [5]:
# Hàm tính khoảng cách Haversine
def haversine(lon1, lat1, lon2, lat2):
    if None in (lon1, lat1, lon2, lat2) or not all(isinstance(x, (int, float)) for x in [lon1, lat1, lon2, lat2]):
        return 0.0
    R = 6371  # Bán kính Trái Đất (km)
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

haversine_udf = udf(haversine, DoubleType())

Extract usable features:

In [6]:
#Get usable columns from the dataframe
#Also convert timestamps into time elements and encode `store_and_fwd_flag` feature into binary values
extracted_train_df = casted_train_data.selectExpr(
        'id',
        'vendor_id',
        'YEAR(pickup_datetime)    AS pickup_year',
        'MONTH(pickup_datetime)   AS pickup_month',
        'DAY(pickup_datetime)     AS pickup_day',
        'HOUR(pickup_datetime)    AS pickup_hour',
        'MINUTE(pickup_datetime)  AS pickup_min',
        'SECOND(pickup_datetime)  AS pickup_sec',
        'passenger_count', 
        'pickup_longitude', 
        'pickup_latitude', 
        'dropoff_longitude', 
        'dropoff_latitude',
        'CASE WHEN store_and_fwd_flag == "Y" THEN 1 ELSE 0 END AS store_and_fwd_flag',
        'trip_duration'
    ).withColumn('distance_km', haversine_udf(col('pickup_longitude'), col('pickup_latitude'), col('dropoff_longitude'), col('dropoff_latitude')))

extracted_test_df = casted_test_data.selectExpr(
        'id',
        'vendor_id',
        'YEAR(pickup_datetime)    AS pickup_year',
        'MONTH(pickup_datetime)   AS pickup_month',
        'DAY(pickup_datetime)     AS pickup_day',
        'HOUR(pickup_datetime)    AS pickup_hour',
        'MINUTE(pickup_datetime)  AS pickup_min',
        'SECOND(pickup_datetime)  AS pickup_sec',
        'passenger_count', 
        'pickup_longitude', 
        'pickup_latitude', 
        'dropoff_longitude', 
        'dropoff_latitude',
        'CASE WHEN store_and_fwd_flag == "Y" THEN 1 ELSE 0 END AS store_and_fwd_flag',
    ).withColumn('distance_km', haversine_udf(col('pickup_longitude'), col('pickup_latitude'), col('dropoff_longitude'), col('dropoff_latitude')))

In [7]:
# Loại bỏ outliers
extracted_train_df = extracted_train_df.filter(
    (col('trip_duration') > 0) & 
    (col('trip_duration') < 36000) &  # Giới hạn 10 giờ
    (col('passenger_count') >= 1) & 
    (col('passenger_count') <= 6) &  # Giới hạn hành khách
    (col('distance_km') > 0) & 
    (col('distance_km') < 100)  # Giới hạn khoảng cách 100km
)

# Kiểm tra giá trị null
print("Kiểm tra giá trị null trong extracted_train_df:")
for column in extracted_train_df.columns:
    null_count = extracted_train_df.filter(col(column).isNull()).count()
    print(f"{column}: {null_count} giá trị null")

extracted_train_df = extracted_train_df.na.drop()

Kiểm tra giá trị null trong extracted_train_df:


                                                                                

id: 0 giá trị null


                                                                                

vendor_id: 0 giá trị null


                                                                                

pickup_year: 0 giá trị null


                                                                                

pickup_month: 0 giá trị null


                                                                                

pickup_day: 0 giá trị null


                                                                                

pickup_hour: 0 giá trị null


                                                                                

pickup_min: 0 giá trị null


                                                                                

pickup_sec: 0 giá trị null


                                                                                

passenger_count: 0 giá trị null


                                                                                

pickup_longitude: 0 giá trị null


                                                                                

pickup_latitude: 0 giá trị null


                                                                                

dropoff_longitude: 0 giá trị null


                                                                                

dropoff_latitude: 0 giá trị null
store_and_fwd_flag: 0 giá trị null


                                                                                

trip_duration: 0 giá trị null


[Stage 48:===>                                                    (1 + 15) / 16]

distance_km: 0 giá trị null


                                                                                

Then, convert the dataset into a RDD of `LabeledPoint` objects:

In [8]:
# Danh sách đặc trưng (giống 3.2.1)
feature_cols = ['vendor_id', 'pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour',
                'pickup_min', 'pickup_sec', 'passenger_count', 'pickup_longitude',
                'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 
                'store_and_fwd_flag', 'distance_km']

#Convert to rdd
# train_data = extracted_train_df.rdd.map(lambda row: LabeledPoint(label= row[-1], features= row[:-1]) )
# test_data = extracted_test_df.rdd.map(lambda row: LabeledPoint(label= float('-inf'), features= row[:-1]) )
def to_labeled_point(row):
    features = [float(row[col]) for col in feature_cols]
    # Ánh xạ vendor_id: 1 -> 0, 2 -> 1
    features[0] = features[0] - 1  # vendor_id ở index 0
    return LabeledPoint(row['trip_duration'], features)

train_data = extracted_train_df.rdd.map(to_labeled_point)

# Chuyển đổi tập test
def to_features_test(row):
    features = [float(row[col]) for col in feature_cols]
    features[0] = features[0] - 1  # Ánh xạ vendor_id: 1 -> 0, 2 -> 1
    return features

test_features = extracted_test_df.rdd.map(to_features_test)
test_ids = extracted_test_df.rdd.map(lambda row: row['id'])

## 3. Model training

Split data

In [9]:
# Split data into training and testing sets
# Training set proportion parameter:
train_size = 0.8

train_rdd, validation_rdd = train_data.randomSplit([train_size, 1 - train_size], seed=24) #Fixed with seed for reproductivity

# Cache RDD (Lưu train_rdd và validation_rdd vào bộ nhớ để giảm tải)
train_rdd.cache()
validation_rdd.cache()

PythonRDD[197] at RDD at PythonRDD.scala:53

Fine-tuning

In [10]:
# model = DecisionTree.trainRegressor(train, categoricalFeaturesInfo={})

# Sử dụng tham số tối ưu từ 3.2.1
best_params = {'maxDepth': 10, 'maxBins': 64, 'minInstancesPerNode': 5}

# Huấn luyện mô hình
model = DecisionTree.trainRegressor(
    train_rdd,
    categoricalFeaturesInfo={0: 2, 7: 10, 12: 2},  # vendor_id, passenger_count, store_and_fwd_flag
    impurity='variance',
    maxDepth=best_params['maxDepth'],
    maxBins=best_params['maxBins'],
    minInstancesPerNode=best_params['minInstancesPerNode']
)

25/04/11 20:13:26 WARN BlockManager: Task 290 already completed, not releasing lock for rdd_196_0
                                                                                

In [11]:
#predictions = model.predict(test.map(lambda row: row.features))

In [12]:
#label_pred = test.map(lambda row: row.label).zip(predictions)

In [13]:
#label_pred.first()

## 4. Model evaluation (hold-out)

In [14]:
# Đánh giá trên validation
predictions = model.predict(validation_rdd.map(lambda lp: lp.features))
labels_and_preds = validation_rdd.map(lambda lp: lp.label).zip(predictions)
metrics = RegressionMetrics(labels_and_preds)
rmse = metrics.rootMeanSquaredError
r2 = metrics.r2

print(f"RMSE trên validation (MLlib RDD): {rmse}")
print(f"R2 trên validation (MLlib RDD): {r2}")



RMSE trên validation (MLlib RDD): 394.66337924115203
R2 trên validation (MLlib RDD): 0.5075514363493248


                                                                                

## 5. Comparison with Structured API

In [15]:
structured_api_rmse = 382.92100639699294 # Số từ 3.2.1
structured_api_r2 =  0.68297963107283  # Số từ 3.2.1
print(f"Structured API RMSE: {structured_api_rmse}")
print(f"Structured API R2: {structured_api_r2}")
print(f"Chênh lệch RMSE: {rmse - structured_api_rmse}")
print(f"Chênh lệch R2: {r2 - structured_api_r2}")

Structured API RMSE: 382.92100639699294
Structured API R2: 0.68297963107283
Chênh lệch RMSE: 11.742372844159092
Chênh lệch R2: -0.17542819472350524


## 6. Prediction (test file)

In [16]:
# Dự đoán trên test
test_predictions = model.predict(test_features)
test_predictions_df = spark.createDataFrame(
    test_ids.zip(test_predictions),
    schema=['id', 'trip_duration']
)

# Write file
test_predictions_df.coalesce(1).write.csv("prediction_mllib.csv", header=True, mode='overwrite')

test_predictions_df.show(5)

# Đóng SparkSession
spark.stop()

                                                                                

+---------+------------------+
|       id|     trip_duration|
+---------+------------------+
|id3004672| 579.4204229607251|
|id3505355| 622.9761684987833|
|id1217141|473.06607351225205|
|id2150126|1273.8461140566403|
|id1598245| 343.8123226591001|
+---------+------------------+
only showing top 5 rows

