# Decision Tree Regressor using PySpark MLlib

## 1. Preparation

### Importing necessary packages

In [None]:
from pyspark.sql import SparkSession

from pyspark.mllib.feature import LabeledPoint

from pyspark.mllib.tree import DecisionTree

Let's initialize a Spark session:

In [None]:
#Application name handed to the session builder.
app_name = 'taxi_duration_mllib'

#Configure the builder first, then obtain the session from it.
builder = SparkSession.Builder()
builder = builder.appName(app_name)
spark = builder.getOrCreate()

## 2. Data preprocessing

Read input datasets for training and testing:

In [None]:
#Load both CSV files with the same reader settings:
#the first line is the header and column types are inferred from the data.
csv_options = {'header': True, 'inferSchema': True}

raw_train_data = spark.read.csv('train.csv', **csv_options)
raw_test_data = spark.read.csv('test.csv', **csv_options)

                                                                                

Parse timestamp features in the dataset:

In [None]:
#Replace `pickup_datetime` in the training data with its timestamp-typed version.
train_pickup_ts = raw_train_data['pickup_datetime'].cast('timestamp')
casted_train_data = raw_train_data.withColumn('pickup_datetime', train_pickup_ts)

#Same conversion for the testing data.
test_pickup_ts = raw_test_data['pickup_datetime'].cast('timestamp')
casted_test_data = raw_test_data.withColumn('pickup_datetime', test_pickup_ts)

Extract usable features:

In [None]:
#Feature expressions shared by the training and testing frames, so both always
#expose the same feature columns in the same order.
#The pickup timestamp is split into its date/time parts and `store_and_fwd_flag`
#is encoded as 1 when it equals "Y" and 0 otherwise.
feature_exprs = [
    'vendor_id',
    'YEAR(pickup_datetime)    AS pickup_year',
    'MONTH(pickup_datetime)   AS pickup_month',
    'DAY(pickup_datetime)     AS pickup_day',
    'HOUR(pickup_datetime)    AS pickup_hour',
    'MINUTE(pickup_datetime)  AS pickup_min',
    'SECOND(pickup_datetime)  AS pickup_sec',
    'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
    'CASE WHEN store_and_fwd_flag == "Y" THEN 1 ELSE 0 END AS store_and_fwd_flag',
]

#Training frame: the features followed by the `trip_duration` label as the last column.
extracted_train_df = casted_train_data.selectExpr(*feature_exprs, 'trip_duration')

#Testing frame: features only, no label column is selected.
extracted_test_df = casted_test_data.selectExpr(*feature_exprs)

Then, convert the dataset into an RDD of `LabeledPoint` objects:

In [None]:
#Convert to rdd
#Training rows end with the `trip_duration` label, so every column before it is a feature.
train_data = extracted_train_df.rdd.map(lambda row: LabeledPoint(label=row[-1], features=row[:-1]))

#Testing rows have no label column, so ALL columns are features. Slicing with
#`row[:-1]` here would silently drop `store_and_fwd_flag` and produce vectors one
#element shorter than the training ones. `-inf` is only a placeholder label.
test_data = extracted_test_df.rdd.map(lambda row: LabeledPoint(label=float('-inf'), features=list(row)))

## 3. Model training and testing

In [None]:
#Hold out part of the labelled data so the model can be evaluated on rows it has not seen.

###Tunable parameter: proportion of labelled rows used for training.
train_size = 0.8
###

#The remaining proportion goes to the hold-out set.
split_weights = [train_size, 1 - train_size]

#A fixed seed keeps the split reproducible between runs.
train, test = train_data.randomSplit(split_weights, seed=24)

In [15]:
#No feature is declared categorical: an empty map makes MLlib treat every feature as continuous.
no_categorical_features = {}
model = DecisionTree.trainRegressor(train, categoricalFeaturesInfo=no_categorical_features)

                                                                                

In [16]:
#Score the hold-out set: strip the labels and pass only the feature vectors to the model.
test_features = test.map(lambda point: point.features)
predictions = model.predict(test_features)

In [17]:
#Pair each true label with the prediction at the same position: (label, prediction).
test_labels = test.map(lambda point: point.label)
label_pred = test_labels.zip(predictions)

In [18]:
#Peek at the first (true label, predicted value) pair as a quick sanity check.
label_pred.first()

                                                                                

(663.0, 805.3417864433612)