In [2]:
from pyspark.sql import SparkSession

from pyspark.mllib.feature import LabeledPoint

from pyspark.mllib.tree import DecisionTree

In [None]:
# Start (or reuse) the Spark session for this notebook, registered under the
# application name 'taxi_duration_mllib'.
builder = SparkSession.Builder()
builder = builder.appName('taxi_duration_mllib')
spark = builder.getOrCreate()

In [5]:
# Load the training CSV. The first line is used as the header and Spark is
# asked to infer each column's type from the file contents.
csv_reader = spark.read.options(header=True, inferSchema=True)
raw_data = csv_reader.csv('train.csv')

                                                                                

In [6]:
# Force both trip timestamps to the timestamp type; every other column of
# raw_data is carried over unchanged.
timestamp_columns = ('pickup_datetime', 'dropoff_datetime')
casted_data = raw_data.withColumns(
    {name: raw_data[name].cast('timestamp') for name in timestamp_columns}
)

In [7]:
# Build the modelling table:
#   * each timestamp is broken into year / month / day / hour / minute / second,
#   * the numeric trip columns are passed through as they are,
#   * `store_and_fwd_flag` is encoded as 1 for "Y" and 0 for anything else,
#   * `trip_duration` is kept as the LAST column, because the next step reads
#     the label by position.
#
# NOTE(review): the dropoff_* components are used as features while the label
# is trip_duration. If trip_duration is derived from dropoff - pickup, this
# leaks the target into the features -- confirm this is intended.
pickup_parts = [
    'YEAR(pickup_datetime)    AS pickup_year',
    'MONTH(pickup_datetime)   AS pickup_month',
    'DAY(pickup_datetime)     AS pickup_day',
    'HOUR(pickup_datetime)    AS pickup_hour',
    'MINUTE(pickup_datetime)  AS pickup_min',
    'SECOND(pickup_datetime)  AS pickup_sec',
]
dropoff_parts = [
    'YEAR(dropoff_datetime)   AS dropoff_year',
    'MONTH(dropoff_datetime)  AS dropoff_month',
    'DAY(dropoff_datetime)    AS dropoff_day',
    'HOUR(dropoff_datetime)   AS dropoff_hour',
    'MINUTE(dropoff_datetime) AS dropoff_min',
    'SECOND(dropoff_datetime) AS dropoff_sec',
]
passthrough_columns = [
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
flag_expr = 'CASE WHEN store_and_fwd_flag == "Y" THEN 1 ELSE 0 END AS store_and_fwd_flag'

extracted_df = casted_data.selectExpr(
    *pickup_parts,
    *dropoff_parts,
    *passthrough_columns,
    flag_expr,
    'trip_duration',
)

In [8]:
# Switch from the DataFrame to an RDD of LabeledPoint objects, the input
# format consumed by the RDD-based MLlib trainer used below.
def _to_labeled_point(row):
    """Build a LabeledPoint from one row: the last field is the label, the rest are features."""
    target = row[-1]
    feature_values = row[:-1]
    return LabeledPoint(label=target, features=feature_values)


data = extracted_df.rdd.map(_to_labeled_point)

In [11]:
# Hold out part of the data for evaluation.

### Tunable parameter: fraction of the rows assigned to the training set.
train_size = 0.8
###

# The remainder goes to the test set; the fixed seed keeps the split
# reproducible from run to run.
split_weights = [train_size, 1 - train_size]
train, test = data.randomSplit(split_weights, seed=24)

In [15]:
# Fit a regression decision tree on the training split. The empty
# categoricalFeaturesInfo map registers no feature as categorical; all other
# training options are left at the library defaults.
model = DecisionTree.trainRegressor(
    train,
    categoricalFeaturesInfo={},
)

                                                                                

In [16]:
# Score the held-out points, handing the model only their feature vectors.
test_features = test.map(lambda point: point.features)
predictions = model.predict(test_features)

In [17]:
# Pair every true label from the test split with the prediction made for it,
# giving an RDD of (label, prediction) tuples.
test_labels = test.map(lambda point: point.label)
label_pred = test_labels.zip(predictions)

In [18]:
# Show the first (label, prediction) pair as a quick sanity check.
label_pred.first()

                                                                                

(663.0, 805.3417864433612)