In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("DropOffCoordinates")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/15 22:55:08 INFO SparkEnv: Registering MapOutputTracker
24/05/15 22:55:08 INFO SparkEnv: Registering BlockManagerMaster
24/05/15 22:55:08 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/05/15 22:55:08 INFO SparkEnv: Registering OutputCommitCoordinator


In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import col, unix_timestamp, dayofweek, hour, month, udf
from pyspark.sql.types import FloatType
import math

In [4]:
# Step 2: Load the data
# The CSV has a header row; column types are inferred by Spark from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('gs://228bucket/processed_train_2.0.csv/processed_dataset_2.0.csv')
)



                                                                                

In [5]:
# Number of rows in the full dataset; count() is a Spark action, so this runs a job.
total_rows = data.count()
total_rows

                                                                                

42826493

In [16]:
# Work on a random 10% sample (no replacement); the fixed seed keeps the sample repeatable.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 44
df = data.sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)

In [14]:
# Number of rows in the sampled DataFrame.
sample_rows = df.count()
sample_rows

                                                                                

4283040

In [17]:
# Peek at the first two sampled rows to check the column layout.
df.show(n=2)

+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|        5.3|2010-07-07 14:52:00|      -73.969505|      40.784843|       -73.958732|       40.783357|              1|
|       10.5|2010-09-07 13:18:00|      -73.985382|      40.747858|       -73.978377|        40.76207|              1|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
only showing top 2 rows



In [19]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, date_format, hour, minute, second

# Feature engineering: derive time-of-day columns from `pickup_datetime` and
# pack the model inputs into a single `features` vector column on `df`.

# Remove any columns this cell produced on an earlier run. DataFrame.drop
# ignores names that are not present, so the first run is unaffected, while a
# re-run no longer fails in VectorAssembler because `features` already exists.
df = df.drop('pickup_time', 'seconds_from_midnight', 'features')

pickup_ts = col('pickup_datetime')

# Time of day as an 'HH:mm:ss' string (kept so it is visible in df.show()).
df = df.withColumn('pickup_time', date_format(pickup_ts, 'HH:mm:ss'))

# Seconds elapsed since midnight, read directly from the timestamp fields
# rather than formatting to a string, splitting on ':' and casting back.
df = df.withColumn(
    'seconds_from_midnight',
    hour(pickup_ts) * 3600 + minute(pickup_ts) * 60 + second(pickup_ts),
)

# Assemble the numeric input columns into the feature vector used for training.
assembler = VectorAssembler(
    inputCols=['seconds_from_midnight', 'pickup_longitude', 'pickup_latitude', 'passenger_count'],
    outputCol='features'
)
df = assembler.transform(df)

df.show()

[Stage 24:>                                                         (0 + 1) / 1]

+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+---------------------+--------------------+
|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|pickup_time|seconds_from_midnight|            features|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+---------------------+--------------------+
|        5.3|2010-07-07 14:52:00|      -73.969505|      40.784843|       -73.958732|       40.783357|              1|   14:52:00|                53520|[53520.0,-73.9695...|
|       10.5|2010-09-07 13:18:00|      -73.985382|      40.747858|       -73.978377|        40.76207|              1|   13:18:00|                47880|[47880.0,-73.9853...|
|        7.3|2011-06-21 16:15:00|      -73.991875|      40.754437|        -73.97723|       40.774323|              3|   16:15:00|      

                                                                                

In [20]:
# Keep only the assembled feature vector and the two regression targets.
# The projection gets its own name so that `data` (the raw DataFrame loaded
# from CSV) is not overwritten; rebinding `data` here would make the earlier
# sampling and feature-engineering cells fail on a re-run, because the
# projected frame no longer has `pickup_datetime` and the other raw columns.
model_data = df.select('features', 'dropoff_longitude', 'dropoff_latitude')

# Step 5: Split the data into training and testing sets (80/20, fixed seed)
train_data, test_data = model_data.randomSplit([0.8, 0.2], seed=42)

In [21]:
# Step 6: Train the model for longitude
# Random forest regressor (10 trees, fixed seed) predicting `dropoff_longitude`
# from the assembled `features` vector.
rf_longitude = RandomForestRegressor(
    featuresCol='features',
    labelCol='dropoff_longitude',
    numTrees=10,
    seed=42,
)
model_longitude = rf_longitude.fit(train_data)

                                                                                

In [22]:
# Score the test split with the longitude model, then compute RMSE between
# the `prediction` column and the true `dropoff_longitude`.
predictions_longitude = model_longitude.transform(test_data)
evaluator_longitude = RegressionEvaluator(
    metricName='rmse',
    labelCol='dropoff_longitude',
    predictionCol='prediction',
)
rmse_longitude = evaluator_longitude.evaluate(predictions_longitude)

                                                                                

In [23]:
# Report the longitude RMSE computed in the evaluation cell.
rmse_label_longitude = "Root Mean Squared Error (Longitude):"
print(rmse_label_longitude, rmse_longitude)

Root Mean Squared Error (Longitude): 3.500806667538185


In [24]:
from pyspark.ml.regression import RandomForestRegressor
# Persist the fitted longitude model, replacing any model already saved at this path.
model_path_lg = "gs://228bucket/Models/lgfinal"
longitude_writer = model_longitude.write().overwrite()
longitude_writer.save(model_path_lg)

                                                                                

In [25]:
# Train the model for latitude
# Same configuration as the longitude model (10 trees, fixed seed), but the
# label column is `dropoff_latitude`.
rf_latitude = RandomForestRegressor(
    featuresCol='features',
    labelCol='dropoff_latitude',
    numTrees=10,
    seed=42,
)
model_latitude = rf_latitude.fit(train_data)

24/05/15 23:25:31 WARN BlockManager: Asked to remove block broadcast_71_piece0, which does not exist
                                                                                

In [26]:
# Predicting drop-off latitude
# Score the test split, compute RMSE against `dropoff_latitude`, and print it.
predictions_latitude = model_latitude.transform(test_data)
evaluator_latitude = RegressionEvaluator(
    metricName='rmse',
    labelCol='dropoff_latitude',
    predictionCol='prediction',
)
rmse_latitude = evaluator_latitude.evaluate(predictions_latitude)
rmse_label_latitude = "Root Mean Squared Error (Latitude):"
print(rmse_label_latitude, rmse_latitude)



Root Mean Squared Error (Latitude): 2.5666384925377153


                                                                                

In [27]:
# Persist the fitted latitude model, replacing any model already saved at this path.
model_path_lat = "gs://228bucket/Models/latfinal"
latitude_writer = model_latitude.write().overwrite()
latitude_writer.save(model_path_lat)

                                                                                

In [None]:
# Stop the Spark session
# Final cell: shuts down the `spark` session created in the first cell. Cells
# above that use `spark` will need the session re-created before they can be
# run again.
spark.stop()