In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressionModel

# Obtain the notebook's Spark session: getOrCreate() reuses an already-running
# session if one exists, otherwise it starts a new one under this app name.
spark = (
    SparkSession.builder
    .appName("DropOffCoordinatesDemo")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/16 01:58:08 INFO SparkEnv: Registering MapOutputTracker
24/05/16 01:58:09 INFO SparkEnv: Registering BlockManagerMaster
24/05/16 01:58:09 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/05/16 01:58:09 INFO SparkEnv: Registering OutputCommitCoordinator


In [3]:
# Load the processed trip dataset from GCS. The first row is a header and
# column types are inferred from the data (this costs an extra pass over the file).
data = spark.read.csv('gs://228bucket/processed_train_2.0.csv/processed_dataset_2.0.csv', header=True, inferSchema=True)

# Create test data by sampling from the original data.
# A fixed seed makes the sample reproducible across kernel restarts; without it
# every run draws a different subset and downstream outputs cannot be compared.
# Note: `fraction` is a per-row probability, so the row count is only
# approximately 0.01% of the input.
test_data = data.sample(withReplacement=False, fraction=0.0001, seed=42)

# Show some information about the test data
print("Test Data Schema:")
test_data.printSchema()

# count() is an action: it triggers the read and the sampling.
print("Number of Rows in Test Data:", test_data.count())

                                                                                

Test Data Schema:
root
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)





Number of Rows in Test Data: 4312


                                                                                

In [5]:
# Persist the sampled test set to GCS (Spark writes a directory of part files
# at this path). mode="overwrite" replaces the output of a previous run; the
# default mode raises if the path already exists, which breaks
# "Restart Kernel -> Run All".
# NOTE(review): run this cell before the reload cell below; once `test_data`
# has been re-read from this same path, overwriting it is not safe.
test_data.write.csv('gs://228bucket/test_2demo_data.csv', header=True, mode="overwrite")

                                                                                

In [6]:
# Re-read the persisted sample so the rest of the notebook works from the
# stored copy; header row is used for column names and types are inferred.
csv_read_opts = dict(header=True, inferSchema=True)
demo_data_path = 'gs://228bucket/test_2demo_data.csv'
test_data = spark.read.csv(demo_data_path, **csv_read_opts)

# Preview the first five rows.
test_data.show(5)

                                                                                

+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|       12.9|2011-01-29 01:50:07|             0.0|            0.0|              0.0|             0.0|              1|
|       11.7|2011-07-28 00:08:28|      -73.989378|      40.741525|       -73.955272|       40.776918|              1|
|       12.1|2010-01-21 04:17:34|      -73.988655|      40.758316|        -74.01221|       40.707129|              1|
|        8.5|2013-03-30 08:58:00|       -73.95023|       40.77969|       -73.968578|       40.761727|              1|
|        5.0|2014-12-20 19:30:00|      -73.991202|      40.750822|       -73.984685|       40.742962|              1|
+-----------+-------------------+----------------+------

In [7]:
# Every name the original cell imported is still imported here (once each),
# in case later cells rely on them; date_format/hour/minute/second are new.
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import (
    col,
    date_format,
    from_unixtime,
    hour,
    minute,
    second,
    split,
    unix_timestamp,
)

pickup_ts = col('pickup_datetime')

# Keep a human-readable HH:mm:ss column for display alongside the numeric feature.
test_data = test_data.withColumn('pickup_time', date_format(pickup_ts, 'HH:mm:ss'))

# Calculate the total number of seconds since midnight directly from the
# timestamp's hour/minute/second parts (no string formatting, splitting or casts).
seconds_from_midnight = (hour(pickup_ts) * 3600) + (minute(pickup_ts) * 60) + second(pickup_ts)

# Add new column to DataFrame
test_data = test_data.withColumn('seconds_from_midnight', seconds_from_midnight)

# Feature vectors can now be assembled from the numeric columns. The column
# order below defines the layout of the 'features' vector.
# NOTE(review): presumably this must match the order used when the models
# were trained -- confirm against the training notebook.
assembler = VectorAssembler(
    inputCols=['seconds_from_midnight', 'pickup_longitude', 'pickup_latitude', 'passenger_count'],
    outputCol='features'
)
test_data = assembler.transform(test_data)
test_data.show(5)

+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+---------------------+--------------------+
|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|pickup_time|seconds_from_midnight|            features|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+---------------------+--------------------+
|       12.9|2011-01-29 01:50:07|             0.0|            0.0|              0.0|             0.0|              1|   01:50:07|                 6607|[6607.0,0.0,0.0,1.0]|
|       11.7|2011-07-28 00:08:28|      -73.989378|      40.741525|       -73.955272|       40.776918|              1|   00:08:28|                  508|[508.0,-73.989378...|
|       12.1|2010-01-21 04:17:34|      -73.988655|      40.758316|        -74.01221|       40.707129|              1|   04:17:34|      

[Stage 11:>                                                         (0 + 1) / 1]                                                                                

In [8]:
# GCS locations of the two saved random-forest regression models.
latfinal_path = "gs://228bucket/Models/latfinal"
lgfinal_path = "gs://228bucket/Models/lgfinal"

# Latitude model: load it and score the test set. transform() appends a
# 'prediction' column computed from the 'features' vector.
latfinal = RandomForestRegressionModel.load(latfinal_path)
latfinal_predictions = latfinal.transform(test_data)

# Longitude model: same procedure, kept in its own DataFrame because both
# models write to a column named 'prediction'.
lgfinal = RandomForestRegressionModel.load(lgfinal_path)
lgfinal_predictions = lgfinal.transform(test_data)

# Preview the first five scored rows from each model.
print("RandomForestRegressor Latitude Predictions:")
latfinal_predictions.show(5)

print("RandomForestRegressor Longitude Predictions:")
lgfinal_predictions.show(5)



RandomForestRegressor Latitude Predictions:


                                                                                

+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+---------------------+--------------------+------------------+
|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|pickup_time|seconds_from_midnight|            features|        prediction|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+---------------------+--------------------+------------------+
|       12.9|2011-01-29 01:50:07|             0.0|            0.0|              0.0|             0.0|              1|   01:50:07|                 6607|[6607.0,0.0,0.0,1.0]|0.1648562386645145|
|       11.7|2011-07-28 00:08:28|      -73.989378|      40.741525|       -73.955272|       40.776918|              1|   00:08:28|                  508|[508.0,-73.989378...| 40.71143871567419|
|       12.1|2010-01-21 04:17:34|      -