In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.ml.regression import GBTRegressionModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.sql.functions import col, unix_timestamp, dayofweek, hour, month, udf
from pyspark.sql.types import FloatType
import math

In [2]:
# Obtain the Spark session for this notebook (application name "Demo7");
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Demo7")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/16 03:06:59 INFO SparkEnv: Registering MapOutputTracker
24/05/16 03:06:59 INFO SparkEnv: Registering BlockManagerMaster
24/05/16 03:06:59 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/05/16 03:06:59 INFO SparkEnv: Registering OutputCommitCoordinator


In [3]:
# Load the processed trips from CSV. inferSchema=True gives typed columns
# (double / timestamp / integer, see the printed schema) instead of all-strings.
data = spark.read.csv('gs://228bucket/processed_train_2.0.csv/processed_dataset_2.0.csv', header=True, inferSchema=True)

# Draw a ~10% sample without replacement to act as the demo test set.
# The seed is pinned so a Restart & Run All draws the same rows again; without it
# every run produced a different test set and therefore different predictions.
# NOTE(review): the source path contains "train" -- if the saved models were fitted on
# this same file, the sample overlaps their training data. Confirm before reading the
# predictions as an out-of-sample evaluation.
test_data = data.sample(withReplacement=False, fraction=0.10, seed=42)

# Show some information about the test data
print("Test Data Schema:")
test_data.printSchema()

print("Number of Rows in Test Data:", test_data.count())

                                                                                

Test Data Schema:
root
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)





Number of Rows in Test Data: 4284701


                                                                                

In [5]:
# Persist the sampled rows as the demo test set (Spark writes a directory of part files).
# mode="overwrite" makes the cell re-runnable: the default mode raises as soon as the
# target path already exists, which broke every run after the first one.
test_data.write.csv('gs://228bucket/test_demo_data1.csv', header=True, mode="overwrite")

                                                                                

In [6]:
# Read the saved demo test set back from storage, taking column names from the
# header row and letting Spark infer the column types, then preview five rows.
test_data = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load('gs://228bucket/test_demo_data1.csv')
)
test_data.show(n=5)

                                                                                

+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|
+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|       12.0|2014-07-10 11:04:00|      -73.995332|      40.739397|       -73.969153|        40.76269|              1|
|        7.3|2012-06-02 00:10:29|      -73.988829|      40.749249|        -74.00024|       40.728748|              4|
|        4.1|2011-06-21 18:45:00|       -74.00273|      40.749548|       -73.997283|       40.757093|              1|
|       14.5|2010-01-31 19:50:59|      -73.949526|      40.772658|       -73.991168|       40.748733|              1|
|       12.1|2011-01-22 02:13:00|      -73.963807|      40.710655|       -73.960502|       40.675922|              2|
+-----------+-------------------+----------------+------

In [7]:
 def calculate_distance(lat1, lon1, lat2, lon2):
     """Return the haversine (great-circle) distance in kilometers between two points.

     Args:
         lat1, lon1: Latitude and longitude of the first point, in degrees.
         lat2, lon2: Latitude and longitude of the second point, in degrees.

     Returns:
         The distance as a float in kilometers, 0.0 when both points are identical,
         or None when any coordinate is None (so a null input row yields a null
         distance instead of crashing the UDF with a TypeError).
     """
     # Nullable input columns reach a Python UDF as None; math.radians(None) would raise.
     if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
         return None

     if lat1 == lat2 and lon1 == lon2:
         return 0.0

     R = 6371.0  # Radius of the Earth in kilometers
     lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))  # Convert degrees to radians
     dlat = lat2 - lat1
     dlon = lon2 - lon1
     a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
     # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points, which would
     # make sqrt(1 - a) raise ValueError; clamp it back into the valid range.
     a = min(1.0, max(0.0, a))
     c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
     return float(R * c)

In [8]:
def process_data(test_data, scaler_model=None):
    """Derive the model features from raw trip rows and min-max scale them.

    Adds the columns ``pickup_hour``, ``day_of_week``, ``month`` and ``distance``,
    assembles them with the passenger count and the four coordinates into a
    ``features`` vector, and applies a min-max scaler to that vector.

    Args:
        test_data: Spark DataFrame containing ``pickup_datetime``,
            ``passenger_count`` and the pickup/dropoff latitude/longitude columns.
        scaler_model: Optional already-fitted scaler model that reads ``features``
            and writes ``scaledFeatures`` (e.g. the one fitted at training time).
            When omitted, a new MinMaxScaler is fitted on ``test_data`` itself,
            which is what this function has always done.

    Returns:
        The input DataFrame with the derived columns, ``features`` and the
        scaler's output column appended.
    """
    # Cast instead of parsing with a fixed pattern: the loaded column already shows up as
    # a timestamp ("2014-07-10 11:04:00"), so the ISO-8601 "yyyy-MM-dd'T'HH:mm:ss.SSSX"
    # pattern that used to be passed to unix_timestamp() did not describe this data.
    # A plain cast leaves timestamp input as it is and also accepts date-time strings.
    with_time = (
        test_data
        .withColumn("pickup_datetime", col("pickup_datetime").cast("timestamp"))
        .withColumn("pickup_hour", hour(col("pickup_datetime")))
        .withColumn("day_of_week", dayofweek(col("pickup_datetime")))
        .withColumn("month", month(col("pickup_datetime")))
    )

    # Trip length in kilometers, computed row by row through a Python UDF.
    udf_calculate_distance = udf(calculate_distance, FloatType())
    with_distance = with_time.withColumn(
        "distance",
        udf_calculate_distance(
            col("pickup_latitude"),
            col("pickup_longitude"),
            col("dropoff_latitude"),
            col("dropoff_longitude"),
        ),
    )

    # Order is kept exactly as it was -- presumably it matches the feature order the
    # saved models were trained with; verify before changing it.
    feature_columns = ["passenger_count", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "distance", "pickup_hour", "day_of_week", "month"]

    # Assemble the feature columns into a single vector column
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    assembled = assembler.transform(with_distance)

    if scaler_model is None:
        # No fitted scaler supplied: fit one on the data being scored (original behaviour).
        # NOTE(review): the resulting min/max come from this test set, not from the
        # training set. If the models consume scaledFeatures, pass the training-time
        # scaler via `scaler_model` so both sides use the same scale -- confirm how the
        # models were trained.
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
        scaler_model = scaler.fit(assembled)

    # Apply the scaler to the test data
    return scaler_model.transform(assembled)

# Build the feature frame (derived columns plus "features" / "scaledFeatures")
# that the random-forest and GBT cells below pass to their models.
scaled_test_data = process_data(test_data)

                                                                                

In [9]:
## Random forest

In [11]:
# Restore the persisted random-forest regressor from cloud storage
rf_model_path = "gs://228bucket/Models/rf_model"
rf_model = RandomForestRegressionModel.load(path=rf_model_path)

# Score the prepared test rows with the random forest
rf_predictions = rf_model.transform(dataset=scaled_test_data)

# Preview the first five scored rows
print("RandomForestRegressor Predictions:")
rf_predictions.show(n=5)

                                                                                

RandomForestRegressor Predictions:


[Stage 28:>                                                         (0 + 1) / 1]

+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+-----------+-----+---------+--------------------+--------------------+------------------+
|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|pickup_hour|day_of_week|month| distance|            features|      scaledFeatures|        prediction|
+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+-----------+-----+---------+--------------------+--------------------+------------------+
|2014-07-10 11:04:00|      -73.995332|      40.739397|       -73.969153|        40.76269|              1|         11|          5|    7| 3.401677|[1.0,-73.995332,4...|[0.0,0.3171129185...|17.883390490285027|
|2012-06-02 00:10:29|      -73.988829|      40.749249|        -74.00024|       40.728748|              4|          0|          7|    6| 2.474042|[4.0,-73.988829,4...|[0.014

                                                                                

In [14]:
# Wider preview of the random-forest output: first twenty scored rows
rf_predictions.show(n=20)

+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+-----------+-----+---------+--------------------+--------------------+------------------+
|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|pickup_hour|day_of_week|month| distance|            features|      scaledFeatures|        prediction|
+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+-----------+-----+---------+--------------------+--------------------+------------------+
|2014-07-10 11:04:00|      -73.995332|      40.739397|       -73.969153|        40.76269|              1|         11|          5|    7| 3.401677|[1.0,-73.995332,4...|[0.0,0.3171129185...|17.883390490285027|
|2012-06-02 00:10:29|      -73.988829|      40.749249|        -74.00024|       40.728748|              4|          0|          7|    6| 2.474042|[4.0,-73.988829,4...|[0.014

[Stage 38:>                                                         (0 + 1) / 1]                                                                                

In [12]:
## GBT Model

In [13]:
# Restore the persisted gradient-boosted-trees regressor from cloud storage
gbt_model_path = "gs://228bucket/Models/gbt_model"
gbt_model = GBTRegressionModel.load(path=gbt_model_path)

# Score the same prepared test rows with the GBT model
gbt_predictions = gbt_model.transform(dataset=scaled_test_data)

# Preview the first five scored rows
print("GBTRegressor Predictions:")
gbt_predictions.show(n=5)

                                                                                

GBTRegressor Predictions:
+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+-----------+-----+---------+--------------------+--------------------+------------------+
|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|pickup_hour|day_of_week|month| distance|            features|      scaledFeatures|        prediction|
+-------------------+----------------+---------------+-----------------+----------------+---------------+-----------+-----------+-----+---------+--------------------+--------------------+------------------+
|2014-07-10 11:04:00|      -73.995332|      40.739397|       -73.969153|        40.76269|              1|         11|          5|    7| 3.401677|[1.0,-73.995332,4...|[0.0,0.3171129185...|  36.9656465475079|
|2012-06-02 00:10:29|      -73.988829|      40.749249|        -74.00024|       40.728748|              4|          0|          7|    6| 2.474042|[

[Stage 37:>                                                         (0 + 1) / 1]                                                                                