In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, unix_timestamp, hour, dayofweek, month, udf, when
from pyspark.sql.types import FloatType
import math

# Initialize Spark Session
spark = SparkSession.builder.appName("Taxi Fare Prediction").getOrCreate()


data = spark.read.csv('gs://228bucket/processed_train_2.0.csv/processed_dataset_2.0.csv', header=True, inferSchema=True)


# Convert datetime and extract features
data = data.withColumn("pickup_datetime", unix_timestamp(col("pickup_datetime"), "yyyy-MM-dd'T'HH:mm:ss.SSSX").cast("timestamp"))
data = data.withColumn("pickup_hour", hour(col("pickup_datetime")))
data = data.withColumn("day_of_week", dayofweek(col("pickup_datetime")))
data = data.withColumn("month", month(col("pickup_datetime")))

# Define the distance calculation function
def calculate_distance(lat1, lon1, lat2, lon2):
    R = 6371.0  # Radius of the Earth in kilometers
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return float(R * c)

udf_calculate_distance = udf(calculate_distance, FloatType())
data = data.withColumn("distance", udf_calculate_distance(col("pickup_latitude"), col("pickup_longitude"), col("dropoff_latitude"), col("dropoff_longitude")))

# Create a new column 'is_weekend' where 1 if the day is Saturday (7) or Sunday (1), otherwise 0
data = data.withColumn("is_weekend", when(col("day_of_week").isin([7, 1]), 1).otherwise(0))

# One-hot encoding for 'day_of_week'
day_of_week_indexer = StringIndexer(inputCol="day_of_week", outputCol="day_of_week_index")
day_of_week_encoder = OneHotEncoder(inputCols=["day_of_week_index"], outputCols=["day_of_week_encoded"])

24/05/15 16:18:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [3]:
# Register the DataFrame as a SQL temporary view
data.createOrReplaceTempView("taxi_data")


In [4]:
# Print the schema of the DataFrame
spark.sql("DESCRIBE taxi_data").show()

# Display summary statistics for all numerical columns
spark.sql("DESCRIBE TABLE EXTENDED taxi_data").show()


+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|      fare_amount|   double|   NULL|
|  pickup_datetime|timestamp|   NULL|
| pickup_longitude|   double|   NULL|
|  pickup_latitude|   double|   NULL|
|dropoff_longitude|   double|   NULL|
| dropoff_latitude|   double|   NULL|
|  passenger_count|      int|   NULL|
|      pickup_hour|      int|   NULL|
|      day_of_week|      int|   NULL|
|            month|      int|   NULL|
|         distance|    float|   NULL|
|       is_weekend|      int|   NULL|
+-----------------+---------+-------+

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|      fare_amount|   double|   NULL|
|  pickup_datetime|timestamp|   NULL|
| pickup_longitude|   double|   NULL|
|  pickup_latitude|   double|   NULL|
|dropoff_longitude|   double|   NULL|
| dropoff_latitude|   double|   NULL|
|  passenger_count|      int|   NULL|
|      pick

## Distribution of Numerical Features

In [5]:
# Get summary statistics for 'distance' and 'fare_amount'
spark.sql("SELECT MIN(distance), MAX(distance), AVG(distance), STDDEV(distance) FROM taxi_data").show()
spark.sql("SELECT MIN(fare_amount), MAX(fare_amount), AVG(fare_amount), STDDEV(fare_amount) FROM taxi_data").show()


                                                                                

+-------------+-------------+------------------+------------------+
|min(distance)|max(distance)|     avg(distance)|  stddev(distance)|
+-------------+-------------+------------------+------------------+
|          0.0|      19688.8|20.420034621872315|375.50659528892106|
+-------------+-------------+------------------+------------------+





+----------------+----------------+------------------+-------------------+
|min(fare_amount)|max(fare_amount)|  avg(fare_amount)|stddev(fare_amount)|
+----------------+----------------+------------------+-------------------+
|             0.0|          1564.5|11.460607645597454|   9.89161559220771|
+----------------+----------------+------------------+-------------------+



                                                                                

In [6]:
## Weekend vs. Weekday Analysis

In [7]:
# Statistics for weekends and weekdays
spark.sql("""
SELECT 
    is_weekend,
    COUNT(*) as total_rides,
    AVG(fare_amount) as average_fare,
    AVG(distance) as average_distance,
    MIN(fare_amount) as min_fare,
    MAX(fare_amount) as max_fare,
    MIN(distance) as min_distance,
    MAX(distance) as max_distance
FROM 
    taxi_data
GROUP BY 
    is_weekend
ORDER BY 
    is_weekend
""").show()




+----------+-----------+------------------+------------------+--------+--------+------------+------------+
|is_weekend|total_rides|      average_fare|  average_distance|min_fare|max_fare|min_distance|max_distance|
+----------+-----------+------------------+------------------+--------+--------+------------+------------+
|         0|   30688693| 11.48504679036261|20.751315419639518|     0.0|  1021.3|         0.0|   14010.494|
|         1|   12137800|11.398816760038915| 19.58243845951397|     0.0|  1564.5|         0.0|     19688.8|
+----------+-----------+------------------+------------------+--------+--------+------------+------------+



                                                                                

In [8]:
## Detailed Day Analysis

In [9]:
# Ride statistics by day of the week
spark.sql("""
SELECT 
    day_of_week,
    COUNT(*) as total_rides,
    AVG(fare_amount) as average_fare,
    AVG(distance) as average_distance,
    MIN(fare_amount) as min_fare,
    MAX(fare_amount) as max_fare,
    MIN(distance) as min_distance,
    MAX(distance) as max_distance
FROM 
    taxi_data
GROUP BY 
    day_of_week
ORDER BY 
    day_of_week
""").show()




+-----------+-----------+------------------+------------------+--------+--------+------------+------------+
|day_of_week|total_rides|      average_fare|  average_distance|min_fare|max_fare|min_distance|max_distance|
+-----------+-----------+------------------+------------------+--------+--------+------------+------------+
|          1|    5639700|11.746857809813179|19.803206929371882|     0.0|   500.0|         0.0|   12594.705|
|          2|    5517934|11.496004223683046|20.841823721019924|     0.0|   500.0|         0.0|   10833.611|
|          3|    5998528| 11.30810133086045|20.671720521808535|     0.0|  1021.3|         0.0|   13642.391|
|          4|    6229111|11.430351493175094|20.820634117768762|     0.0|  651.07|         0.0|   14010.494|
|          5|    6380151|11.632963298204645|20.935260657605856|     0.0|   900.0|         0.0|   10003.499|
|          6|    6562969|11.545678391899033|20.503354646222494|     0.0|   668.5|         0.0|   13912.815|
|          7|    6498100|11.

[Stage 64:>                                                         (0 + 1) / 1]                                                                                

In [10]:
## Monthly Trends

In [11]:
# Monthly ride statistics
spark.sql("""
SELECT 
    month,
    COUNT(*) as total_rides,
    AVG(fare_amount) as average_fare,
    AVG(distance) as average_distance
FROM 
    taxi_data
GROUP BY 
    month
ORDER BY 
    month
""").show()




+-----+-----------+------------------+------------------+
|month|total_rides|      average_fare|  average_distance|
+-----+-----------+------------------+------------------+
|    1|    3572234|10.712232205393574|16.185484993761705|
|    2|    3349804|10.821751397992696|19.558332194151696|
|    3|    3789413|11.109660622899193| 19.25480646131086|
|    4|    3725183|11.243589198168834|19.585699783740385|
|    5|    3820850|11.558851347214029|20.038573508940722|
|    6|    3645118|11.533138408688615| 21.33160607122314|
|    7|    3505655|11.320181780580763|22.856694370016363|
|    8|    3254499| 11.46909976620101|25.027901898194685|
|    9|    3504697|12.026139945907673|20.717392173954508|
|   10|    3672239|11.962919317071464|19.889687187285663|
|   11|    3480154|11.857780172947088|20.245097873822775|
|   12|    3506647|11.907693429079835|  20.8898482782378|
+-----+-----------+------------------+------------------+



[Stage 67:>                                                         (0 + 1) / 1]                                                                                

In [12]:
## Average Distance Traveled per Month

In [13]:
# Average distance traveled per month
spark.sql("""
SELECT 
    month,
    AVG(distance) as average_distance
FROM 
    taxi_data
GROUP BY 
    month
ORDER BY 
    month
""").show()




+-----+------------------+
|month|  average_distance|
+-----+------------------+
|    1|  16.1854849937617|
|    2|19.558332194151692|
|    3|19.254806461310864|
|    4|19.585699783740377|
|    5|20.038573508940715|
|    6|21.331606071223135|
|    7|22.856694370016374|
|    8|25.027901898194692|
|    9| 20.71739217395451|
|   10|19.889687187285663|
|   11|20.245097873822772|
|   12|  20.8898482782378|
+-----+------------------+



24/05/15 16:33:38 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /10.128.0.5:60950 is closed
24/05/15 16:33:38 WARN BlockManagerMasterEndpoint: Error trying to remove broadcast 74 from block manager BlockManagerId(18, cluster-789c-w-0.us-central1-f.c.vertical-kayak-423108-t5.internal, 43387, None)
java.io.IOException: Connection from /10.128.0.5:60950 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:147) ~[spark-network-common_2.12-3.5.0.jar:3.5.0]
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117) ~[spark-network-common_2.12-3.5.0.jar:3.5.0]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:305) ~[netty-transport-4.1.100.Final.jar:4.1.100.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:281) ~[netty-tr

In [14]:
## Average Distance Traveled per Week

In [15]:
# Assuming 'pickup_datetime' can be used to extract week of the year
spark.sql("""
SELECT 
    weekofyear(pickup_datetime) as week_of_year,
    AVG(distance) as average_distance
FROM 
    taxi_data
GROUP BY 
    week_of_year
ORDER BY 
    week_of_year
""").show()




+------------+------------------+
|week_of_year|  average_distance|
+------------+------------------+
|           1| 16.69180027085385|
|           2| 16.50167219568823|
|           3|15.660032904287828|
|           4|16.484110867833095|
|           5|19.107593688076978|
|           6| 19.65414048830177|
|           7|19.643598056514982|
|           8| 19.96502899361394|
|           9|20.124095467551744|
|          10|19.188544097616877|
|          11| 18.95373901160854|
|          12|18.531580179701585|
|          13|19.437887565315375|
|          14|19.194391544324226|
|          15| 19.33432891900656|
|          16| 19.64735716401732|
|          17|19.403530474292314|
|          18|19.173139839411043|
|          19| 19.90196610267479|
|          20| 20.06397784067553|
+------------+------------------+
only showing top 20 rows



                                                                                