In [1]:
from pyspark.sql import SparkSession 

In [3]:
spark = SparkSession.builder.appName("taxi_trips").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/28 01:54:45 INFO SparkEnv: Registering MapOutputTracker
24/03/28 01:54:45 INFO SparkEnv: Registering BlockManagerMaster
24/03/28 01:54:45 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/03/28 01:54:45 INFO SparkEnv: Registering OutputCommitCoordinator


 Read the CSV File called "chicago_taxi_trips.csv" into a dataframe after uploading it to a GCS Bucket

In [4]:
df = spark.read.csv("gs://mypyspark-jm/chicago_taxi_trips.csv",header=True, inferSchema=True)

                                                                                

In [5]:
df.show(10)

+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
|             taxi_id|    trip_start_time|      trip_end_time|payment_type|trip_miles| fare|tips|trip_total|
+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
|4c64024caa0fc8132...|2023-03-01 00:00:00|2023-03-01 00:15:00|        Cash|       0.1| 11.0| 0.0|      11.0|
|9b934e34b3c5c2b3e...|2023-03-01 00:00:00|2023-03-01 00:15:00|     Unknown|       0.0|20.75| 0.0|     20.75|
|93928e06ae0917b94...|2023-03-01 00:00:00|2023-03-01 00:15:00| Credit Card|       7.2| 19.5|10.0|      34.0|
|ccbebd87160b9da51...|2023-03-01 00:00:00|2023-03-01 00:30:00|        Cash|      6.86| 22.5| 0.0|      27.5|
|ef28bdc517ee8c312...|2023-03-01 00:00:00|2023-03-01 00:30:00| Credit Card|     15.64| 39.0| 8.7|      52.2|
|51ffa6b1253b4c904...|2023-03-01 00:00:00|2023-03-01 00:00:00| Credit Card|       0.0| 3.25|2.25|      13.0|
|735d1e4b889747f5f.

In [6]:
from pyspark.sql.functions import min,max 

Get the minimum and maximum "fare" by each payment_type 

In [7]:
fare_stats = df.groupBy("payment_type").agg(min("fare").alias("min_fare"),
                                           max("fare").alias("max_fare"))

In [9]:
fare_stats.show()

+------------+--------+--------+
|payment_type|min_fare|max_fare|
+------------+--------+--------+
| Credit Card|    3.25|   135.0|
|      Mobile|    4.25|   54.28|
|      Prcard|    3.25|    58.0|
|        Cash|     0.0|   275.5|
|     Dispute|    4.25|    46.0|
|     Unknown|    4.25|   46.75|
|   No Charge|    10.0|    33.5|
+------------+--------+--------+



Convert the column trip_start_time and trip_end_time to TimestampType 

In [11]:
from pyspark.sql.functions import to_timestamp 
from pyspark.sql.types import TimestampType


In [12]:
df = df.withColumn("trip_start_time", to_timestamp("trip_start_time").cast(TimestampType()))
df = df.withColumn("trip_end_time", to_timestamp("trip_end_time").cast(TimestampType()))


In [13]:
df.printSchema()

root
 |-- taxi_id: string (nullable = true)
 |-- trip_start_time: timestamp (nullable = true)
 |-- trip_end_time: timestamp (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- fare: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- trip_total: double (nullable = true)



In [14]:
df.show(5)

+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
|             taxi_id|    trip_start_time|      trip_end_time|payment_type|trip_miles| fare|tips|trip_total|
+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
|4c64024caa0fc8132...|2023-03-01 00:00:00|2023-03-01 00:15:00|        Cash|       0.1| 11.0| 0.0|      11.0|
|9b934e34b3c5c2b3e...|2023-03-01 00:00:00|2023-03-01 00:15:00|     Unknown|       0.0|20.75| 0.0|     20.75|
|93928e06ae0917b94...|2023-03-01 00:00:00|2023-03-01 00:15:00| Credit Card|       7.2| 19.5|10.0|      34.0|
|ccbebd87160b9da51...|2023-03-01 00:00:00|2023-03-01 00:30:00|        Cash|      6.86| 22.5| 0.0|      27.5|
|ef28bdc517ee8c312...|2023-03-01 00:00:00|2023-03-01 00:30:00| Credit Card|     15.64| 39.0| 8.7|      52.2|
+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
only showing top 5 

Extract the month from the column trip_start_time to create a new column called "trip_month"

In [15]:
from pyspark.sql.functions import month 

In [16]:
df_month = df.withColumn("trip_month",month("trip_start_time"))

In [17]:
df.show(10)

+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
|             taxi_id|    trip_start_time|      trip_end_time|payment_type|trip_miles| fare|tips|trip_total|
+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+
|4c64024caa0fc8132...|2023-03-01 00:00:00|2023-03-01 00:15:00|        Cash|       0.1| 11.0| 0.0|      11.0|
|9b934e34b3c5c2b3e...|2023-03-01 00:00:00|2023-03-01 00:15:00|     Unknown|       0.0|20.75| 0.0|     20.75|
|93928e06ae0917b94...|2023-03-01 00:00:00|2023-03-01 00:15:00| Credit Card|       7.2| 19.5|10.0|      34.0|
|ccbebd87160b9da51...|2023-03-01 00:00:00|2023-03-01 00:30:00|        Cash|      6.86| 22.5| 0.0|      27.5|
|ef28bdc517ee8c312...|2023-03-01 00:00:00|2023-03-01 00:30:00| Credit Card|     15.64| 39.0| 8.7|      52.2|
|51ffa6b1253b4c904...|2023-03-01 00:00:00|2023-03-01 00:00:00| Credit Card|       0.0| 3.25|2.25|      13.0|
|735d1e4b889747f5f.

Fetch the Total "miles" and Total "fare" for every "trip_month" by summing up trip_miles and trip_fare and grouping by trip_month  

In [18]:
from pyspark.sql.functions import sum 

In [21]:
df_new = df.withColumn("trip_month", month("trip_start_time"))

In [22]:
monthly_totals = df_new.groupBy("trip_month").agg(
    sum("trip_miles").alias("total_miles"),
    sum("fare").alias("total_fare"))

In [24]:
monthly_totals.show()

+----------+------------------+-----------------+
|trip_month|       total_miles|       total_fare|
+----------+------------------+-----------------+
|         3|28992.179999999986|97991.12999999999|
+----------+------------------+-----------------+



Fetch the Total "miles" and Total "fare" for every "trip_month" & "payment_type" by summing up trip_miles and trip_fare and grouping by payment_type and trip_month



In [25]:
monthly_payment_totals = df_new.groupBy("trip_month", "payment_type").agg(
    sum("trip_miles").alias("total_miles"),
    sum("fare").alias("total_fare"))


In [26]:
monthly_payment_totals.show()

+----------+------------+------------------+------------------+
|trip_month|payment_type|       total_miles|        total_fare|
+----------+------------+------------------+------------------+
|         3|      Mobile|3763.7099999999987|13538.330000000002|
|         3|        Cash| 5702.560000000005|          22827.04|
|         3|   No Charge|              15.5|              43.5|
|         3|      Prcard| 6026.969999999995|          17489.86|
|         3| Credit Card|12095.240000000007|          37939.65|
|         3|     Unknown|1363.4999999999998|            6083.5|
|         3|     Dispute|              24.7|             69.25|
+----------+------------+------------------+------------------+



 Fetch the "average fare" for every "trip_month" and "payment_type"

In [27]:
from pyspark.sql.functions import avg 

In [29]:
average_fare = df_new.groupBy("trip_month","payment_type").agg(
                avg("fare").alias("average_fare"))

In [30]:
average_fare.show()

+----------+------------+------------------+
|trip_month|payment_type|      average_fare|
+----------+------------+------------------+
|         3|      Mobile|15.597154377880186|
|         3|        Cash|14.803527885862517|
|         3|   No Charge|             21.75|
|         3|      Prcard|  25.4583114992722|
|         3| Credit Card|23.233098591549297|
|         3|     Unknown| 23.21946564885496|
|         3|     Dispute|23.083333333333332|
+----------+------------+------------------+



Calculate the difference between trip_start_time and trip_end_time and store this result in a new column : trip_duration_seconds

In [31]:
from pyspark.sql.functions import unix_timestamp, col

In [32]:
df_diff = df_new.withColumn("trip_duration_seconds", 
        (unix_timestamp("trip_end_time") - unix_timestamp("trip_start_time")))


In [33]:
df_diff.show(10)

+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+----------+---------------------+
|             taxi_id|    trip_start_time|      trip_end_time|payment_type|trip_miles| fare|tips|trip_total|trip_month|trip_duration_seconds|
+--------------------+-------------------+-------------------+------------+----------+-----+----+----------+----------+---------------------+
|4c64024caa0fc8132...|2023-03-01 00:00:00|2023-03-01 00:15:00|        Cash|       0.1| 11.0| 0.0|      11.0|         3|                  900|
|9b934e34b3c5c2b3e...|2023-03-01 00:00:00|2023-03-01 00:15:00|     Unknown|       0.0|20.75| 0.0|     20.75|         3|                  900|
|93928e06ae0917b94...|2023-03-01 00:00:00|2023-03-01 00:15:00| Credit Card|       7.2| 19.5|10.0|      34.0|         3|                  900|
|ccbebd87160b9da51...|2023-03-01 00:00:00|2023-03-01 00:30:00|        Cash|      6.86| 22.5| 0.0|      27.5|         3|                 1800|
|ef28b

Create a new column called trip_duration_minutes which will be the duration in minutes created from trip_duration_seconds 

In [34]:
from pyspark.sql.functions import col 

In [35]:
df_minutes  = df_diff.withColumn("trip_duration_minutes", col("trip_duration_seconds") / 60)


In [36]:
df_minutes.show(15)

+--------------------+-------------------+-------------------+------------+----------+-----+-----+----------+----------+---------------------+---------------------+
|             taxi_id|    trip_start_time|      trip_end_time|payment_type|trip_miles| fare| tips|trip_total|trip_month|trip_duration_seconds|trip_duration_minutes|
+--------------------+-------------------+-------------------+------------+----------+-----+-----+----------+----------+---------------------+---------------------+
|4c64024caa0fc8132...|2023-03-01 00:00:00|2023-03-01 00:15:00|        Cash|       0.1| 11.0|  0.0|      11.0|         3|                  900|                 15.0|
|9b934e34b3c5c2b3e...|2023-03-01 00:00:00|2023-03-01 00:15:00|     Unknown|       0.0|20.75|  0.0|     20.75|         3|                  900|                 15.0|
|93928e06ae0917b94...|2023-03-01 00:00:00|2023-03-01 00:15:00| Credit Card|       7.2| 19.5| 10.0|      34.0|         3|                  900|                 15.0|
|ccbebd871