In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types, functions as F

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .config("spark.sql.legacy.parquet.nanosAsLong", "true") \
    .getOrCreate()

24/08/11 07:30:37 WARN Utils: Your hostname, codespaces-0918c7 resolves to a loopback address: 127.0.0.1; using 10.0.0.37 instead (on interface eth0)
24/08/11 07:30:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/11 07:30:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [43]:
spark.version

'3.3.2'

In [4]:
data_folder = 'taxi_data/'
data_fhv = 'fhv/'
file_name = f'fhv_tripdata_2019-10.csv.gz'
file_path = data_folder+ data_fhv + file_name

In [13]:
fhv_schema = types.StructType([
    types.StructField("dispatching_base_num", types.StringType(), True),
    types.StructField("pickup_datetime", types.TimestampType(), True),
    types.StructField("dropOff_datetime", types.TimestampType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("SR_Flag", types.StringType(), True),
    types.StructField("Affiliated_base_number", types.StringType(), True)
   
])

!wget https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-01.parquet

In [19]:
pq_name = f'fhv_tripdata_2019-10'
print(f'Converting {file_name} to {pq_name}')
df_fhv = spark.read \
    .option("header", "true") \
    .schema(fhv_schema) \
    .csv(file_path)
df_fhv \
    .repartition(6) \
    .write.parquet(data_folder+ data_fhv + pq_name, mode='overwrite')

Converting fhv_tripdata_2019-10.csv.gz to fhv_tripdata_2019-10


                                                                                

In [18]:
df_fhv.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   null|                B00014|
|     B00021         |2019-10-01 00:00:4

In [25]:
df_fhv.filter(F.to_date(df_fhv['pickup_datetime']) == '2019-10-15' ).count()

                                                                                

62610

In [33]:
df_fhv \
    .withColumn('travel_duration_h', F.datediff(df_fhv['dropOff_datetime'], df_fhv['pickup_datetime'])*24 ) \
    .sort('travel_duration_h', ascending=False) \
    .show(10)

[Stage 28:>                                                         (0 + 1) / 1]

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|travel_duration_h|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----------------+
|              B02832|2019-10-11 18:00:00|2091-10-11 18:30:00|         264|         264|   null|                B02832|           631152|
|              B02832|2019-10-28 09:00:00|2091-10-28 09:30:00|         264|         264|   null|                B02832|           631152|
|              B02416|2019-10-31 23:46:33|2029-11-01 00:13:00|        null|        null|   null|                B02416|            87696|
|     B00746         |2019-10-01 21:43:42|2027-10-01 21:45:23|         159|         264|   null|       B00746         |            70128|
|              B02921|2019-10-17 1

                                                                                

In [39]:
zones = './taxi_data/zones/*'
df_zones = spark.read \
    .parquet(zones)

df_zones.tail(5)

[Row(LocationID='261', Borough='Manhattan', Zone='World Trade Center', service_zone='Yellow Zone'),
 Row(LocationID='262', Borough='Manhattan', Zone='Yorkville East', service_zone='Yellow Zone'),
 Row(LocationID='263', Borough='Manhattan', Zone='Yorkville West', service_zone='Yellow Zone'),
 Row(LocationID='264', Borough='Unknown', Zone='NV', service_zone='N/A'),
 Row(LocationID='265', Borough='Unknown', Zone='NA', service_zone='N/A')]

In [37]:
df_join =df_fhv.join(df_zones, df_fhv['PULocationID'] == df_zones['LocationID'] , how='left')

In [38]:
df_join.show(5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+----------+-------+----+------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|LocationID|Borough|Zone|service_zone|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+----------+-------+----+------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|       264|Unknown|  NV|         N/A|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|       264|Unknown|  NV|         N/A|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|       264|Unknown|  NV|         N/A|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57

In [42]:
df_join \
    .groupBy('Zone').count().sort('count').show()

[Stage 40:>                                                         (0 + 1) / 1]

+--------------------+-----+
|                Zone|count|
+--------------------+-----+
|         Jamaica Bay|    1|
|Governor's Island...|    2|
| Green-Wood Cemetery|    5|
|       Broad Channel|    8|
|     Highbridge Park|   14|
|        Battery Park|   15|
|Saint Michaels Ce...|   23|
|Breezy Point/Fort...|   25|
|Marine Park/Floyd...|   26|
|        Astoria Park|   29|
|    Inwood Hill Park|   39|
|       Willets Point|   47|
|Forest Park/Highl...|   53|
|  Brooklyn Navy Yard|   57|
|        Crotona Park|   62|
|        Country Club|   77|
|     Freshkills Park|   89|
|       Prospect Park|   98|
|     Columbia Street|  105|
|  South Williamsburg|  110|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [46]:
# alternative solution
df_join \
    .groupBy('Zone').count().filter((df_join['Zone'] == 'East Chelsea') |
                                    (df_join['Zone'] == 'Jamaica Bay')|
                                    (df_join['Zone'] == 'Union Sq')|
                                    (df_join['Zone'] =='Crown Heights North')).show()

[Stage 44:>                                                         (0 + 1) / 1]

+-------------------+-----+
|               Zone|count|
+-------------------+-----+
|Crown Heights North| 3036|
|           Union Sq| 1893|
|        Jamaica Bay|    1|
|       East Chelsea| 1381|
+-------------------+-----+



                                                                                