Load and query Yellow Taxi data

In [17]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col

In [18]:
# Build the SparkSession used by every cell below (getOrCreate returns an
# already-running session if there is one).
# "local[1]" runs Spark locally with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("spark-app-version-x")
    .getOrCreate()
)

24/03/20 09:16:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [19]:
# Read the raw Yellow Taxi parquet files into a DataFrame.
# The dataset root used to be a hard-coded, machine-specific absolute path.
# It can now be overridden with the TAXI_DATA_DIR environment variable; when the
# variable is unset or empty, the original location is used, so behaviour is unchanged.
DATA_DIR = os.environ.get("TAXI_DATA_DIR") or '/home/sasa/Downloads/Code/notebooks/datasets'
local_files = os.path.join(DATA_DIR, 'parquet')
df = spark.read.parquet(local_files)

In [20]:
# Peek at three columns of the raw data, first five rows only
df.select(
    col('VendorID'),
    col('total_amount'),
    col('PULocationID'),
).show(5)

+--------+------------+------------+
|VendorID|total_amount|PULocationID|
+--------+------------+------------+
|       2|        11.1|         238|
|       2|       76.49|         138|
|       1|       28.05|         140|
|       1|        24.7|         140|
|       2|       14.64|          79|
+--------+------------+------------+
only showing top 5 rows



In [21]:
# Register the raw DataFrame as a temporary view so the following cells can
# query it with Spark SQL under the name tbl_raw_yellow_taxis
df.createOrReplaceTempView('tbl_raw_yellow_taxis')

In [22]:
# SQL statement: time span of the raw data, i.e. the earliest pickup timestamp
# and the latest drop-off timestamp across the whole view.
# NOTE(review): the previous note in this cell ("PULocationID = 188, 379 rows out
# of 3,066,766") does not describe this query; it looks like a leftover from an
# earlier one. Confirm and drop it.
spark.sql('''
          select min(tpep_pickup_datetime), max(tpep_dropoff_datetime)
          from tbl_raw_yellow_taxis
          ''').show(n=5)

+-------------------------+--------------------------+
|min(tpep_pickup_datetime)|max(tpep_dropoff_datetime)|
+-------------------------+--------------------------+
|      2001-01-01 00:06:49|       2023-05-03 23:19:31|
+-------------------------+--------------------------+



In [23]:
# SQL statement: number of rows per pickup year, listing only the years that
# have more than 100 rows (the HAVING clause filters on the per-year count).
spark.sql('''
          select extract(year from tpep_pickup_datetime), count(1)
          from tbl_raw_yellow_taxis
          group by extract(year from tpep_pickup_datetime)
          having count(1) > 100
          ''').show(n=100)

[Stage 5:>                                                          (0 + 1) / 2]

+---------------------------------------+--------+
|extract(year FROM tpep_pickup_datetime)|count(1)|
+---------------------------------------+--------+
|                                   2023| 9605947|
+---------------------------------------+--------+



                                                                                

In [24]:
# SQL statement example, using a subquery to clean the data.
# Use case example: imagine our business users asked us to delete all data
# belonging to a pickup year that has fewer than 100 rows in the dataset.
# The subquery lists the pickup years to keep (those with at least 100 rows);
# the outer query keeps only the rows whose pickup year is in that list.
# The threshold is ">= 100" so that a year with exactly 100 rows is kept,
# matching the "< 100 rows" deletion rule above.
df_clean_s1 = spark.sql('''
          select *
          from tbl_raw_yellow_taxis
          where extract(year from tpep_pickup_datetime) in
                        (select extract(year from tpep_pickup_datetime)
                        from tbl_raw_yellow_taxis
                        group by extract(year from tpep_pickup_datetime)
                        having count(1) >= 100
                        )
          ''')

In [25]:
# Register the cleansed DataFrame as its own temporary view so it can be
# queried with Spark SQL under the name tbl_raw_yellow_taxis_clean_s1
df_clean_s1.createOrReplaceTempView('tbl_raw_yellow_taxis_clean_s1')

In [26]:
# SQL statement: re-check the time span on the cleansed view, i.e. the earliest
# pickup timestamp and the latest drop-off timestamp that remain after cleaning
spark.sql('''
          select min(tpep_pickup_datetime), max(tpep_dropoff_datetime)
          from tbl_raw_yellow_taxis_clean_s1
          ''').show(n=5)

[Stage 13:>                                                         (0 + 1) / 1]

+-------------------------+--------------------------+
|min(tpep_pickup_datetime)|max(tpep_dropoff_datetime)|
+-------------------------+--------------------------+
|      2023-01-31 23:49:00|       2023-05-03 23:19:31|
+-------------------------+--------------------------+



                                                                                

In [27]:
# Add the partition key p_date: the pickup timestamp converted to a date
df_sink = df_clean_s1.withColumn(
    "p_date",
    to_date('tpep_pickup_datetime'),
)

In [28]:
# Write the cleansed data to local storage as parquet, partitioned by p_date.
# mode("overwrite") replaces whatever already exists at the target path, so
# re-running this cell rewrites the dataset rather than skipping the write.
# The dataset root used to be a hard-coded, machine-specific absolute path.
# It can now be overridden with the TAXI_DATA_DIR environment variable; when the
# variable is unset or empty, the original location is used, so behaviour is unchanged.
sink_root = os.environ.get("TAXI_DATA_DIR") or "/home/sasa/Downloads/Code/notebooks/datasets"
sink_path = os.path.join(sink_root, "yellow_taxis_daily")
df_sink.write.partitionBy("p_date").mode("overwrite").parquet(sink_path)

                                                                                

In [29]:
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without creating the session again
spark.stop()