#### Importing Libraries

In [None]:
from pyspark.sql.functions import *

#### Data Cleansing and Preparation

In [None]:
# Load the raw yellow-taxi trips from the `nyc_yellowtaxi_raw` table.
nyc_yellowtaxi_raw_df = spark.read.table("nyc_yellowtaxi_raw")

# Derive calendar and time-of-day features from the pickup/drop-off timestamps.
nytaxi_yellowtaxi_prep_df = (
    nyc_yellowtaxi_raw_df
    # Calendar date of the pickup (time component dropped by the cast).
    .withColumn("pickupDate", col("tpepPickupDateTime").cast("date"))
    # Numeric day of week (Spark numbering: 1 = Sunday ... 7 = Saturday).
    .withColumn("weekDay", dayofweek(col("tpepPickupDateTime")))
    # Full day name, e.g. "Monday".
    .withColumn("weekDayName", date_format(col("tpepPickupDateTime"), "EEEE"))
    # Day of the month (1-31). This previously called dayofweek(), which made
    # the column a duplicate of `weekDay` instead of the calendar day.
    .withColumn("dayofMonth", dayofmonth(col("tpepPickupDateTime")))
    # Hour of the pickup timestamp (0-23).
    .withColumn("pickupHour", hour(col("tpepPickupDateTime")))
    # Trip duration in minutes: difference of the epoch-second values / 60.
    .withColumn(
        "tripDuration",
        (unix_timestamp(col("tpepDropoffDateTime")) - unix_timestamp(col("tpepPickupDateTime"))) / 60,
    )
    # Bucket the pickup hour into four named periods. The four ranges cover
    # every hour 0-23; there is no otherwise(), so a NULL pickupHour yields a
    # NULL timeBins value.
    .withColumn(
        "timeBins",
        when((col("pickupHour") >= 7) & (col("pickupHour") <= 10), "MorningRush")
        .when((col("pickupHour") >= 11) & (col("pickupHour") <= 15), "Afternoon")
        .when((col("pickupHour") >= 16) & (col("pickupHour") <= 19), "EveningRush")
        .when((col("pickupHour") <= 6) | (col("pickupHour") >= 20), "Night"),
    )
)

In [None]:
# Keep only plausible trips: positive, bounded fare / duration / passenger
# count / distance, and both pickup and drop-off coordinates present.
# Rows where any compared column is NULL are dropped as well, because a
# comparison against NULL never evaluates to true.
nytaxi_yellowtaxi_clean_df = nytaxi_yellowtaxi_prep_df.where(
    """
    fareAmount > 0 AND fareAmount < 100 
    AND tripDuration > 0 AND tripDuration <= 180
    AND passengerCount > 0 AND passengerCount <= 8
    AND tripDistance > 0 AND tripDistance < 100 
    AND startLat IS NOT NULL AND startLon IS NOT NULL 
    AND endLat IS NOT NULL AND endLon IS NOT NULL
    """
)

In [None]:
# Persist the cleaned trips as a Delta table, replacing any existing contents
# of the target table.
table_name = "nyc_yellowtaxi_clean"
(
    nytaxi_yellowtaxi_clean_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)
print(f"Spark dataframe (nytaxi_yellowtaxi_clean_df) saved to a delta table: {table_name}")

In [None]:
# Render the cleaned DataFrame in the notebook output as a visual sanity check.
# NOTE(review): `display` is not imported in this notebook; presumably it is
# supplied by the notebook runtime — confirm.
display(nytaxi_yellowtaxi_clean_df)