# Prerequisites

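1. Manually unzip the 7z archive; the notebook reads the CSV files from `data/in/`.
2. Install `findspark` inside the Docker container (`pip install findspark`).
3. Clear the Parquet output directory (see the cleanup cell below).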

In [1]:
import os

# Directories this notebook reads from and writes to
dirs = ["data","data/in","data/out","data/out/table"]

for directory in dirs:
    if os.path.exists(directory):
        print(f"{directory} already exists")
    else:
        # makedirs also creates missing parent directories, so the outcome no
        # longer depends on parents being listed before their children;
        # exist_ok tolerates the directory appearing between check and create.
        os.makedirs(directory, exist_ok=True)
        print(f"Directory '{directory}' created")

Directory 'data' created
Directory 'data/in' created
Directory 'data/out' created
Directory 'data/out/table' created


### Run this to clean up the output folder

In [9]:
import shutil

# Paths of the Parquet outputs produced by this notebook
file_name="trip_data"
raw_table_name="raw_trip_data_table"
parq_output_files=["data/out/"+file_name+".parquet",
                  "data/out/table/"+raw_table_name]


def remove_output_path(path):
    """Delete `path` whether it is a directory tree or a single file.

    Args:
        path: Filesystem path to remove.

    Returns:
        True if something existed at `path` and was removed, False if the
        path did not exist.
    """
    if not os.path.exists(path):
        return False
    if os.path.isdir(path):
        shutil.rmtree(path)
    else:
        # shutil.rmtree raises on a regular file (e.g. a single-file .parquet),
        # so plain files are removed with os.remove instead.
        os.remove(path)
    return True


# Remove every output that is present and report what happened
for output_path in parq_output_files:
    if remove_output_path(output_path):
        print(f'{output_path} has been deleted.')
    else:
        print(f'{output_path} does not exist.')

data/out/trip_data.parquet does not exist.
data/out/table/raw_trip_data_table has been deleted.


# 01 - Prepare the data

## 1.1 Load csv files to Raw layer

The raw layer holds the data in its original form; the only changes we make are bucketing and partitioning.

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session with Hive support enabled; the SQL
# warehouse directory is pointed at data/out/table.
spark = (
    SparkSession.builder
    .appName("Read GeoJSON with PySpark")
    .config("spark.executor.memory", "6g")
    .config("spark.driver.memory", "12g")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "2")
    .config("spark.memory.fraction", ".8")
    .config("spark.sql.shuffle.partitions", "200")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "data/out/table")
    .getOrCreate()
)


In [4]:
#spark.sparkContext.setLogLevel("DEBUG")

In [10]:

file_name = "trip_data"
file_prefix = f"data/in/{file_name}_"
# Twelve input CSV files, numbered 1 through 12
#csv_file_paths = [file_prefix +"test1.csv",file_prefix +"test2.csv"]
csv_file_paths = [f"{file_prefix}{file_number}.csv" for file_number in range(1, 13)]

In [11]:
from pyspark.sql.functions import col,to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType

# Column used for bucketing when the raw table is written
column_partitioner = "medallion"

# Explicit schema for the raw trip CSV files, as (column name, Spark type)
# pairs in declaration order; every column is nullable.
_raw_columns = [
    ("medallion", StringType()),
    ("hack_license", StringType()),
    ("vendor_id", StringType()),
    ("rate_code", IntegerType()),
    ("store_and_fwd_flag", StringType()),
    ("pickup_datetime", TimestampType()),
    ("dropoff_datetime", TimestampType()),
    ("passenger_count", IntegerType()),
    ("trip_time_in_secs", IntegerType()),
    ("trip_distance", DoubleType()),
    ("pickup_longitude", DoubleType()),
    ("pickup_latitude", DoubleType()),
    ("dropoff_longitude", DoubleType()),
    ("dropoff_latitude", DoubleType()),
]
raw_file_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _raw_columns]
)


In [12]:
# Read all CSV files into a single DataFrame using the explicit schema.
# No persist() here: in this notebook the frame is consumed only once (by the
# table write), so caching it would spend executor memory/disk without saving
# any recomputation. Calling unpersist() on an uncached frame is a no-op.
df = spark.read.csv(csv_file_paths, header=True, schema=raw_file_schema)

In [13]:
bucket_count = 400

# Drop any previous version of the table so the write starts from scratch
spark.sql(f"DROP TABLE IF EXISTS {raw_table_name}")

# Derive the partition column, then configure a Parquet write that is
# partitioned by pickup date, bucketed on `column_partitioner`, and sorted by
# pickup time.
raw_writer = (
    df.withColumn("pickup_date", to_date(col("pickup_datetime")))
    .write
    .format("parquet")
    .bucketBy(bucket_count, column_partitioner)
    .partitionBy("pickup_date")
    .sortBy("pickup_datetime")
    .mode("append")
)
raw_writer.saveAsTable(raw_table_name)
df.unpersist()
print("Writing parquet file successful.")

Writing parquet file successful.


## 1.2 Display bucketed and partitioned table

In [14]:
# Handle to the saved raw table, used by the cells that follow
trip_data_bucketed = spark.table(raw_table_name)
#trip_data_bucketed.show()
# List the catalog tables, then the partitions of the raw table, untruncated
for statement in ("SHOW TABLES", "SHOW PARTITIONS "+raw_table_name):
    spark.sql(statement).show(truncate=False)

+---------+-------------------+-----------+
|namespace|tableName          |isTemporary|
+---------+-------------------+-----------+
|default  |raw_trip_data_table|false      |
+---------+-------------------+-----------+

+----------------------+
|partition             |
+----------------------+
|pickup_date=2013-01-01|
|pickup_date=2013-01-02|
|pickup_date=2013-01-03|
|pickup_date=2013-01-04|
|pickup_date=2013-01-05|
|pickup_date=2013-01-06|
|pickup_date=2013-01-07|
|pickup_date=2013-01-08|
|pickup_date=2013-01-09|
|pickup_date=2013-01-10|
|pickup_date=2013-01-11|
|pickup_date=2013-01-12|
|pickup_date=2013-01-13|
|pickup_date=2013-01-14|
|pickup_date=2013-01-15|
|pickup_date=2013-01-16|
|pickup_date=2013-01-17|
|pickup_date=2013-01-18|
|pickup_date=2013-01-19|
|pickup_date=2013-01-20|
+----------------------+
only showing top 20 rows



In [16]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, LongType
# Create UDF for finding idle time between two times
def idle_time_ms(start_ms, prev_end_ms, threshold_duration_ms=4 * 60 * 60 * 1000):
    """Return the idle gap between two rides, in milliseconds.

    Args:
        start_ms: Start of the current ride as epoch milliseconds, or None.
        prev_end_ms: End of the previous ride as epoch milliseconds, or None.
        threshold_duration_ms: Largest gap that still counts as idle time.
            Defaults to 4 hours.

    Returns:
        ``start_ms - prev_end_ms`` when that gap lies between 0 and the
        threshold (inclusive). Returns 0 when either input is None, when the
        gap is negative (the rides overlap), or when it exceeds the threshold.
    """
    if start_ms is None or prev_end_ms is None:
        return 0
    idle_ms = start_ms - prev_end_ms
    if idle_ms < 0 or idle_ms > threshold_duration_ms:
        return 0
    return idle_ms
    
# Wrap the Python function as a Spark UDF with a LongType return column
idle_time_ms_udf = udf(idle_time_ms, returnType=LongType())

In [17]:
from pyspark.sql import Window
from pyspark.sql.functions import lag, unix_timestamp, sum, col
from datetime import datetime

# Configure the window
# Medallion refers to the vehicle, hack_license refers to the driver
window_conf = Window.partitionBy(col("medallion")).orderBy(col("pickup_datetime"))

# TODO: Add shuffling here
#   ...
taxi_util_data = trip_data_bucketed

# Add a column with the dropoff datetime of the previous ride of the same taxi.
# The first ride of each taxi has no predecessor, so lag() yields NULL (its
# built-in default). idle_time_ms maps a missing value to 0, so no sentinel is
# needed. A datetime.min sentinel would have to be converted to a year-0001
# timestamp literal, which is fragile.
taxi_util_data_sorted = taxi_util_data.withColumn(
    "dropoff_datetime_prev",
    lag(col("dropoff_datetime")).over(window_conf),
)

# Convert both timestamps to epoch milliseconds (NULL stays NULL)
taxi_util_data_sorted_ts = taxi_util_data_sorted.withColumn("pickup_ts_ms", unix_timestamp("pickup_datetime") * 1000) \
    .withColumn("dropoff_prev_ts_ms", unix_timestamp("dropoff_datetime_prev") * 1000)

# Calculate idle time per ride
taxi_util_data_idle = taxi_util_data_sorted_ts.withColumn("idle_time_ms", idle_time_ms_udf("pickup_ts_ms", "dropoff_prev_ts_ms"))

# Calculate total idle time per taxi
taxi_util_data_idle_total = taxi_util_data_idle.groupBy(col("medallion")).agg(sum(col("idle_time_ms")))

In [18]:
# Print the first 20 rows of the per-taxi idle totals, truncating long values
taxi_util_data_idle_total.show(n=20, truncate=True)

+--------------------+-----------------+
|           medallion|sum(idle_time_ms)|
+--------------------+-----------------+
|89D227B655E5C82AE...|                0|
|0BD7C8F5BA12B88E0...|                0|
|DFD2202EE08F7A8DC...|                0|
+--------------------+-----------------+

