# Prerequisites

1. Unzip the 7z file (manual task).
2. Run `pip install findspark` inside the Docker container.
3. Clear the Parquet output directories (done by the cell below).

In [2]:
import os
import shutil

# Paths of the Spark outputs produced by this notebook. They are removed up
# front so that a fresh "Restart & Run All" starts from a clean state.
file_name="trip_data"
table_name="trip_data_table"
parq_output_files=["data/out/"+file_name+".parquet",
                  "data/out/table/"+table_name,
                  # Directory of the bucketed table written in section 1.2
                  # (warehouse dir "data/out/table" + table "trip_data_bucketed").
                  "data/out/table/trip_data_bucketed"]

# Delete every output that exists and report what was done for each path
for path in parq_output_files:
    if os.path.exists(path):
        if os.path.isdir(path):
            # Spark writes Parquet datasets and tables as directories
            shutil.rmtree(path)
        else:
            # rmtree() raises on a plain file, so remove those directly
            os.remove(path)
        print(f'{path} has been deleted.')
    else:
        print(f'{path} does not exist.')

data/out/trip_data.parquet does not exist.
data/out/table/trip_data_table does not exist.


# 01 - Prepare the data

## 1.1 Partition data and save as parquet file

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook.
# The builder chain is wrapped in parentheses rather than joined with
# backslashes: a `#` comment line inside a backslash-continued statement ends
# the statement, which made the `.config(...)` calls after the commented-out
# options an "unexpected indent" syntax error.
spark = (
    SparkSession.builder
    .appName("Read GeoJSON with PySpark")
    .config("spark.master", "local[6]")
    # .config("spark.worker.memory", "2g")
    # .config("spark.worker.cores", "4")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "6")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.cores", "4")
    .config("spark.sql.shuffle.partitions", "200")
    # Managed tables created with saveAsTable() are stored under this directory
    .config("spark.sql.warehouse.dir", "data/out/table")
    .getOrCreate()
)


In [4]:
# Set the log level of the Spark context to DEBUG for this session
spark.sparkContext.setLogLevel("DEBUG")

In [None]:

file_name="trip_data"
file_prefix="data/in/"+file_name+"_"
# List of CSV files to load (currently only the small test file)
csv_file_paths = [file_prefix +"test.csv"]
#csv_file_paths = [file_prefix + "1.csv" ]#, file_prefix + "2.csv"]#, file_prefix + "3.csv",
                  #file_prefix + "4.csv", file_prefix + "5.csv", file_prefix + "6.csv",
                  #file_prefix + "7.csv", file_prefix + "8.csv", file_prefix + "9.csv",
                  #file_prefix + "10.csv", file_prefix + "11.csv", file_prefix + "12.csv"]

parq_output_file="data/out/"+file_name+".parquet"
column_partitioner="medallion"
bucket_count=3000

# Read the CSV files into a single DataFrame
df = spark.read.csv(csv_file_paths, header=True, inferSchema=True)

# Save the data as a Parquet file. Section 1.2 loads it again through
# spark.read.parquet(parq_output_file); previously this path was never written,
# so that read could not succeed. "overwrite" keeps the cell re-runnable.
df.write\
    .mode("overwrite")\
    .parquet(parq_output_file)

# Additionally store the data as a table bucketed by medallion
# (table_name is defined in the clean-up cell at the top of the notebook)
df.write\
    .format("parquet")\
    .bucketBy(bucket_count,column_partitioner)\
    .mode("append")\
    .saveAsTable(table_name)

print("Writing parquet file successful.")


## 1.2 Load the saved Parquet file and apply bucketing

In [None]:


# Load the Parquet data saved in section 1.1
parq_input_file=parq_output_file
parq_df=spark.read.parquet(parq_input_file)
bucket_table_name="trip_data_bucketed"
# To reset the table
spark.sql("DROP TABLE IF EXISTS "+bucket_table_name)
# Apply bucketing and ordering.
# The bucket count is bucket_count from section 1.1 (the previously used name
# num_buckets is not defined anywhere in the notebook). saveAsTable() returns
# None, so its result is not bound to a variable.
parq_df.write.bucketBy(bucket_count, column_partitioner) \
             .sortBy("pickup_datetime") \
             .saveAsTable(bucket_table_name)
# Read the bucketed table back as a DataFrame and preview it
trip_data_bucketed = spark.table(bucket_table_name)
trip_data_bucketed.show()


In [16]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, LongType
# Create UDF for finding idle time between two times
def idle_time_ms(start_ms, prev_end_ms):
    """Return the idle time in milliseconds between two consecutive trips.

    Args:
        start_ms: Start of the current trip, in milliseconds.
        prev_end_ms: End of the previous trip, in milliseconds.

    Returns:
        ``start_ms - prev_end_ms`` when that gap lies between 0 and 4 hours
        (inclusive); 0 when either argument is None, the gap is negative,
        or the gap is longer than 4 hours.
    """
    # A missing timestamp means there is no gap to measure
    if prev_end_ms is None or start_ms is None:
        return 0

    max_gap_ms = 4 * 60 * 60 * 1000  # 4 hours in milliseconds
    gap_ms = start_ms - prev_end_ms

    # Negative gaps and gaps above the threshold do not count as idle time
    return 0 if (gap_ms < 0 or gap_ms > max_gap_ms) else gap_ms
    
# Define as UDF: wrap idle_time_ms so it can be applied to DataFrame columns,
# declaring LongType as the type of the returned value
idle_time_ms_udf = udf(idle_time_ms, LongType())

In [17]:
from pyspark.sql import Window
from pyspark.sql.functions import lag, unix_timestamp, sum, col
from datetime import datetime

# Configure the window: one partition per taxi, trips ordered by pickup time
# Medallion refers to the vehicle, hack_license refers to the driver
window_conf = Window.partitionBy(col("medallion")).orderBy(col("pickup_datetime"))

# TODO: Add shuffling here
#   ...
taxi_util_data = trip_data_bucketed

# Add a column with the drop-off datetime of the previous trip of the same taxi.
# No explicit default is passed to lag(): the first trip of each medallion gets
# null, which idle_time_ms turns into 0. The former datetime.min sentinel led to
# the same 0 (the resulting gap is far above the 4 hour threshold) but relied on
# converting a year-1 datetime to a Spark literal, which the original comment
# already flagged as possibly incorrect.
taxi_util_data_sorted = taxi_util_data.withColumn(
    "dropoff_datetime_prev",
    lag(col("dropoff_datetime")).over(window_conf),
)

# Convert both datetimes to epoch milliseconds (unix_timestamp yields seconds)
taxi_util_data_sorted_ts = (
    taxi_util_data_sorted
    .withColumn("pickup_ts_ms", unix_timestamp("pickup_datetime") * 1000)
    .withColumn("dropoff_prev_ts_ms", unix_timestamp("dropoff_datetime_prev") * 1000)
)

# Calculate idle time per ride
taxi_util_data_idle = taxi_util_data_sorted_ts.withColumn(
    "idle_time_ms",
    idle_time_ms_udf("pickup_ts_ms", "dropoff_prev_ts_ms"),
)

# Calculate total idle time per taxi
# NOTE: `sum` here is pyspark.sql.functions.sum (imported in this cell), not the builtin
taxi_util_data_idle_total = taxi_util_data_idle.groupBy(col("medallion")).agg(sum(col("idle_time_ms")))

In [18]:
# Print the summed idle time (in milliseconds) per medallion
taxi_util_data_idle_total.show()

+--------------------+-----------------+
|           medallion|sum(idle_time_ms)|
+--------------------+-----------------+
|89D227B655E5C82AE...|                0|
|0BD7C8F5BA12B88E0...|                0|
|DFD2202EE08F7A8DC...|                0|
+--------------------+-----------------+

