# 00 - Prerequisites

1. Unzip the 7z file. Manual task.
2. Clear the parquet file directory

In [1]:
import os

# Working directories used by the notebook, listed parent-first.
dirs = ["data","data/in","data/out","data/out/table"]

# Create every missing directory and report what happened for each one.
for directory in dirs:
    if os.path.isdir(directory):
        print(f"{directory} already exists")
    else:
        # makedirs (unlike mkdir) also creates missing parents, so the outcome
        # no longer depends on the order of `dirs`; exist_ok=True tolerates the
        # directory appearing between the check above and this call.
        # A regular file sitting at this path raises FileExistsError here
        # instead of being reported as an existing directory.
        os.makedirs(directory, exist_ok=True)
        print(f"Directory '{directory}' created")

Directory 'data' created
Directory 'data/in' created
Directory 'data/out' created
Directory 'data/out/table' created


### Run this to clean up/reset the out folder

In [2]:
import shutil

# Outputs written by this notebook that a reset has to remove.
file_name="trip_data"
raw_table_name="raw_trip_data_table"
parq_output_files=["data/out/"+file_name+".parquet",
                  "data/out/table/"+raw_table_name]

# Delete each output that is present and report the outcome per path.
for file in parq_output_files:
    if not os.path.exists(file):
        print(f'{file} does not exist.')
        continue
    if os.path.isdir(file) and not os.path.islink(file):
        # Real directory: remove it together with its contents.
        shutil.rmtree(file)
    else:
        # rmtree raises on plain files and on symlinks, so unlink those directly.
        os.remove(file)
    print(f'{file} has been deleted.')

data/out/trip_data.parquet does not exist.
data/out/table/raw_trip_data_table does not exist.


# 01 - Prepare the data

## 1.1 Load csv files to hive table. Let's call it Raw layer

Raw Layer contains the base format of data. We only add bucketing and partitioning to it.

### 1.1.1 Spark Setup

In [10]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled session with 200 shuffle partitions and
# "data/out/table" configured as the SQL warehouse directory.
spark = (
    SparkSession.builder
    .appName("BDM2024 - Project01")
    .config("spark.sql.shuffle.partitions", "200")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "data/out/table")
    .getOrCreate()
)


In [11]:
# Enable below if you are resetting the docker image too.
# This produces a gazillion logs, and you will scratch your head wondering why you are losing so much disk space
#spark.sparkContext.setLogLevel("DEBUG")

### 1.1.2. List the files to be ingested.

In [12]:
file_name = "trip_data"
file_prefix = f"data/in/{file_name}_"

# Input CSVs are numbered 1 through 12: data/in/trip_data_1.csv ... trip_data_12.csv
# For a quick run, use the two small test files instead:
#csv_file_paths = [file_prefix +"test1.csv",file_prefix +"test2.csv"]
csv_file_paths = [f"{file_prefix}{file_number}.csv" for file_number in range(1, 13)]

### 1.1.3. Declare the schema to save some resources.

In [13]:
from pyspark.sql.functions import col,to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType

# Column the raw table is bucketed by when it is written.
column_partitioner = "medallion"

# Explicit schema for the trip CSVs: (column name, Spark type) pairs in file
# column order. Every column is declared nullable.
raw_file_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("medallion", StringType),
        ("hack_license", StringType),
        ("vendor_id", StringType),
        ("rate_code", IntegerType),
        ("store_and_fwd_flag", StringType),
        ("pickup_datetime", TimestampType),
        ("dropoff_datetime", TimestampType),
        ("passenger_count", IntegerType),
        ("trip_time_in_secs", IntegerType),
        ("trip_distance", DoubleType),
        ("pickup_longitude", DoubleType),
        ("pickup_latitude", DoubleType),
        ("dropoff_longitude", DoubleType),
        ("dropoff_latitude", DoubleType),
    )
])


### 1.1.4 Read the csv files

Call `persist()` so that the parsed data is cached (in memory, spilling to disk) instead of being re-read from the CSV files.

In [14]:
# Parse all listed CSVs with the declared schema (first line of each file is a
# header) and mark the resulting DataFrame for caching.
# NOTE(review): within this notebook the cached frame is consumed once, by the
# table write that follows; confirm the cache is worth its memory/disk cost.
df = (
    spark.read
    .csv(csv_file_paths, header=True, schema=raw_file_schema)
    .persist()
)

### 1.1.5. Create the hive table

Apply `bucketing` and `partitioning` to the table.

In [15]:
bucket_count=400

# To reset the table
spark.sql("DROP TABLE IF EXISTS "+raw_table_name)

# Write the raw layer: derive pickup_date from pickup_datetime, partition the
# table by that date, bucket by `column_partitioner` and sort rows inside each
# bucket by pickup_datetime.
# NOTE(review): combining date partitions with 400 buckets can produce a very
# large number of small files per write task; confirm this layout is intended.
try:
    df.withColumn("pickup_date", to_date(col("pickup_datetime")))\
        .write\
        .format("parquet")\
        .bucketBy(bucket_count,column_partitioner)\
        .partitionBy("pickup_date")\
        .sortBy("pickup_datetime")\
        .mode("append")\
        .saveAsTable(raw_table_name)
finally:
    # Release the cached DataFrame even when the write fails, so a failed run
    # does not leave the cache pinned for the rest of the session.
    df.unpersist()
print("Writing parquet file successful.")

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
           ^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <exception str() failed>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  

Py4JError: org.apache.spark.util.Utils does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


### 1.1.6 Sanity Checks

Check the details from the Raw Layer table

In [14]:
# Handle on the raw-layer table; reused by the later analysis cells.
trip_data_bucketed = spark.table(raw_table_name)

# Show the registered tables, then the partitions of the raw table.
for sanity_query in ("SHOW TABLES", f"SHOW PARTITIONS {raw_table_name}"):
    spark.sql(sanity_query).show(truncate=False)

+---------+-------------------+-----------+
|namespace|tableName          |isTemporary|
+---------+-------------------+-----------+
|default  |raw_trip_data_table|false      |
+---------+-------------------+-----------+

+----------------------+
|partition             |
+----------------------+
|pickup_date=2013-01-01|
|pickup_date=2013-01-02|
|pickup_date=2013-01-03|
|pickup_date=2013-01-04|
|pickup_date=2013-01-05|
|pickup_date=2013-01-06|
|pickup_date=2013-01-07|
|pickup_date=2013-01-08|
|pickup_date=2013-01-09|
|pickup_date=2013-01-10|
|pickup_date=2013-01-11|
|pickup_date=2013-01-12|
|pickup_date=2013-01-13|
|pickup_date=2013-01-14|
|pickup_date=2013-01-15|
|pickup_date=2013-01-16|
|pickup_date=2013-01-17|
|pickup_date=2013-01-18|
|pickup_date=2013-01-19|
|pickup_date=2013-01-20|
+----------------------+
only showing top 20 rows



## 1.2 Enrich the data with geojson

### 1.2.1. Read geojson file

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, LongType, ArrayType

# https://gist.github.com/JulesBelveze/a552e8c53dfd1f46948cbeb32c096611
geojson_data = "data/in/nyc-boroughs.geojson"

# Schema of the nested "properties" object of a feature.
properties_schema = StructType([
    StructField("boroughCode", LongType(), nullable=True),
    StructField("borough", StringType(), nullable=True),
    StructField("@id", StringType(), nullable=True),
])

# Schema of the nested "geometry" object; coordinates are declared as a
# three-level nested array of doubles.
geometry_schema = StructType([
    StructField("type", StringType(), nullable=True),
    StructField("coordinates", ArrayType(ArrayType(ArrayType(DoubleType()))), nullable=True),
])

# Top-level schema of one feature record.
geojson_schema = StructType([
    StructField("type", StringType(), nullable=True),
    StructField("id", LongType(), nullable=True),
    StructField("properties", properties_schema, nullable=True),
    StructField("geometry", geometry_schema, nullable=True),
])

# Read the GeoJSON file using the explicit schema instead of inferring one.
geojson_df = spark.read.json(geojson_data, schema=geojson_schema)

In [5]:
# View the schema
geojson_df.printSchema()

# Show the data
geojson_df.show()
geojson_df = geojson_df.na.drop() #remove the NULL values

# Show the resulting DataFrame
geojson_df.show()

num_rows = geojson_df.count()
print(f"Number of boroughs: {num_rows}")

root
 |-- type: string (nullable = true)
 |-- id: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- boroughCode: long (nullable = true)
 |    |-- borough: string (nullable = true)
 |    |-- @id: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)

+-------+---+--------------------+--------------------+
|   type| id|          properties|            geometry|
+-------+---+--------------------+--------------------+
|Feature|  0|{5, Staten Island...|{Polygon, [[[-74....|
|Feature|  1|{5, Staten Island...|{Polygon, [[[-74....|
|Feature|  2|{5, Staten Island...|{Polygon, [[[-74....|
|Feature|  3|{5, Staten Island...|{Polygon, [[[-74....|
|Feature|  4|{4, Queens, http:...|{Polygon, [[[-73....|
|Featu

In [6]:
!pip install shapely

[0mCould not fetch URL https://pypi.org/simple/shapely/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/shapely/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)'))) - skipping
[31mERROR: Could not find a version that satisfies the requirement shapely (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for shapely[0m[31m
[0mCould not fetch URL https://pypi.org/simple/pip/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/pip/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)'))) - skipping


In [16]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, LongType
# Idle time between two rides; the plain function is wrapped as a Spark UDF below.
def idle_time_ms(start_ms, prev_end_ms, threshold_duration_ms=4 * 60 * 60 * 1000):
    """Return the idle gap between two rides in milliseconds.

    Args:
        start_ms: Start time of the current ride in epoch milliseconds, or None.
        prev_end_ms: End time of the previous ride in epoch milliseconds, or None.
        threshold_duration_ms: Largest gap that still counts as idle time.
            Defaults to 4 hours.

    Returns:
        ``start_ms - prev_end_ms`` when that gap lies in
        ``[0, threshold_duration_ms]``; otherwise 0. A missing timestamp, a
        negative gap, or a gap above the threshold all yield 0.
    """
    if start_ms is None or prev_end_ms is None:
        return 0
    idle_ms = start_ms - prev_end_ms
    if idle_ms < 0 or idle_ms > threshold_duration_ms:
        return 0
    return idle_ms
    
# Wrap the Python function as a Spark UDF whose result column is a long
idle_time_ms_udf = udf(f=idle_time_ms, returnType=LongType())

In [17]:
from pyspark.sql import Window
from pyspark.sql.functions import lag, unix_timestamp, sum, col
from datetime import datetime

# Configure the window: one partition per taxi, rides ordered by pickup time
# Medallion refers to the vehicle, hack_license refers to the driver
window_conf = Window.partitionBy(col("medallion")).orderBy(col("pickup_datetime"))

# TODO: Add shuffling here
#   ...
taxi_util_data = trip_data_bucketed

# Add a column with the dropoff datetime of the same taxi's previous ride.
# No default is passed to lag(): the first ride of each taxi gets NULL, which
# stays NULL through unix_timestamp() and is turned into an idle time of 0 by
# idle_time_ms. This replaces the former datetime.min sentinel, which only
# produced 0 because the resulting gap exceeded the idle threshold.
taxi_util_data_sorted = taxi_util_data.withColumn("dropoff_datetime_prev", lag(col("dropoff_datetime")).over(window_conf))

# Convert both datetimes to epoch milliseconds (unix_timestamp yields seconds)
taxi_util_data_sorted_ts = taxi_util_data_sorted.withColumn("pickup_ts_ms", unix_timestamp("pickup_datetime") * 1000) \
    .withColumn("dropoff_prev_ts_ms", unix_timestamp("dropoff_datetime_prev") * 1000)

# Calculate idle time per ride
taxi_util_data_idle = taxi_util_data_sorted_ts.withColumn("idle_time_ms", idle_time_ms_udf("pickup_ts_ms", "dropoff_prev_ts_ms"))

# Calculate total idle time per taxi (`sum` is the Spark aggregate imported
# from pyspark.sql.functions, not the Python builtin)
taxi_util_data_idle_total = taxi_util_data_idle.groupBy(col("medallion")).agg(sum(col("idle_time_ms")))

In [18]:
# Display the per-taxi idle totals (first 20 rows, long values truncated)
taxi_util_data_idle_total.show(n=20, truncate=True)

+--------------------+-----------------+
|           medallion|sum(idle_time_ms)|
+--------------------+-----------------+
|89D227B655E5C82AE...|                0|
|0BD7C8F5BA12B88E0...|                0|
|DFD2202EE08F7A8DC...|                0|
+--------------------+-----------------+

