In [4]:
import os, json
from datetime import datetime
from pyspark.sql.functions import col, current_timestamp, input_file_name, to_date, unix_timestamp

# Directory that is scanned for incoming ``*.parquet`` files.
INBOX = "/home/jovyan/work/data/inbox"
# Presumably the destination of the enriched output dataset -- not used in the
# cells visible here; confirm against the write step.
OUTBOX = "/home/jovyan/work/data/outbox/trips_enriched.parquet"
# Presumably the taxi-zone lookup table used for enrichment -- not used in the
# cells visible here; confirm against the join step.
LOOKUP = "/home/jovyan/work/data/taxi_zone_lookup.parquet"
# JSON manifest that records which inbox files were already processed.
STATE = "/home/jovyan/work/state/manifest.json"

# Create the parent directories of the manifest and of the output dataset.
# They are derived from the constants (instead of repeating the literals) so
# that editing STATE or OUTBOX cannot leave these calls pointing elsewhere.
os.makedirs(os.path.dirname(STATE), exist_ok=True)
os.makedirs(os.path.dirname(OUTBOX), exist_ok=True)

def load_manifest(path):
    """Load the processing manifest stored as JSON at ``path``.

    Args:
        path: Filesystem path of the manifest file.

    Returns:
        dict: The parsed manifest, always containing a ``"processed_files"``
        key. A missing or empty file yields a fresh
        ``{"processed_files": []}``.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
        ValueError: If the top-level JSON value is not an object.
    """
    if not os.path.exists(path):
        return {"processed_files": []}
    with open(path, "r") as f:
        raw = f.read()
    # An empty file holds no records, so treat it like a missing manifest
    # instead of failing with a JSON decode error.
    if not raw.strip():
        return {"processed_files": []}
    manifest = json.loads(raw)
    if not isinstance(manifest, dict):
        raise ValueError(
            f"Manifest at {path!r} must be a JSON object, got {type(manifest).__name__}"
        )
    # Callers index manifest["processed_files"] directly; make sure it exists.
    manifest.setdefault("processed_files", [])
    return manifest

def save_manifest(path, manifest):
    """Write ``manifest`` to ``path`` as indented JSON, replacing it atomically.

    The data is first serialized to ``<path>.tmp`` and then moved over the
    target with ``os.replace``. If serialization or the write fails, the
    previous manifest is left untouched rather than truncated.

    Args:
        path: Filesystem path of the manifest file.
        manifest: JSON-serializable object to store.

    Raises:
        TypeError: If ``manifest`` contains values ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(manifest, f, indent=2)
            # Push the bytes to disk before the rename makes them the live manifest.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, path)
    finally:
        # Only still present when something above failed; don't leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Names of the files that the manifest already records as processed.
manifest = load_manifest(STATE)
processed = {entry["filename"] for entry in manifest["processed_files"]}

# Every parquet file currently in the inbox, in sorted order.
all_files = sorted(name for name in os.listdir(INBOX) if name.endswith(".parquet"))
# The subset that the manifest has not recorded yet.
new_files = [name for name in all_files if name not in processed]

# Display both lists as the cell output.
(all_files, new_files)


(['yellow_tripdata_2025-01.parquet', 'yellow_tripdata_2025-02.parquet'],
 ['yellow_tripdata_2025-01.parquet', 'yellow_tripdata_2025-02.parquet'])

In [5]:
import os

INBOX = "/home/jovyan/work/data/inbox"

# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run.
sorted(os.listdir(INBOX))


['yellow_tripdata_2025-01.parquet', 'yellow_tripdata_2025-02.parquet']

In [7]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the session that is already running, if any, so
# re-running this cell does not start a second one.
spark = SparkSession.builder.appName("Project1-ETL").getOrCreate()

# Load only the 2025-01 file for a first look at the data. The path is built
# from INBOX so it follows the configured inbox location instead of repeating it.
df = spark.read.parquet(
    os.path.join(INBOX, "yellow_tripdata_2025-01.parquet")
)

# Preview the first five rows.
df.show(5)


+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|       1| 2025-01-01 00:18:38|  2025-01-01 00:26:59|              1|          1.6|         1|                 N|         229|    

In [8]:
# Print the schema Spark read from the parquet file: column names, types and nullability.
df.printSchema()


root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)
 |-- cbd_congestion_fee: double (nullable = true)



In [9]:
# Number of rows in `df` (the single 2025-01 file loaded earlier in the notebook).
df.count()

3475226

In [10]:
# Read every parquet file in the inbox at once. The glob is built from INBOX so
# it follows the configured inbox location; note that it matches all files
# there, whether or not the manifest lists them as processed.
df_all = spark.read.parquet(os.path.join(INBOX, "*.parquet"))

# Total row count across all matched files.
df_all.count()

7052769