### Purpose of this notebook is to run the code for the homework of module 5

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Add the venv to Spark's settings: inject the venv's Python into both the driver
# and worker configuration before the session is created, so Spark finds the
# right Python interpreter.
import os
import sys

venv_python = r"C:\Sandeep SSD\Programming SSD\Data Engineering Zoomcamp\data-engineering-zoomcamp\dataenginzoomvenv\Scripts\python.exe"

# The path above is machine-specific. When it does not exist (e.g. the notebook is
# run on another machine), fall back to the interpreter running this kernel so the
# driver and the workers still share one Python.
if not os.path.isfile(venv_python):
    venv_python = sys.executable

# Ensure the workers and the driver use exactly this Python executable.
os.environ['PYSPARK_PYTHON'] = venv_python
os.environ['PYSPARK_DRIVER_PYTHON'] = venv_python


In [4]:
# Build (or reuse) a local SparkSession named 'test', passing the chosen Python
# executable to both the worker and the driver settings.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .config("spark.pyspark.python", venv_python)
    .config("spark.pyspark.driver.python", venv_python)
    .getOrCreate()
)

In [5]:
# Show the address (and therefore the port) the Spark UI is served on
ui_url = spark.sparkContext.uiWebUrl
print(ui_url)

http://192.168.0.181:4040


In [None]:
# Evaluate the session as the cell's last expression so the notebook displays
# its summary (all the Spark session info)
spark

Question 1: Install Spark and PySpark <br>
My installed version is 3.4.2

### For Question 2: Yellow October 2024

In [7]:
from pyspark.sql import types

In [None]:
# Explicit schema for the yellow taxi data.
# passenger_count, RatecodeID and payment_type are declared as LongType: they were
# written as int64 in the parquet file, IntegerType only holds int32, and Spark
# will not silently cast between the two.
yellow_columns = [
    ("VendorID", types.IntegerType()),
    ("tpep_pickup_datetime", types.TimestampType()),
    ("tpep_dropoff_datetime", types.TimestampType()),
    ("passenger_count", types.LongType()),
    ("trip_distance", types.DoubleType()),
    ("RatecodeID", types.LongType()),
    ("store_and_fwd_flag", types.StringType()),
    ("PULocationID", types.IntegerType()),
    ("DOLocationID", types.IntegerType()),
    ("payment_type", types.LongType()),
    ("fare_amount", types.DoubleType()),
    ("extra", types.DoubleType()),
    ("mta_tax", types.DoubleType()),
    ("tip_amount", types.DoubleType()),
    ("tolls_amount", types.DoubleType()),
    ("improvement_surcharge", types.DoubleType()),
    ("total_amount", types.DoubleType()),
    ("congestion_surcharge", types.DoubleType()),
]

# Every column is declared nullable.
yellow_schema = types.StructType(
    [types.StructField(name, dtype, True) for name, dtype in yellow_columns]
)

In [21]:
# Read the yellow October 2024 data using the explicit schema.
# The source path is called input_path so that it does not shadow Python's
# built-in input() for the rest of the session.
input_path = '../../Data/data/nyc-tlc/yellow_tripdata_2024-10.parquet'
output_path = '../../Data/data/nyc-tlc/yellow_tripdata_2024-10_Homework_module_5.parquet'

df_yellow = (
    spark.read
    .schema(yellow_schema)
    .parquet(input_path)
)

# Last expression of the cell: displays the DataFrame's column names and types
df_yellow

DataFrame[VendorID: int, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: bigint, trip_distance: double, RatecodeID: bigint, store_and_fwd_flag: string, PULocationID: int, DOLocationID: int, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double]

In [22]:

# Write the data back out as parquet, repartitioned into 4 partitions and
# overwriting any previous output at output_path
yellow_writer = df_yellow.repartition(4).write.mode("overwrite")
yellow_writer.parquet(output_path)

And the answer is around 24 MB per file, checked in the file explorer, which was the easiest way...

### Question 3: Count records
How many taxi trips were there on the 15th of October?
Consider only trips that started on the 15th of October.


In [25]:
from pyspark.sql import functions as F

In [31]:

# Reduce the pickup timestamp to its calendar date and count the trips that
# started on 15 October 2024
pickup_day = F.to_date(F.col("tpep_pickup_datetime"))
trips_on_15th = df_yellow.filter(pickup_day == "2024-10-15").count()
trips_on_15th



125567

Answer is 125567 taxi trips

### Question 4: Longest trip
What is the length of the longest trip in the dataset in hours?



In [33]:
# Add the trip duration in hours: casting each timestamp to long gives a number
# of seconds, so dropoff minus pickup is the trip length in seconds, and dividing
# by 3600 converts it to hours.
dropoff_seconds = F.col("tpep_dropoff_datetime").cast("long")
pickup_seconds = F.col("tpep_pickup_datetime").cast("long")
seconds_per_hour = 3600.0

df_with_duration = df_yellow.withColumn(
    "duration_hours",
    (dropoff_seconds - pickup_seconds) / seconds_per_hour,
)

df_with_duration.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|      duration_hours|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+--------------------+
|       2| 2024-10-01 02:30:44|  2024-10-01 02:48:26|              1|          3.0|         1|                 N|         162|         246|           1|       1

In [34]:
# The longest trip is simply the maximum of the duration column
longest_trip = df_with_duration.agg(
    F.max("duration_hours").alias("max_duration_hours")
)
longest_trip.show()


+------------------+
|max_duration_hours|
+------------------+
|162.61777777777777|
+------------------+



In [36]:
# Alternative using the extra duration_hours column: sort from longest to
# shortest and keep only the first row
(
    df_with_duration
    .select("duration_hours")
    .orderBy(F.col("duration_hours").desc())
    .limit(1)
    .show()
)

+------------------+
|    duration_hours|
+------------------+
|162.61777777777777|
+------------------+



So the answer is roughly 162 hours (162.62 to be precise), a very long trip...

### Question 5: User Interface
Spark’s User Interface which shows the application's dashboard runs on which local port?



In [38]:
# The web UI address of the running session shows which port the UI uses
spark_ui_address = spark.sparkContext.uiWebUrl
print(spark_ui_address)

http://192.168.0.181:4040


So the answer is port 4040

### Question 6: Least frequent pickup location zone
Load the zone lookup data into a temp view in Spark:

wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone?



In [39]:
# Load the zone lookup table. Note that it is read from a parquet directory,
# not directly from the CSV (presumably converted from taxi_zone_lookup.csv
# earlier - confirm the directory exists before running).
zones_path = '../../Data/data/csv/zones/spark_parquet/'
df_zones = spark.read.parquet(zones_path)

In [40]:
# Preview the first rows of the zone lookup table
df_zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [44]:
# Attach the zone details to every trip by matching the trip's PULocationID
# against the zone table's LocationID (inner join)
pickup_matches_zone = df_yellow.PULocationID == df_zones.LocationID
df_joined = df_yellow.join(df_zones, on=pickup_matches_zone, how="inner")

In [45]:
# Preview the first rows of the trips joined with their pickup zone
df_joined.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+----------+---------+--------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|LocationID|  Borough|                Zone|service_zone|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+----------+---------+--------------------+------------+
|       2| 2024-10-01 02:30:44|  2024-10-01 02:48:26|     

In [None]:
# Group by zone, count the trips, and order by count ascending so the least used
# zone comes first; limit to 1 to see just that one.
# truncate=False prints the complete zone name: by default show() cuts strings
# longer than 20 characters, which would hide part of the answer.
(
    df_joined
    .groupBy('Zone')
    .count()
    .orderBy('count', ascending=True)
    .limit(1)
    .show(truncate=False)
)

+--------------------+-----+
|                Zone|count|
+--------------------+-----+
|Governor's Island...|    1|
+--------------------+-----+



So the answer is Governor's Island, with a count of 1