In [1]:
from datetime import datetime

import pyspark
from pyspark.sql import SparkSession, dataframe
from pyspark.sql import functions as F
from schemas import taxi_schema

In [2]:
# Start (or reuse) a Spark session running in local mode on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

24/03/12 22:55:24 WARN Utils: Your hostname, avalon resolves to a loopback address: 127.0.1.1; using 192.168.18.2 instead (on interface eth0)
24/03/12 22:55:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/12 22:55:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Question 1

In [3]:
# Display the version of the Spark runtime backing this session.
spark.version

'3.5.1'

### Question 2

In [4]:
# Load the raw FHV CSV files for October 2019. The first line of each file is
# a header, and column types come from the project's explicit schema rather
# than being inferred.
df_fhv = spark.read.csv(
    "data/raw/fhv/2019/10",
    schema=taxi_schema["fhv"],
    header=True,
)

In [5]:
# Persist the FHV data as parquet, split into 6 partitions (one output file each).
# The repartition is applied only to the write: `df_fhv` itself is left untouched,
# so re-running this cell does not stack another shuffle on top of the previous
# one, and the later queries do not pay for a shuffle they do not need.
df_fhv.repartition(6).write.parquet("data/pq/fhv/2019/10", mode="overwrite")

                                                                                

In [6]:
# List the files written to the parquet output directory, with human-readable sizes.
!ls -lh data/pq/fhv/2019/10/

total 39M
-rw-r--r-- 1 nikki nikki 6.4M Mar 12 22:56 part-00000-84f5959c-c630-4959-bf51-2ac86399ca8b-c000.snappy.parquet
-rw-r--r-- 1 nikki nikki 6.4M Mar 12 22:56 part-00001-84f5959c-c630-4959-bf51-2ac86399ca8b-c000.snappy.parquet
-rw-r--r-- 1 nikki nikki 6.4M Mar 12 22:56 part-00002-84f5959c-c630-4959-bf51-2ac86399ca8b-c000.snappy.parquet
-rw-r--r-- 1 nikki nikki 6.4M Mar 12 22:56 part-00003-84f5959c-c630-4959-bf51-2ac86399ca8b-c000.snappy.parquet
-rw-r--r-- 1 nikki nikki 6.4M Mar 12 22:56 part-00004-84f5959c-c630-4959-bf51-2ac86399ca8b-c000.snappy.parquet
-rw-r--r-- 1 nikki nikki 6.4M Mar 12 22:56 part-00005-84f5959c-c630-4959-bf51-2ac86399ca8b-c000.snappy.parquet
-rw-r--r-- 1 nikki nikki    0 Mar 12 22:56 _SUCCESS


### Question 3

In [7]:
# Print the column names, types and nullability of the FHV DataFrame.
df_fhv.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: integer (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [11]:
# Count the trips picked up on 15 October 2019: truncate each pickup timestamp
# to midnight of its day and keep the rows that land exactly on that date.
df_fhv.where(
    F.date_trunc("day", F.col("pickup_datetime")) == F.lit(datetime(2019, 10, 15))
).count()

                                                                                

62610

### Question 4

In [29]:
# Longest trip in the dataset, in hours: take the dropoff/pickup difference in
# epoch seconds, convert it to hours, and aggregate the maximum over all rows.
df_fhv.agg(
    F.max(
        (F.unix_timestamp("dropoff_datetime") - F.unix_timestamp("pickup_datetime"))
        / 3600
    ).alias("longest_trip_hours")
).show(1)

[Stage 51:>                                                         (0 + 1) / 1]

+------------------+
|longest_trip_hours|
+------------------+
|          631152.5|
+------------------+



                                                                                

### Question 5

The Spark UI is served on local port 4040.

### Question 6

In [30]:
# Load the zones table from its parquet directory.
df_zones = spark.read.parquet("data/pq/zones")

In [32]:
# Peek at a single FHV row to see which columns are available for the join.
df_fhv.show(1)

[Stage 59:>                                                         (0 + 1) / 1]

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00310|2019-10-14 16:28:32|2019-10-14 16:32:18|         264|         213|   NULL|                B03047|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
only showing top 1 row



                                                                                

In [33]:
# Peek at a single zones row to see which columns are available for the join.
df_zones.show(1)

+----------+-------+--------------+------------+
|LocationID|Borough|          Zone|service_zone|
+----------+-------+--------------+------------+
|         1|    EWR|Newark Airport|         EWR|
+----------+-------+--------------+------------+
only showing top 1 row



In [45]:
# Least frequent pickup zone: count trips per pickup location, attach the zone
# details, and show the row with the smallest count.
(
    (df_fhv)
    .groupBy("PULocationID")
    .agg(F.count("*").alias("number_records"))
    # Inner join: pickup IDs with no match in the zones table are dropped.
    .join(df_zones, F.col("LocationID") == F.col("PULocationID"))
    # Ascending count puts the least-used zone first. The secondary key makes
    # the single displayed row deterministic when several zones tie on count.
    .orderBy(F.col("number_records").asc(), F.col("LocationID").asc())
    .show(1)
)



+------------+--------------+----------+-------+-----------+------------+
|PULocationID|number_records|LocationID|Borough|       Zone|service_zone|
+------------+--------------+----------+-------+-----------+------------+
|           2|             1|         2| Queens|Jamaica Bay|   Boro Zone|
+------------+--------------+----------+-------+-----------+------------+
only showing top 1 row



                                                                                