In [1]:
import pyspark
from pyspark.sql import SparkSession, types

### Let's start a Spark Session

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/25 19:11:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Question 1. Install Spark and PySpark

> - Install Spark
> - Run PySpark
> - Create a local spark session
> - Execute spark.version

In [3]:
!spark-shell

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/25 19:11:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/25 19:11:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Spark context Web UI available at http://de-zoomcamp.europe-west1-b.c.lexical-passkey-375922.internal:4041
Spark context available as 'sc' (master = local[*], app id = local-1677352313068).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.3.2
      /_/
         
Using Scala version 2.12.15 (OpenJDK 64-Bit Server VM, Java 11.0.2)
Type in expressions to have them evaluated.
Type :help for more information.
[35m
scala> [0m
[35m
scala> [0m

> Q1 ANSWER: **res0: String = 3.3.2**

## Question 2: HVFHW June 2021

> - Read it with Spark using the same schema as we did in the lessons.
> - We will use this dataset for all the remaining questions.
> - Repartition it to 12 partitions and save it to parquet.

> **What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)?**

### Let's convert raw data to parquet files

In [6]:
fhvhv_schema = types.StructType([
    types.StructField('dispatching_base_num', types.IntegerType(), True),
    types.StructField("pickup_datetime", types.TimestampType(), True),
    types.StructField("dropOff_datetime", types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField("SR_Flag", types.StringType(), True),    
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

In [7]:
range_color = ['fhvhv']
range_year = [2021]
range_month = [6]

for color in range_color:
    print(f"===== COLOR: {color} =====")
    
    for year in range_year:
        print(f"----- YEAR: {year} -----")
        
        for month in range_month:
            print(f"----- MONTH: {month} -----")
            
            try:
                print(f'processing {color} data for {year}/{month}')

                input_path = f'data/raw/{color}/{year}/{month:02d}/'
                output_path = f'data/pq/{color}/{year}/{month:02d}/'

                # df_tmp = spark.read \
                #     .option("header", "true") \
                #     .option("inferSchema" , "true") \
                #     .csv(input_path)
                
                df_tmp = spark.read \
                    .option("header", "true") \
                    .schema(eval(f"{color}_schema")) \
                    .csv(input_path)

                df_tmp \
                    .repartition(12) \
                    .write.parquet(output_path, mode='overwrite')
            except Exception as e:
                print(e)

===== COLOR: fhvhv =====
----- YEAR: 2021 -----
----- MONTH: 6 -----
processing fhvhv data for 2021/6


                                                                                

In [10]:
!ls -lh data/pq/fhvhv/2021/06/

total 275M
-rw-r--r-- 1 valkea valkea   0 Feb 25 19:15 _SUCCESS
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00000-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00001-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00002-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00003-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00004-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00005-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00006-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M Feb 25 19:15 part-00007-2cc3d426-cee8-4166-a42a-a13523754e31-c000.snappy.parquet
-rw-r--r-- 1 valkea valkea 23M

> Q2 ANSWER: **23M** ==> **24MB**

## Question 3: Count records

> How many taxi trips were there on June 15? (Consider only trips that started on June 15)

### Let's load the parquet file

In [11]:
df_fhvhv = spark.read.parquet('data/pq/fhvhv/2021/06') # data/pq/green/all_years/all_months

In [12]:
df_fhvhv.printSchema()

root
 |-- dispatching_base_num: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



### Let's make SQL requests

In [13]:
df_fhvhv.createOrReplaceTempView('fhvhv_data')

In [14]:
df_fhvhv_result = spark.sql("""
    SELECT     
        COUNT(1) AS number_records
    FROM fhvhv_data
""").show()

+--------------+
|number_records|
+--------------+
|      14961892|
+--------------+



In [17]:
df_fhvhv_result = spark.sql("""
    SELECT     
        COUNT(1) AS number_records
    FROM fhvhv_data
    WHERE pickup_datetime >= '2021-06-15 00:00:00' AND pickup_datetime < '2021-06-16 00:00:00'
""").show()

[Stage 7:>                                                          (0 + 4) / 4]

+--------------+
|number_records|
+--------------+
|        452470|
+--------------+



                                                                                

In [19]:
df_fhvhv_result = spark.sql("""
    SELECT     
        date_trunc('day', pickup_datetime) AS day,
        COUNT(1) AS number_records
    FROM fhvhv_data
    GROUP BY 1
    ORDER BY 1
""").show()

[Stage 10:>                                                         (0 + 4) / 4]

+-------------------+--------------+
|                day|number_records|
+-------------------+--------------+
|2021-06-01 00:00:00|        417375|
|2021-06-02 00:00:00|        457339|
|2021-06-03 00:00:00|        521408|
|2021-06-04 00:00:00|        538917|
|2021-06-05 00:00:00|        604903|
|2021-06-06 00:00:00|        522753|
|2021-06-07 00:00:00|        425771|
|2021-06-08 00:00:00|        462554|
|2021-06-09 00:00:00|        483353|
|2021-06-10 00:00:00|        504108|
|2021-06-11 00:00:00|        549286|
|2021-06-12 00:00:00|        591339|
|2021-06-13 00:00:00|        509039|
|2021-06-14 00:00:00|        426672|
|2021-06-15 00:00:00|        452470|
|2021-06-16 00:00:00|        479776|
|2021-06-17 00:00:00|        497133|
|2021-06-18 00:00:00|        540056|
|2021-06-19 00:00:00|        601189|
|2021-06-20 00:00:00|        491630|
+-------------------+--------------+
only showing top 20 rows



                                                                                

> Q3 ANSWER: **452470** ==> **452,470**

## Question 4: Longest trip for each day

> How long was the longest trip in Hours?

In [27]:
df_fhvhv_result = spark.sql("""
    SELECT     
        DATEDIFF(hour, pickup_datetime, dropOff_datetime) as duration_hours,
        COUNT(1) AS number_records
    FROM fhvhv_data
    GROUP BY 1
    ORDER BY 1 DESC
    LIMIT 1
""").show()

[Stage 22:>                                                         (0 + 4) / 4]

+--------------+--------------+
|duration_hours|number_records|
+--------------+--------------+
|            66|             1|
+--------------+--------------+



                                                                                

> Q4 ANSWER: **66** ==> **66.87**

## Question 5: User Interface

> Spark’s User Interface which shows application's dashboard runs on which local port?

> Q5 ANSWER: **4040**

## Question 6: Most frequent pickup location zone

> Load the zone lookup data into a temp view in Spark Zone Data

> Using the zone lookup data and the fhvhv June 2021 data, what is the name of the most frequent pickup location zone?

In [30]:
df_zone = spark.read \
    .option("header", "true") \
    .csv('data/taxi+_zone_lookup.csv')

In [31]:
df_zone.printSchema()

root
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [32]:
df_fhvhv.printSchema()

root
 |-- dispatching_base_num: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [35]:
df_join = df_fhvhv.join(df_zone, df_fhvhv.PULocationID == df_zone.LocationID)

In [38]:
df_join.printSchema()

root
 |-- dispatching_base_num: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [36]:
df_join.createOrReplaceTempView('join_data')

In [43]:
df_join_result = spark.sql("""
    SELECT     
        Zone,
        COUNT(1) AS number_records
    FROM join_data
    GROUP BY 1
    ORDER BY 2 DESC
    LIMIT 1
""").show()

[Stage 45:>                                                         (0 + 4) / 4]

+-------------------+--------------+
|               Zone|number_records|
+-------------------+--------------+
|Crown Heights North|        231279|
+-------------------+--------------+



                                                                                

> Q6 ANSWER: **Crown Heights North**