In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a local Spark session; "local[*]" runs Spark in-process
# with one worker thread per available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/24 16:29:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Let's prepare the Green revenue report

In [3]:
# Read every green-taxi parquet file; the two wildcards cover
# data/pq/green/<year>/<month>.
df_green = spark.read.parquet('data/pq/green/*/*')

                                                                                

In [4]:
# Register the DataFrame as the temp view 'green_data' so the SQL query
# below can refer to it by name.
df_green.createOrReplaceTempView('green_data')

In [5]:
# Hourly green-taxi revenue per pickup zone.
#   hour           - lpep_pickup_datetime truncated to the hour
#   zone           - PULocationID
#   amount         - sum of total_amount for that hour/zone
#   number_records - number of rows for that hour/zone
# Only pickups on or after 2020-01-01 00:00:00 are included.
# GROUP BY 1,2 / ORDER BY 1,2 refer to the first two SELECT columns (hour, zone).
# NOTE(review): the next cell calls repartition(20) before writing, so this
# ORDER BY is presumably not reflected in the written files - confirm whether
# the sort is still needed.
df_green_result = spark.sql("""
    SELECT 
    
    date_trunc('hour', lpep_pickup_datetime) AS hour, 
    PULocationID AS zone,
    
    SUM(total_amount) AS amount,
    COUNT(1) AS number_records

    FROM green_data
    WHERE lpep_pickup_datetime >= '2020-01-01 00:00:00'
    GROUP BY 1,2
    ORDER BY 1,2
""")

In [6]:
# Redistribute the hourly green report into 20 partitions and write it as
# parquet, replacing any output left there by a previous run.
(
    df_green_result
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('data/report/revenue/green/')
)

                                                                                

#### Let's prepare the Yellow revenue report

In [7]:
# Read every yellow-taxi parquet file; the two wildcards cover
# data/pq/yellow/<year>/<month>.
df_yellow = spark.read.parquet('data/pq/yellow/*/*')

In [8]:
# Register the DataFrame as the temp view 'yellow_data' so the SQL query
# below can refer to it by name.
df_yellow.createOrReplaceTempView('yellow_data')

In [9]:
# Hourly yellow-taxi revenue per pickup zone.
#   hour           - tpep_pickup_datetime truncated to the hour
#   zone           - PULocationID
#   amount         - sum of total_amount for that hour/zone
#   number_records - number of rows for that hour/zone
# Only pickups on or after 2020-01-01 00:00:00 are included.
# GROUP BY 1,2 / ORDER BY 1,2 refer to the first two SELECT columns (hour, zone).
# NOTE(review): the next cell calls repartition(20) before writing, so this
# ORDER BY is presumably not reflected in the written files - confirm whether
# the sort is still needed.
df_yellow_result = spark.sql("""
    SELECT 
    
    date_trunc('hour', tpep_pickup_datetime) AS hour, 
    PULocationID AS zone,
    
    SUM(total_amount) AS amount,
    COUNT(1) AS number_records

    FROM yellow_data
    WHERE tpep_pickup_datetime >= '2020-01-01 00:00:00'
    GROUP BY 1,2
    ORDER BY 1,2
""")

In [10]:
# Redistribute the hourly yellow report into 20 partitions and write it as
# parquet, replacing any output left there by a previous run.
(
    df_yellow_result
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('data/report/revenue/yellow/')
)

                                                                                

#### Let's play with JOINS

In [24]:
# Reload both hourly revenue reports from the parquet directories written by
# the earlier cells, instead of reusing the in-memory query results.
df_green_revenue = spark.read.parquet('data/report/revenue/green/')
df_yellow_revenue = spark.read.parquet('data/report/revenue/yellow/')

In [25]:
# Prefix the measure columns with the taxi colour so both reports can sit side
# by side after the join; the join keys `hour` and `zone` keep their names.
df_join_green = (
    df_green_revenue
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_number_records')
)

df_join_yellow = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_number_records')
)

In [26]:
# Full outer join on (hour, zone): an hour/zone pair is kept even when only one
# of the two services has records for it; the other side's columns are null.
df_join = df_join_green.join(
    other=df_join_yellow,
    on=['hour', 'zone'],
    how='outer',
)

In [27]:
# Preview the first rows of the joined green/yellow report.
df_join.show()



+-------------------+----+------------------+--------------------+------------------+---------------------+
|               hour|zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|
+-------------------+----+------------------+--------------------+------------------+---------------------+
|2020-01-01 00:00:00|   3|              null|                null|              25.0|                    1|
|2020-01-01 00:00:00|   4|              null|                null|1004.3000000000002|                   57|
|2020-01-01 00:00:00|   7| 769.7299999999996|                  45| 455.1700000000001|                   38|
|2020-01-01 00:00:00|  12|              null|                null|             107.0|                    6|
|2020-01-01 00:00:00|  37|            175.67|                   6|161.60999999999999|                    7|
|2020-01-01 00:00:00|  40|168.97999999999996|                   8|             89.97|                    5|
|2020-01-01 00:00:00|  45|  

[Stage 56:>                                                         (0 + 1) / 1]                                                                                

In [28]:
# Redistribute the combined green/yellow report into 20 partitions and write it
# as parquet, replacing any output left there by a previous run.
(
    df_join
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('data/report/revenue/total')
)

                                                                                

### Join the revenue report with the zones lookup table

In [42]:
# Reload the combined green/yellow report from the parquet directory written
# above (data/report/revenue/total).
df_join_read = spark.read.parquet('data/report/revenue/total/')

In [43]:
# Load the zone lookup table (columns: LocationID, Borough, Zone, service_zone).
df_zones = spark.read.parquet('data/zones/')

In [44]:
# Preview the first rows of the zone lookup table.
df_zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [51]:
# Attach the lookup attributes to each revenue row. The inner join keeps only
# rows whose `zone` id has a matching `LocationID` in the lookup table.
# NOTE(review): the result carries both `zone` (location id) and `Zone` (zone
# name), which differ only by case - confirm downstream code can tell them
# apart, or rename/drop one of them.
df_join2 = df_join_read.join(
    other=df_zones,
    on=df_zones.LocationID == df_join_read.zone,
    how='inner',
)

In [52]:
# Preview the enriched report without `LocationID`, which the join condition
# makes equal to `zone` on every row.
df_join2.drop('LocationID').show()

+-------------------+----+------------+--------------------+------------------+---------------------+---------+--------------------+------------+
|               hour|zone|green_amount|green_number_records|     yellow_amount|yellow_number_records|  Borough|                Zone|service_zone|
+-------------------+----+------------+--------------------+------------------+---------------------+---------+--------------------+------------+
|2021-07-23 05:00:00| 107|        null|                null|            266.18|                   13|Manhattan|            Gramercy| Yellow Zone|
|2020-08-01 05:00:00| 259|        null|                null|              73.5|                    1|    Bronx|  Woodlawn/Wakefield|   Boro Zone|
|2020-10-30 13:00:00| 262|        null|                null|           1170.12|                   72|Manhattan|      Yorkville East| Yellow Zone|
|2020-10-10 00:00:00| 164|        null|                null| 646.6199999999999|                   36|Manhattan|       Midtow