In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types, functions as F

In [2]:
# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

24/08/07 12:25:41 WARN Utils: Your hostname, codespaces-0918c7 resolves to a loopback address: 127.0.0.1; using 10.0.4.201 instead (on interface eth0)
24/08/07 12:25:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/07 12:25:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Glob patterns for the parquet input directories, one per taxi service.
data_yellow = 'taxi_data/yellow/pq/*/'
data_green = 'taxi_data/green/pq/*/'

In [4]:
# Explicit schema applied when reading the green taxi parquet files.
# Every column is declared nullable; the order below is the column order of the schema.
green_schema = types.StructType([
    types.StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('VendorID', types.IntegerType),
        ('pickup_datetime', types.TimestampType),
        ('dropoff_datetime', types.TimestampType),
        ('store_and_fwd_flag', types.StringType),
        ('RatecodeID', types.IntegerType),
        ('PULocationID', types.IntegerType),
        ('DOLocationID', types.IntegerType),
        ('passenger_count', types.IntegerType),
        ('trip_distance', types.DoubleType),
        ('fare_amount', types.DoubleType),
        ('extra', types.DoubleType),
        ('mta_tax', types.DoubleType),
        ('tip_amount', types.DoubleType),
        ('tolls_amount', types.DoubleType),
        ('ehail_fee', types.DoubleType),
        ('improvement_surcharge', types.DoubleType),
        ('total_amount', types.DoubleType),
        ('payment_type', types.IntegerType),
        ('trip_type', types.IntegerType),
        ('congestion_surcharge', types.DoubleType),
    )
])

In [5]:
# Read the green trips using the explicit schema instead of the one stored in the files.
df_green = (
    spark.read
    .schema(green_schema)
    .parquet(data_green)
)

In [6]:
# Read the yellow trips; no explicit schema is supplied, so the files' own schema is used.
df_yellow = spark.read.parquet(data_yellow)

                                                                                

In [7]:
# Give the yellow timestamp columns the same names the green data uses,
# so both datasets can be queried with identical SQL.
df_yellow = (
    df_yellow
    .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')
)

In [8]:
# Register the green trips as the temp view 'green_data' so spark.sql queries can reference it.
df_green.createOrReplaceTempView('green_data')

In [9]:
# Register the yellow trips as the temp view 'yellow_data' so spark.sql queries can reference it.
df_yellow.createOrReplaceTempView('yellow_data')

In [10]:
# Hourly green-taxi revenue and trip count per pickup zone.
# The lower bound is inclusive (>=): the original strict `>` compared against
# midnight of 2020-01-01 and therefore dropped pickups at exactly
# 2020-01-01 00:00:00. The full timestamp literal makes the boundary explicit.
df_result_green = spark.sql("""
SELECT
    PULocationID AS zone,
    date_trunc('hour', pickup_datetime) AS revenue_hour,

    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    green_data
WHERE
    pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    zone, revenue_hour
""")

In [11]:
# Hourly yellow-taxi revenue and trip count per pickup zone.
# The lower bound is inclusive (>=): the original strict `>` compared against
# midnight of 2020-01-01 and therefore dropped pickups at exactly
# 2020-01-01 00:00:00. The full timestamp literal makes the boundary explicit.
df_result_yellow = spark.sql("""
SELECT
    PULocationID AS zone,
    date_trunc('hour', pickup_datetime) AS revenue_hour,

    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    yellow_data
WHERE
    pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    zone, revenue_hour
""")

In [None]:
# df_result_yellow.show()

In [None]:
# df_result_green.write.parquet('taxi_data/report/revenue/green', mode='overwrite')

In [None]:
# df_result_yellow \
#     .repartition(20) \
#     .write.parquet('taxi_data/report/revenue/yellow', mode='overwrite')

                                                                                

In [15]:
# Prefix the aggregate columns with the service name so the two result sets
# can be joined without their metric columns colliding.
df_result_green = (
    df_result_green
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_number_records')
)

df_result_yellow = (
    df_result_yellow
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_number_records')
)

In [16]:
# Full outer join on (revenue_hour, zone): a pair seen for only one service is kept,
# with the other service's columns left null.
df_join = df_result_green.join(
    df_result_yellow,
    on=['revenue_hour', 'zone'],
    how='outer',
)

In [18]:
# Write the combined report as 20 partitions, replacing any previous output at this path.
(
    df_join
    .repartition(20)
    .write
    .parquet('taxi_data/report/revenue/total', mode='overwrite')
)

                                                                                

In [19]:
# Zone lookup table, joined on its LocationID column further down.
df_zones = spark.read.parquet('taxi_data/zones')
# Reload the combined report from the files written above; this rebinds df_join
# to the persisted copy.
df_join = spark.read.parquet('taxi_data/report/revenue/total')

In [21]:
# Attach the zone lookup attributes to each revenue row (inner join on the zone id).
# DataFrame.drop returns a new DataFrame rather than modifying in place, so its
# result is assigned here; previously it was only displayed and then discarded,
# leaving LocationID in df_result.
df_result = (
    df_join
    .join(df_zones, df_join.zone == df_zones.LocationID)
    .drop('LocationID')
)
# Display the resulting schema as the cell output.
df_result

DataFrame[revenue_hour: timestamp, zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint, Borough: string, Zone: string, service_zone: string]

In [23]:
# Persist the zone-enriched report.
# - LocationID is dropped by name; this is a no-op if it is already gone.
# - The numeric zone id is dropped via a column reference (df_join.zone), not
#   the string 'zone'. Spark resolves names case-insensitively by default
#   (spark.sql.caseSensitive=false), so a name-based drop of 'zone' also
#   removes the lookup table's 'Zone' name column from the output.
# - The two drops are chained because older PySpark versions accept only a
#   single Column argument per drop() call.
# - mode='overwrite' matches the other report write and lets the notebook be
#   re-run; the default mode raises an error once the path already exists.
(
    df_result
    .drop('LocationID')
    .drop(df_join.zone)
    .write
    .parquet('taxi_data/temp/revenue-zones', mode='overwrite')
)

                                                                                