In [1]:
from datetime import datetime

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Start (or reuse) a Spark session running in local mode ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

24/03/09 12:24:12 WARN Utils: Your hostname, avalon resolves to a loopback address: 127.0.1.1; using 192.168.18.2 instead (on interface eth0)
24/03/09 12:24:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/09 12:24:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Glob patterns for the 2020 and 2021 parquet folders of each dataset.
paths_green = [
    "data/pq/green/2020/*",
    "data/pq/green/2021/*",
]
paths_yellow = [
    "data/pq/yellow/2020/*",
    "data/pq/yellow/2021/*",
]

# Each dataset is read into a single DataFrame spanning both year folders.
df_green, df_yellow = (
    spark.read.parquet(*paths) for paths in (paths_green, paths_yellow)
)

                                                                                

### Using a SQL Query

In [4]:
# Register both DataFrames as temporary views so they can be queried by name
# from spark.sql().
for view_name, frame in {"green": df_green, "yellow": df_yellow}.items():
    frame.createOrReplaceTempView(view_name)

In [5]:
# Aggregate the "green" view per pickup hour and pickup zone: total amount and
# number of records, for pickups on or after 2020-01-01, ordered by hour, zone.
#
# String literals use single quotes (standard SQL). Double-quoted text is read
# as an identifier when spark.sql.ansi.enabled and
# spark.sql.ansi.doubleQuotedIdentifiers are both on, which would break
# date_trunc("hour", ...) and the date comparison.
df_green_revenue = spark.sql(
    """
SELECT
    date_trunc('hour', lpep_pickup_datetime) AS hour,
    PULocationID AS zone,

    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    green
WHERE
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
ORDER BY
    1, 2
"""
)

In [6]:
# Preview the SQL result: first 20 rows, long cell values truncated
# (the defaults of DataFrame.show, spelled out).
df_green_revenue.show(n=20, truncate=True)

                                                                                

+-------------------+----+------------------+--------------+
|               hour|zone|            amount|number_records|
+-------------------+----+------------------+--------------+
|2020-01-01 00:00:00|   7| 769.7299957275391|            45|
|2020-01-01 00:00:00|  17| 195.0299997329712|             9|
|2020-01-01 00:00:00|  18| 7.800000190734863|             1|
|2020-01-01 00:00:00|  22|15.800000190734863|             1|
|2020-01-01 00:00:00|  24| 87.60000038146973|             3|
|2020-01-01 00:00:00|  25| 531.0000057220459|            26|
|2020-01-01 00:00:00|  29| 61.29999923706055|             1|
|2020-01-01 00:00:00|  32| 68.94999885559082|             2|
|2020-01-01 00:00:00|  33|317.26999831199646|            11|
|2020-01-01 00:00:00|  35| 129.9600019454956|             5|
|2020-01-01 00:00:00|  36| 295.3400011062622|            11|
|2020-01-01 00:00:00|  37|  175.669997215271|             6|
|2020-01-01 00:00:00|  38| 98.79000091552734|             2|
|2020-01-01 00:00:00|  4

### Using the PySpark API

In [7]:
# DataFrame-API version of the hourly revenue report: keep pickups on or after
# 2020-01-01, group by truncated pickup hour and pickup zone, then sum the
# amounts and count the rows in each group.
df_green_revenue = (
    df_green
    .where(F.col("lpep_pickup_datetime") >= datetime(2020, 1, 1))
    .groupBy(
        F.date_trunc("hour", "lpep_pickup_datetime").alias("hour"),
        F.col("PULocationID").alias("zone"),
    )
    .agg(
        F.sum("total_amount").alias("amount"),
        # count("*") counts every row in the group, like COUNT(1) in SQL.
        F.count("*").alias("number_records"),
    )
    .orderBy("hour", "zone")
)

In [8]:
# Preview the DataFrame-API result: first 20 rows, long cell values truncated
# (the defaults of DataFrame.show, spelled out).
df_green_revenue.show(n=20, truncate=True)



+-------------------+----+------------------+--------------+
|               hour|zone|            amount|number_records|
+-------------------+----+------------------+--------------+
|2020-01-01 00:00:00|   7| 769.7299957275391|            45|
|2020-01-01 00:00:00|  17| 195.0299997329712|             9|
|2020-01-01 00:00:00|  18| 7.800000190734863|             1|
|2020-01-01 00:00:00|  22|15.800000190734863|             1|
|2020-01-01 00:00:00|  24| 87.60000038146973|             3|
|2020-01-01 00:00:00|  25| 531.0000057220459|            26|
|2020-01-01 00:00:00|  29| 61.29999923706055|             1|
|2020-01-01 00:00:00|  32| 68.94999885559082|             2|
|2020-01-01 00:00:00|  33|317.26999831199646|            11|
|2020-01-01 00:00:00|  35| 129.9600019454956|             5|
|2020-01-01 00:00:00|  36| 295.3400011062622|            11|
|2020-01-01 00:00:00|  37|  175.669997215271|             6|
|2020-01-01 00:00:00|  38| 98.79000091552734|             2|
|2020-01-01 00:00:00|  4

                                                                                

In [9]:
# Persist the report as parquet, replacing any output from a previous run.
# NOTE(review): repartition(20) reshuffles the rows after the orderBy, so the
# hour/zone ordering is presumably not preserved in the written files — confirm
# whether the sort is needed before writing.
(
    df_green_revenue
    .repartition(20)
    .write
    .mode("overwrite")
    .parquet("data/report/revenue/green")
)

                                                                                