In [1]:
import pyspark

In [2]:
# Display the installed PySpark version for reproducibility.
pyspark.__version__

'3.5.4'

In [3]:
# Display where the pyspark package was imported from, to confirm which
# environment's installation the kernel is using.
pyspark.__file__

'/Users/tobi/Documents/projects/citibike-stream/citivenv/lib/python3.13/site-packages/pyspark/__init__.py'

In [4]:
from pyspark.sql import SparkSession

In [5]:
import os

# Display the kernel's current working directory. The JDBC driver jar is
# referenced with a relative path ("./python_app/jars/..."), so the notebook
# must be run from the directory that contains `python_app`.
# os.getcwd() is used instead of the bare `pwd` automagic so the cell does not
# depend on IPython's automagic setting or on no variable being named `pwd`.
os.getcwd()

'/Users/tobi/Documents/projects/citibike-stream'

In [6]:
# Path to the PostgreSQL JDBC driver jar, relative to the current working
# directory; it is handed to Spark through the "spark.jars" setting.
jdbc_driver_path = "./python_app/jars/postgresql-42.5.4.jar"

In [7]:
# Create (or reuse) a SparkSession that runs locally on all available cores
# and has the PostgreSQL JDBC driver jar attached via "spark.jars".
spark = (
    SparkSession.builder
    .appName("testCitiBikeStationStatusTransform")
    .config("spark.jars", jdbc_driver_path)
    .master("local[*]")
    .getOrCreate()
)

24/12/29 11:40:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
import os

# PostgreSQL connection parameters.
# Each value is read from the environment variable of the same name so that
# credentials and the target host do not have to be edited in (or committed
# with) the notebook. The fallbacks are the values this cell previously
# hardcoded, so the notebook behaves as before when nothing is set.
# NOTE(review): the fallback password is a development credential kept only
# for backward compatibility; prefer exporting POSTGRES_PASSWORD instead.
POSTGRESQL_HOST = os.environ.get("POSTGRESQL_HOST", "localhost")
POSTGRESQL_PORT = os.environ.get("POSTGRESQL_PORT", "5432")
POSTGRES_DB = os.environ.get("POSTGRES_DB", "citibike_db")
POSTGRES_USER = os.environ.get("POSTGRES_USER", "citibike_user")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "citibike_pass")

# JDBC URL built from the host, port and database name above.
jdbc_url = f"jdbc:postgresql://{POSTGRESQL_HOST}:{POSTGRESQL_PORT}/{POSTGRES_DB}"

# Properties passed to the Spark JDBC reader: login credentials and the
# JDBC driver class to load.
connection_properties = {
    "user": POSTGRES_USER,
    "password": POSTGRES_PASSWORD,
    "driver": "org.postgresql.Driver"
}

In [9]:
# Read the "station_status" table over JDBC into a DataFrame, using the URL
# and connection properties defined earlier in the notebook.
df = spark.read.jdbc(
    url=jdbc_url,
    table="station_status",
    properties=connection_properties,
)

In [10]:
# Print the column names, types and nullability Spark derived for the
# station_status table read over JDBC.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- station_id: string (nullable = true)
 |-- num_bikes_available: integer (nullable = true)
 |-- num_docks_available: integer (nullable = true)
 |-- is_installed: boolean (nullable = true)
 |-- is_renting: boolean (nullable = true)
 |-- is_returning: boolean (nullable = true)
 |-- last_reported: timestamp (nullable = true)
 |-- inserted_at: timestamp (nullable = true)



In [11]:
# Preview the first rows of the table (show() prints 20 rows by default and
# truncates long cell values).
df.show()

                                                                                

+---+--------------------+-------------------+-------------------+------------+----------+------------+-------------------+--------------------+
| id|          station_id|num_bikes_available|num_docks_available|is_installed|is_renting|is_returning|      last_reported|         inserted_at|
+---+--------------------+-------------------+-------------------+------------+----------+------------+-------------------+--------------------+
|  1|816e50eb-dc4b-47d...|                  0|                  0|       false|     false|       false|2024-11-05 17:22:24|2024-12-27 20:00:...|
|  2|566a6389-5c22-49c...|                  0|                  0|       false|     false|       false|2024-11-11 15:41:13|2024-12-27 20:00:...|
|  3|64f0f28c-bedc-42d...|                  0|                102|       false|     false|       false|2024-12-27 16:05:08|2024-12-27 20:00:...|
|  4|66de85d2-0aca-11e...|                 27|                 17|        true|      true|        true|2024-12-27 19:57:08|2024-12

In [12]:
# List the (column name, Spark type name) pairs of the DataFrame.
df.dtypes

[('id', 'int'),
 ('station_id', 'string'),
 ('num_bikes_available', 'int'),
 ('num_docks_available', 'int'),
 ('is_installed', 'boolean'),
 ('is_renting', 'boolean'),
 ('is_returning', 'boolean'),
 ('last_reported', 'timestamp'),
 ('inserted_at', 'timestamp')]

In [13]:
from pyspark.sql import types

In [14]:
# Explicit schema describing station_status rows: `id` is declared
# non-nullable, every other column is nullable.
# NOTE(review): the schema printed for the JDBC-loaded table reports `id` as
# nullable = true, and this schema is not applied in the cells shown here;
# confirm which nullability is intended before using it.
df_citibike_schema = types.StructType(
    [types.StructField("id", types.IntegerType(), False)]
    + [
        types.StructField(column, dtype, True)
        for column, dtype in (
            ("station_id", types.StringType()),
            ("num_bikes_available", types.IntegerType()),
            ("num_docks_available", types.IntegerType()),
            ("is_installed", types.BooleanType()),
            ("is_renting", types.BooleanType()),
            ("is_returning", types.BooleanType()),
            ("last_reported", types.TimestampType()),
            ("inserted_at", types.TimestampType()),
        )
    ]
)

In [15]:
# Register the DataFrame as a temporary view named "station_status" so the
# following cells can query it with spark.sql.
df.createOrReplaceTempView("station_status")

In [16]:
# Sanity check: query the temporary view through Spark SQL and preview rows.
spark.sql("SELECT * FROM station_status").show()

[Stage 1:>                                                          (0 + 1) / 1]

+---+--------------------+-------------------+-------------------+------------+----------+------------+-------------------+--------------------+
| id|          station_id|num_bikes_available|num_docks_available|is_installed|is_renting|is_returning|      last_reported|         inserted_at|
+---+--------------------+-------------------+-------------------+------------+----------+------------+-------------------+--------------------+
|  1|816e50eb-dc4b-47d...|                  0|                  0|       false|     false|       false|2024-11-05 17:22:24|2024-12-27 20:00:...|
|  2|566a6389-5c22-49c...|                  0|                  0|       false|     false|       false|2024-11-11 15:41:13|2024-12-27 20:00:...|
|  3|64f0f28c-bedc-42d...|                  0|                102|       false|     false|       false|2024-12-27 16:05:08|2024-12-27 20:00:...|
|  4|66de85d2-0aca-11e...|                 27|                 17|        true|      true|        true|2024-12-27 19:57:08|2024-12

                                                                                

In [17]:
# Count records per calendar day of `last_reported`, most recent day first.
# The query result is kept in `agg_day` and displayed in a separate call:
# DataFrame.show() only prints and returns None, so chaining it onto the
# assignment would bind `agg_day` to None instead of the DataFrame.
agg_day = \
    spark.sql("""
              select
                date_trunc('day', last_reported) as date,
                count(*) as total_records
              from station_status
              group by date
              order by date desc
              """)
agg_day.show()

[Stage 2:>                                                          (0 + 1) / 1]

+-------------------+-------------+
|               date|total_records|
+-------------------+-------------+
|2024-12-29 00:00:00|        63829|
|2024-12-28 00:00:00|       234297|
|2024-12-27 00:00:00|       289734|
|2024-12-19 00:00:00|          266|
|2024-12-16 00:00:00|          266|
|2024-12-11 00:00:00|          266|
|2024-12-05 00:00:00|          266|
|2024-12-04 00:00:00|          266|
|2024-12-02 00:00:00|          266|
|2024-11-25 00:00:00|          266|
|2024-11-19 00:00:00|          266|
|2024-11-14 00:00:00|          266|
|2024-11-13 00:00:00|          266|
|2024-11-12 00:00:00|          798|
|2024-11-11 00:00:00|          266|
|2024-11-05 00:00:00|          266|
|2024-10-24 00:00:00|          532|
|2024-10-22 00:00:00|          266|
|2024-08-26 00:00:00|          266|
|2024-07-14 00:00:00|          266|
+-------------------+-------------+
only showing top 20 rows



                                                                                

In [18]:
# Fetch every record for one specific station to inspect its history.
# The query result is kept in `check_station_id` and displayed in a separate
# call: DataFrame.show() only prints and returns None, so chaining it onto the
# assignment would bind `check_station_id` to None instead of the DataFrame.
check_station_id = \
    spark.sql("""
              select
                *
              from station_status
              where station_id = '1815968345907104578'
              -- group by station_id
              -- order by total_records desc
              """)
check_station_id.show()

+-----+-------------------+-------------------+-------------------+------------+----------+------------+-------------------+--------------------+
|   id|         station_id|num_bikes_available|num_docks_available|is_installed|is_renting|is_returning|      last_reported|         inserted_at|
+-----+-------------------+-------------------+-------------------+------------+----------+------------+-------------------+--------------------+
| 1869|1815968345907104578|                  5|                 23|        true|      true|        true|2024-12-27 19:58:11|2024-12-27 20:00:...|
| 4187|1815968345907104578|                  5|                 23|        true|      true|        true|2024-12-27 19:59:29|2024-12-27 20:01:...|
| 6253|1815968345907104578|                  5|                 23|        true|      true|        true|2024-12-27 19:59:29|2024-12-27 20:02:...|
| 8663|1815968345907104578|                  5|                 23|        true|      true|        true|2024-12-27 20:01:30|