In [1]:
import os

import pyspark
from pyspark.sql import SparkSession

In [2]:
# pyspark.__version__
# pyspark.__file__
# pwd

In [3]:
# Path (relative to the notebook's working directory) of the PostgreSQL JDBC
# driver jar; it is handed to Spark through the "spark.jars" setting below.
jdbc_driver_path = "./python_app/jars/postgresql-42.5.4.jar"

# Build a new session, or reuse an already running one, on a local master.
spark = (
    SparkSession.builder
    .appName("testCitiBikeStationStatusTransform")
    .config("spark.jars", jdbc_driver_path)
    .config("spark.driver.memory", "8g")
    .config("spark.memory.fraction", "0.8")
    .master("local[4]")
    .getOrCreate()
)

# NOTE(review): INFO is verbose and floods the cell outputs with scheduler
# logs; kept as-is because it was set explicitly.
spark.sparkContext.setLogLevel("INFO")

24/12/31 13:47:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# PostgreSQL connection parameters.
# Every value can be overridden with an environment variable of the same name,
# so real credentials never have to be written into the notebook. The
# fallbacks are the values this notebook previously hard-coded and are meant
# for local development only.
# NOTE: set POSTGRESQL_HOST=postgres when running inside the compose network;
# the default targets a database reachable on localhost.
POSTGRESQL_HOST = os.environ.get("POSTGRESQL_HOST", "localhost")
POSTGRESQL_PORT = os.environ.get("POSTGRESQL_PORT", "5432")
POSTGRES_DB = os.environ.get("POSTGRES_DB", "citibike_db")
POSTGRES_USER = os.environ.get("POSTGRES_USER", "citibike_user")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "citibike_pass")

# JDBC URL built from the parameters above.
jdbc_url = f"jdbc:postgresql://{POSTGRESQL_HOST}:{POSTGRESQL_PORT}/{POSTGRES_DB}"

# Connection properties passed to every spark.read.jdbc(...) call below.
connection_properties = {
    "user": POSTGRES_USER,
    "password": POSTGRES_PASSWORD,
    "driver": "org.postgresql.Driver",
}

In [None]:
# Read the "station_status" table over JDBC, then print the resulting schema.
df_station_status = spark.read.jdbc(
    jdbc_url,
    "station_status",
    properties=connection_properties,
)
df_station_status.printSchema()

In [None]:
# Read the "station_information" table over JDBC, then print the resulting schema.
df_station_info = spark.read.jdbc(
    jdbc_url,
    "station_information",
    properties=connection_properties,
)
df_station_info.printSchema()

In [5]:
# Read the "tripdata" table over JDBC, then print the resulting schema.
df_tripdata = spark.read.jdbc(
    jdbc_url,
    "tripdata",
    properties=connection_properties,
)
df_tripdata.printSchema()

24/12/31 13:48:03 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
24/12/31 13:48:03 INFO SharedState: Warehouse path is 'file:/Users/tobi/Documents/projects/citibike-stream/spark-warehouse'.


root
 |-- ride_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- start_lat: decimal(9,6) (nullable = true)
 |-- start_lng: decimal(9,6) (nullable = true)
 |-- end_lat: decimal(9,6) (nullable = true)
 |-- end_lng: decimal(9,6) (nullable = true)
 |-- member_casual: string (nullable = true)
 |-- month: date (nullable = true)



In [None]:
# Preview the first five station-status rows.
df_station_status.show(n=5)

In [None]:
# Preview the first five station-information rows.
df_station_info.show(n=5)

In [6]:
# Preview the first five trip rows.
df_tripdata.show(n=5)

24/12/31 13:48:22 INFO CodeGenerator: Code generated in 219.462542 ms
24/12/31 13:48:22 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
24/12/31 13:48:22 INFO DAGScheduler: Got job 0 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
24/12/31 13:48:22 INFO DAGScheduler: Final stage: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0)
24/12/31 13:48:22 INFO DAGScheduler: Parents of final stage: List()
24/12/31 13:48:22 INFO DAGScheduler: Missing parents: List()
24/12/31 13:48:22 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[2] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
24/12/31 13:48:22 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 18.2 KiB, free 6.2 GiB)
24/12/31 13:48:23 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 7.9 KiB, free 6.2 GiB)
24/12/31 13:48:23 INFO BlockManagerInfo: Added broadcast_0_pie

+----------------+-------------+-------------------+-------------------+--------------------+----------------+--------------------+--------------+---------+----------+---------+----------+-------------+----------+
|         ride_id|rideable_type|         started_at|           ended_at|  start_station_name|start_station_id|    end_station_name|end_station_id|start_lat| start_lng|  end_lat|   end_lng|member_casual|     month|
+----------------+-------------+-------------------+-------------------+--------------------+----------------+--------------------+--------------+---------+----------+---------+----------+-------------+----------+
|3F874FD7056276BA|electric_bike|2024-10-31 00:00:00|2024-11-01 00:00:00|    W 30 St & 10 Ave|         6459.07|Cleveland Pl & Sp...|       5492.05|40.752694|-74.002353|40.722104|-73.997249|       member|2024-10-01|
|E4FE320A5D6A8901| classic_bike|2024-10-31 00:00:00|2024-11-01 00:00:00|Sullivan St & Was...|         5721.01|Cleveland Pl & Sp...|       5492.0

24/12/31 13:48:29 INFO CodeGenerator: Code generated in 24.863375 ms


In [7]:
# Display the (column name, type string) pairs of the trip data frame.
df_tripdata.dtypes

[('ride_id', 'string'),
 ('rideable_type', 'string'),
 ('started_at', 'timestamp'),
 ('ended_at', 'timestamp'),
 ('start_station_name', 'string'),
 ('start_station_id', 'string'),
 ('end_station_name', 'string'),
 ('end_station_id', 'string'),
 ('start_lat', 'decimal(9,6)'),
 ('start_lng', 'decimal(9,6)'),
 ('end_lat', 'decimal(9,6)'),
 ('end_lng', 'decimal(9,6)'),
 ('member_casual', 'string'),
 ('month', 'date')]

In [None]:
from pyspark.sql import types

In [None]:
# Explicit schema with one entry per column: (name, Spark type, nullable).
# Only "id" is declared non-nullable.
# NOTE(review): no other cell visible here references this schema; confirm
# whether it is still needed.
df_citibike_schema = types.StructType([
    types.StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ("id", types.IntegerType(), False),
        ("station_id", types.StringType(), True),
        ("num_bikes_available", types.IntegerType(), True),
        ("num_docks_available", types.IntegerType(), True),
        ("is_installed", types.BooleanType(), True),
        ("is_renting", types.BooleanType(), True),
        ("is_returning", types.BooleanType(), True),
        ("last_reported", types.TimestampType(), True),
        ("inserted_at", types.TimestampType(), True),
    ]
])

In [None]:
# Register the station-status frame as the SQL view "station_status" so the
# spark.sql(...) cells below can query it. The previous code referenced an
# undefined name `df`, which fails on a fresh kernel.
df_station_status.createOrReplaceTempView("station_status")

In [None]:
# Sanity check of the registered view: print its first 20 rows
# (20 is show()'s default row count, spelled out here).
spark.sql("SELECT * FROM station_status").show(n=20)

In [None]:
# Number of station-status records per calendar day, newest day first.
# Keep the DataFrame in `agg_day` and display it separately: DataFrame.show()
# returns None, so chaining it onto the assignment left `agg_day` empty.
agg_day = \
    spark.sql("""
              select
                date_trunc('day', last_reported) as date,
                count(*) as total_records
              from station_status
              group by date
              order by date desc
              """)
agg_day.show()

In [None]:
# All station-status rows reported for one specific station id.
# Keep the DataFrame in `check_station_id` and display it separately:
# DataFrame.show() returns None, so chaining it onto the assignment left the
# variable empty.
check_station_id = \
    spark.sql("""
              select
                *
              from station_status
              where station_id = '1815968345907104578'
              -- group by station_id
              -- order by total_records desc
              """)
check_station_id.show()