In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
import os


In [None]:
# Location of the KEY=VALUE connection settings, relative to the working directory.
config_file = "../config/postgres_connection.txt"

with open(config_file, "r") as f:
    lines = f.read().splitlines()

# Build the settings dict from the KEY=VALUE lines.
pg_config = {}
for line_number, raw_line in enumerate(lines, start=1):
    entry = raw_line.strip()
    # Blank lines and '#' comments carry no setting; skipping them keeps a
    # trailing newline or an annotation from breaking the parse.
    if not entry or entry.startswith("#"):
        continue
    # Split on the FIRST '=' only, so values that themselves contain '='
    # (passwords, for example) are kept whole.
    key, separator, value = entry.partition("=")
    if not separator:
        # The line content is deliberately not echoed: it may hold a secret.
        raise ValueError(
            f"{config_file}: line {line_number} is not in KEY=VALUE form"
        )
    pg_config[key.strip()] = value.strip()

# JDBC connection string and driver properties assembled from the parsed settings.
jdbc_url = f"jdbc:postgresql://{pg_config['host']}:{pg_config['port']}/{pg_config['database']}"
jdbc_properties = {
    "user": pg_config["user"],
    "password": pg_config["password"],
    "driver": "org.postgresql.Driver"
}


In [None]:
# Reuse the active Spark session if one exists; otherwise create one
# registered under the application name "EcommerceStreaming".
spark = (
    SparkSession.builder
    .appName("EcommerceStreaming")
    .getOrCreate()
)


In [None]:
# Directory two levels above the process's current working directory
# (os.path.abspath('') resolves to the cwd).
# NOTE(review): config_file is resolved with a single "../" while this climbs
# two levels -- confirm both point at the intended project layout.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath('')))
csv_path = os.path.join(BASE_DIR, "data", "raw")

# Streaming file sources refuse to start without a schema unless this session
# flag is enabled; the reader-level "inferSchema" option alone does not lift
# that restriction, so readStream.csv() would otherwise fail with
# "Schema must be specified when creating a streaming source DataFrame".
spark.conf.set("spark.sql.streaming.schemaInference", "true")

# Treat every CSV file that appears under csv_path as streaming input.
df_stream = (
    spark.readStream
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(csv_path)
)

df_stream.printSchema()


In [None]:
# Normalise column types: parse the event time first, then cast the
# remaining fields to their target types in the same order as before.
df_transformed = df_stream.withColumn("timestamp", to_timestamp(col("timestamp")))
for column_name, target_type in (
    ("price", "double"),
    ("user_id", "int"),
    ("product_id", "int"),
):
    df_transformed = df_transformed.withColumn(
        column_name, col(column_name).cast(target_type)
    )


In [None]:
def write_to_postgres(batch_df, batch_id):
    """Append one micro-batch to the ``user_events`` table over JDBC.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; accepted to satisfy the
            ``foreachBatch`` callback signature but not used here.

    Returns:
        None. The rows are written using the module-level ``jdbc_url`` and
        ``jdbc_properties``.
    """
    appending_writer = batch_df.write.mode("append")
    appending_writer.jdbc(
        url=jdbc_url,
        table="user_events",
        properties=jdbc_properties,
    )


In [None]:
# Offsets and progress are tracked here so the stream can resume after a restart.
checkpoint_dir = os.path.join(BASE_DIR, "checkpoint")

# Every 5 seconds, hand newly arrived rows to write_to_postgres as a micro-batch.
query = (
    df_transformed.writeStream
    .foreachBatch(write_to_postgres)
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .option("checkpointLocation", checkpoint_dir)
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # An interrupt only unblocks the wait on the Python side; without an
    # explicit stop the query keeps running in the JVM, and re-running this
    # cell would collide with the still-active query on the same checkpoint.
    query.stop()
