# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, length
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType, TimestampType
import time 

# Layout of the JSON document carried in each Kafka message value; it is
# handed to from_json() when the stream is parsed. Every field is nullable.
schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("content", StringType(), True)
    .add("timestamp", TimestampType(), True)
    .add("favourites", LongType(), True)
    .add("reblogs", LongType(), True)
    .add("hashtags", ArrayType(StringType()), True)
)

def store_historical_data_to_postgres(
    spark: SparkSession,
    jdbc_url: str,
    connection_properties: dict,
    *,
    bootstrap_servers: str = "kafka:9092",
    topic: str = "mastodonStream",
    table: str = "toot_data",
    keyword: str = "AI",
    starting_offsets: str = "latest",
    checkpoint_location=None,
) -> None:
    """Stream keyword-matching toots from Kafka into a PostgreSQL table.

    Reads the Kafka topic as a streaming DataFrame, parses each message value
    as JSON using the module-level ``schema``, keeps the rows whose
    ``content`` column contains ``keyword`` and appends every micro-batch to
    ``table`` over JDBC. The call blocks until the streaming query terminates.

    Args:
        spark: Active session with the Kafka source and the JDBC driver
            available on its classpath.
        jdbc_url: JDBC URL of the target database.
        connection_properties: JDBC connection properties passed to
            ``DataFrameWriter.jdbc``.
        bootstrap_servers: Value for ``kafka.bootstrap.servers``.
        topic: Kafka topic to subscribe to.
        table: Name of the table each micro-batch is appended to.
        keyword: Substring that ``content`` must contain (the match is a
            plain, case-sensitive substring test).
        starting_offsets: Kafka ``startingOffsets`` used when the query starts
            without an existing checkpoint. ``"latest"`` is Spark's default
            for streaming queries; pass ``"earliest"`` to also ingest
            messages already retained in the topic.
        checkpoint_location: Optional directory for the query checkpoint.
            When ``None`` no ``checkpointLocation`` option is set, exactly as
            before; supply a durable path to let a restarted query resume
            from the offsets it had already processed.

    Note:
        Batches are written with ``mode="append"``, so a micro-batch that is
        retried after a failure can be inserted more than once.
    """
    kafka_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", starting_offsets)
        .load()
    )

    # Kafka delivers the payload as bytes: cast to string, parse the JSON into
    # a struct column, then flatten that struct into top-level columns.
    parsed_df = (
        kafka_df.selectExpr("CAST(value AS STRING)")
        .select(from_json(col("value"), schema).alias("data"))
        .select("data.*")
    )

    filtered_df = parsed_df.filter(col("content").contains(keyword))

    def _write_batch(batch_df, epoch_id):
        # Append one micro-batch to the target table; epoch_id is unused.
        batch_df.write.jdbc(
            url=jdbc_url,
            table=table,
            mode="append",
            properties=connection_properties,
        )

    writer = filtered_df.writeStream.outputMode("append").foreachBatch(_write_batch)
    if checkpoint_location is not None:
        writer = writer.option("checkpointLocation", checkpoint_location)

    query = writer.start()
    query.awaitTermination()

# Example usage
if __name__ == "__main__":
    # Local import: only the script entry point needs the environment.
    import os

    # Connector/driver artifacts Spark must fetch: Kafka source, Kafka client
    # and the PostgreSQL JDBC driver.
    spark_packages = ",".join([
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3",
        "org.apache.kafka:kafka-clients:3.3.1",
        "org.postgresql:postgresql:42.2.18",
    ])

    spark = (
        SparkSession.builder
        .appName("Mastodon Data Ingestion")
        .config("spark.jars.packages", spark_packages)
        .getOrCreate()
    )

    # Connection settings can be overridden through the environment so real
    # credentials need not live in the source; the fallbacks are the original
    # hard-coded development values.
    jdbc_url = os.environ.get(
        "POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/postgres"
    )
    connection_properties = {
        "user": os.environ.get("POSTGRES_USER", "postgres"),
        "password": os.environ.get("POSTGRES_PASSWORD", "postgres"),
        "driver": "org.postgresql.Driver",
    }

    try:
        store_historical_data_to_postgres(spark, jdbc_url, connection_properties)
    finally:
        # Release the session even when the stream fails or is interrupted.
        spark.stop()
