In [2]:
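# One-time environment setup: uncomment and run these lines to install the packages this notebook uses.
# pip install jupyter_contrib_nbextensions
# pip install pyspark
# pip install pandas
# pip install confluent_kafka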

In [48]:
from pyspark.sql import SparkSession
from pyspark.sql import functions 
from pyspark.sql.functions import col
from confluent_kafka.admin import AdminClient, NewTopic
from confluent_kafka import Producer
import pandas as pd
import os
import json

In [30]:
# Connection settings for the PostgreSQL instance used by the JDBC read/write cells
# (passed to spark.read.jdbc and DataFrame.write.jdbc).
# Every value can be overridden through an environment variable so that credentials
# do not have to be stored in the notebook; the fallbacks are the previous
# hard-coded values, so behavior is unchanged when the variables are unset.
jdbcUrl = os.environ.get(
    "POSTGRES_JDBC_URL",
    "jdbc:postgresql://test-postgresql-1:5432/postgres",
)
connectionProperties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", ""),
    "driver": "org.postgresql.Driver",  # JDBC driver class Spark must load
}

In [None]:
KAFKA_HOST = "127.0.0.1:9093"
FLUSH_EVERY = 50_000  # messages produced between blocking flushes / progress lines

# Topic name -> DataFrame whose rows are published to that topic as JSON.
data = {
    "products" : pd.read_csv("./data/products.csv"),
    # "orders"   : pd.read_csv("./data/orders.csv")
}


def _report_admin_result(futures, action):
    """Wait for every admin future and print the outcome per topic.

    `futures` is the {topic: future} dict returned by AdminClient.delete_topics /
    create_topics; `action` is "delete" or "create" and is only used in the messages.
    Failures are reported, not raised, so a missing topic does not stop the import.
    """
    for topic, future in futures.items():
        try:
            future.result()
            print(f"Topic {topic} is {action}d")
        except Exception as e:
            print(f"Failed to {action} topic {topic}: {e}")


def recreate_topic(admin_client, topic_name, num_partitions=3, replication_factor=1):
    """Drop `topic_name` (best effort) and create it again with the given layout."""
    print(f"Deleting topic {topic_name} if exists")
    _report_admin_result(
        admin_client.delete_topics([topic_name], operation_timeout=30), "delete"
    )

    print(f"Creating topic {topic_name}")
    topic_spec = NewTopic(
        topic_name,
        num_partitions=num_partitions,
        replication_factor=replication_factor,
    )
    _report_admin_result(admin_client.create_topics([topic_spec]), "create")


def publish_frame(kafka_producer, topic_name, frame, flush_every=FLUSH_EVERY):
    """Publish each row of `frame` to `topic_name` as a JSON object.

    Returns a (produced, failed) tuple: the number of rows handed to the producer
    and the number the broker reported as undeliverable.
    """
    delivery_errors = []

    def _on_delivery(err, msg):
        # Invoked from poll()/flush(); err is None when the broker acknowledged the message.
        if err is not None:
            delivery_errors.append(err)

    produced = 0
    for _, row in frame.iterrows():
        # NOTE(review): json.dumps writes missing values (NaN) as the bare token NaN,
        # which strict JSON parsers reject — confirm the consumers tolerate it.
        payload = json.dumps(row.to_dict())
        try:
            kafka_producer.produce(topic_name, payload, on_delivery=_on_delivery)
        except BufferError:
            # Local queue is full: drain it, then retry this message once.
            kafka_producer.flush()
            kafka_producer.produce(topic_name, payload, on_delivery=_on_delivery)
        produced += 1
        kafka_producer.poll(0)  # serve delivery callbacks without blocking

        if produced % flush_every == 0:
            kafka_producer.flush()
            print(f"{produced} message is imported")

    kafka_producer.flush()
    print(f"Finished {topic_name}: {produced} produced, {len(delivery_errors)} failed")
    return produced, len(delivery_errors)


admin = AdminClient({ "bootstrap.servers" : KAFKA_HOST })
# One producer is enough for all topics; creating one per topic only adds connections.
producer = Producer({ "bootstrap.servers" : KAFKA_HOST })

for my_topic, frame in data.items():
    recreate_topic(admin, my_topic)
    print(f"Importing {my_topic} data")
    publish_frame(producer, my_topic, frame)

In [4]:
# Spark session configured for the S3-compatible object store (s3a) and an
# Iceberg catalog named "iceberg" whose warehouse lives in that store.
# The S3 endpoint and credentials can be overridden with the S3_ENDPOINT,
# S3_ACCESS_KEY and S3_SECRET_KEY environment variables; the fallbacks are the
# previous hard-coded values, so behavior is unchanged when they are unset.
# getOrCreate() returns an already running session if there is one.
spark = (
    SparkSession.builder
    .appName("cluster-read-s3")
    # s3
    .config("spark.hadoop.fs.s3a.endpoint", os.environ.get("S3_ENDPOINT", "http://s3:9000"))
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("S3_ACCESS_KEY", "minioadmin"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("S3_SECRET_KEY", "miniopassword"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # iceberg
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hadoop")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://lmwn/iceberg")
    .getOrCreate()
)


In [5]:
# Orders CSV from the object store; the first line is treated as the header.
# Aliased "o" so later joins can qualify its columns.
orders = (
    spark.read
    .option("header", "true")
    .csv("s3a://lmwn/data/orders.csv")
    .alias("o")
)

In [6]:
# Products CSV from the object store; the first line is treated as the header.
# Aliased "p" so later joins can qualify its columns.
products = (
    spark.read
    .option("header", "true")
    .csv("s3a://lmwn/data/products.csv")
    .alias("p")
)

In [11]:
# Enrich each order with the attributes of its product.
# A left join keeps every order even when no product row matches.
same_product = col("o.product_id") == col("p.product_id")
selected_columns = [
    col("order_id"),
    col("order_date"),
    col("user_id"),
    col("o.product_id"),  # qualified: product_id exists on both sides of the join
    col("quantity"),
    col("status"),
    col("product_name"),
    col("price"),
    col("category"),
]
order_product = orders.join(products, on=same_product, how="left").select(*selected_columns)

In [45]:
# Persist the joined data as CSV, replacing any previous output at this path.
# The header option keeps the column names in the output, matching how the
# input CSVs are read (header='true'); without it the names are lost.
# Spark writes a directory of part files at this path, not a single file.
order_product.write.mode('overwrite').option('header', 'true').csv('./data/order_product.csv')

In [26]:
# Total revenue (sum of price * quantity) for user USER-001, counting only
# orders whose status is COMPLETE; prices come from a left join on product_id.
# NOTE(review): the trailing digits in the displayed total suggest the product is
# computed in floating point — consider casting price to DECIMAL if exact amounts matter.
user_revenue_sql = """
select user_id,
        sum(price*quantity) as total_revenue 
from (
        select order_id,product_id,user_id,quantity 
        from iceberg.orders 
        where user_id = 'USER-001' and status = 'COMPLETE') orders 
left join (
        select product_id,price 
        from iceberg.products) prod 
on orders.product_id = prod.product_id
group by user_id;

"""
user_revenue = spark.sql(user_revenue_sql)
user_revenue.show()

+--------+-----------------+
| user_id|    total_revenue|
+--------+-----------------+
|USER-001|521671.7600000001|
+--------+-----------------+



In [42]:
# Load the public.products table from PostgreSQL over JDBC.
# NOTE(review): the execution counts show the cell that writes public.products was
# run before this one; on a fresh top-to-bottom run the table may not exist yet — confirm.
df = spark.read.jdbc(
    url=jdbcUrl,
    table="public.products",
    properties=connectionProperties,
)

In [41]:
# Replace public.products in PostgreSQL with the contents of the `products` DataFrame.
(
    products.write
    .mode("overwrite")
    .jdbc(url=jdbcUrl, table="public.products", properties=connectionProperties)
)

In [43]:
# Print the column names, types and nullability of the DataFrame read back over JDBC.
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: string (nullable = true)
 |-- category: string (nullable = true)

