In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the Spark session for the fact_review_product job.
#
# The session is configured with:
#   * the Hadoop S3A connector pointed at the MinIO endpoint, and
#   * an Iceberg catalog named "iceberg", backed by the Hive metastore, with
#     its warehouse in the same object store.
import os

# Single definition of the object-store endpoint so the S3A connector and the
# Iceberg catalog cannot drift apart.
S3_ENDPOINT = "http://minio:9000"

# Credentials are taken from the standard AWS environment variables when they
# are set, so secrets do not have to live in the notebook. The fallbacks are
# the values that were previously hard-coded here, so an environment without
# these variables behaves exactly as before.
# NOTE(review): no s3.access-key-id is set on the catalog, so S3FileIO
# presumably resolves credentials from the environment as well - confirm.
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("fact_review_product")
    # Resource limits for this job.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    # Hadoop S3A connector -> MinIO (path-style addressing, static keys).
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Iceberg catalog "iceberg" registered through the Hive metastore.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", S3_ENDPOINT)
    .getOrCreate()
)

# Only surface errors in the notebook output.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/23 18:44:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the silver-layer review table from the Iceberg catalog and rename its
# 'created_date' column to 'review_date' for downstream use.
df_review = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.sub_review')
    .withColumnRenamed('created_date', 'review_date')
)

In [4]:
# Load the silver-layer product dimension, keeping only the join key
# ('product_id') and the product 'name'.
df_product = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.dm_product')
    .select('product_id', 'name')
)

In [5]:
# Attach the product name to each review. The inner join keeps only reviews
# whose product_id exists in the product dimension; the result carries every
# review column plus the product name exposed as 'product_name'.
same_product = df_review['product_id'] == df_product['product_id']

df = (
    df_review
    .join(df_product, on=same_product, how='inner')
    .drop(df_product['product_id'])
    .select(
        df_review['*'],
        df_product['name'].alias('product_name'),
    )
)
df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- score: float (nullable = true)
 |-- new_score: float (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thank_count: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- delivery_rating: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- timeline_content: string (nullable = true)
 |-- current_date: timestamp (nullable = true)
 |-- ngay_cap_nhat: timestamp (nullable = true)
 |-- product_name: string (nullable = true)



In [6]:
# Collapse the joined reviews to one row per product with review counts,
# the average rating, and the negative/positive split.
# A review counts as negative when rating <= 2 and as positive when rating > 2;
# the rates are percentages of the product's total review count.
group_column = ['product_id', 'product_name']

# Reusable aggregate expressions shared by the count and rate columns.
review_total = F.count(F.col('review_id'))
negative_total = F.sum(F.when(F.col("rating") <= 2, 1).otherwise(0))
positive_total = F.sum(F.when(F.col("rating") > 2, 1).otherwise(0))

df = (
    df.groupBy(*group_column)
    .agg(
        review_total.alias('total_review'),
        F.avg(F.col('rating')).alias('avg_rating'),
        negative_total.alias('negative_review_count'),
        (negative_total / review_total * 100).alias('negative_review_rate'),
        positive_total.alias('positive_review_count'),
        (positive_total / review_total * 100).alias('positive_review_rate'),
    )
)
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- total_review: long (nullable = false)
 |-- avg_rating: double (nullable = true)
 |-- negative_review_count: long (nullable = true)
 |-- negative_review_rate: double (nullable = true)
 |-- positive_review_count: long (nullable = true)
 |-- positive_review_rate: double (nullable = true)



In [7]:
# Preview the first 20 aggregated rows, truncating long cell values.
df.show(n=20, truncate=True)

                                                                                

+----------+--------------------+------------+-----------------+---------------------+--------------------+---------------------+--------------------+
|product_id|        product_name|total_review|       avg_rating|negative_review_count|negative_review_rate|positive_review_count|positive_review_rate|
+----------+--------------------+------------+-----------------+---------------------+--------------------+---------------------+--------------------+
|  41161743|Pons General Refe...|          70|4.857142857142857|                    2|   2.857142857142857|                   68|   97.14285714285714|
| 222680673|Sổ Còng Sổ Tay A5...|         399|4.779448621553884|                    9|  2.2556390977443606|                  390|   97.74436090225564|
|  88998879|Sách Châu Nhuận P...|         323|4.854489164086687|                    5|  1.5479876160990713|                  318|   98.45201238390094|
|   5154511|Sách Bà Nội Găngx...|         574| 4.86411149825784|                    5|  0.8710

In [9]:
# Stamp every aggregated row with the time of this run
# ("ngay_cap_nhat" is Vietnamese for "update date").
df = df.withColumn("ngay_cap_nhat", F.current_timestamp())

# Persist the result to the gold-layer Iceberg table in overwrite mode.
gold_writer = df.write.format('iceberg').mode('overwrite')
gold_writer.saveAsTable('iceberg.gold.fact_review_product')

                                                                                

In [10]:
# Shut down the Spark session; this is the last step of the notebook.
spark.stop()