In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Object-store connection settings. The endpoint is used both by the S3A
# filesystem and by the Iceberg catalog, so it is defined once here.
MINIO_ENDPOINT = "http://minio:9000"

# Credentials are taken from the environment so that real secrets never have to
# be committed with the notebook. The fallbacks are the values that were
# previously hard-coded, so an unconfigured environment behaves as before.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("sub_review")
    # Resource limits for this job.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    # S3A filesystem access to the object store.
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Iceberg catalog named "iceberg", registered against the Hive metastore.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", MINIO_ENDPOINT)
    .getOrCreate()
)

# Keep the notebook output readable: only log errors from here on.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/14 20:19:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the iceberg.bronze.reviews table through the Iceberg reader and show
# which columns it provides.
df = (
    spark.read
    .format('iceberg')
    .load('iceberg.bronze.reviews')
)
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thank_count: string (nullable = true)
 |-- score: string (nullable = true)
 |-- new_score: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_region: string (nullable = true)
 |-- customer_created_time: string (nullable = true)
 |-- customer_purchased: string (nullable = true)
 |-- customer_purchased_at: string (nullable = true)
 |-- customer_total_review: string (nullable = true)
 |-- customer_total_thank: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- spid: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- seller_name: string (nullable = 

In [4]:
# Keep only the columns needed downstream and derive the calendar date of the
# last update (ngay_cap_nhat) so the newest batch can be selected later.
df = (
    df.select(
        'review_id',
        'product_id',
        'customer_id',
        'seller_id',
        'rating',
        'score',
        'new_score',
        'title',
        'content',
        'status',
        'thank_count',
        'comment_count',
        'delivery_rating',
        'timeline_review_created_date',
        # The source table has no `current_date` column; selecting the bare
        # string only worked because Spark falls back to the current_date()
        # function. Calling the function explicitly yields the same
        # `current_date()` column without depending on that fallback.
        F.current_date(),
        'timeline_content',
        'ngay_cap_nhat',
    )
    .withColumn("ngay_cap_nhat_date", F.to_date('ngay_cap_nhat'))
)
df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- score: string (nullable = true)
 |-- new_score: string (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thank_count: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- delivery_rating: string (nullable = true)
 |-- timeline_review_created_date: string (nullable = true)
 |-- current_date(): date (nullable = false)
 |-- timeline_content: string (nullable = true)
 |-- ngay_cap_nhat: timestamp (nullable = true)
 |-- ngay_cap_nhat_date: date (nullable = true)



In [5]:
# Find the most recent update date present in the data.
max_date = df.agg(F.max("ngay_cap_nhat_date")).head()[0]

# Keep only the rows from that date; both update-time columns are dropped once
# the filter has been applied.
df = (
    df
    .where(F.col('ngay_cap_nhat_date') == max_date)
    .drop('ngay_cap_nhat_date', 'ngay_cap_nhat')
)

                                                                                

In [6]:
# Remove rows that are exact duplicates across every column.
df = df.distinct()

In [7]:
# Give the review creation column a shorter name.
df = df.withColumnRenamed(
    existing="timeline_review_created_date",
    new="created_date",
)

In [8]:
# Convert the string columns to their proper types and add a `current_date`
# timestamp column.
df = (
    df
    .withColumn("rating", F.col("rating").cast("integer"))
    .withColumn("score", F.col("score").cast("float"))
    .withColumn("new_score", F.col("new_score").cast("float"))
    .withColumn("thank_count", F.col("thank_count").cast("integer"))
    .withColumn("comment_count", F.col("comment_count").cast("integer"))
    .withColumn("created_date", F.to_timestamp(F.col("created_date")))
    # The frame has no column literally named `current_date` at this point, so
    # col("current_date") only worked through Spark's fallback to the
    # current_date() function. Call the function explicitly instead; the
    # resulting column is the same.
    .withColumn("current_date", F.to_timestamp(F.current_date()))
)

In [9]:
# Discard rows that have no review identifier.
df = df.where(F.col("review_id").isNotNull())

In [10]:
# Stamp every row with the time of this run as its update time.
df = df.withColumn("ngay_cap_nhat", F.current_timestamp())

In [11]:
# Print the current schema of df for inspection.
df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- score: float (nullable = true)
 |-- new_score: float (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thank_count: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- delivery_rating: string (nullable = true)
 |-- created_date: timestamp (nullable = true)
 |-- current_date(): date (nullable = false)
 |-- timeline_content: string (nullable = true)
 |-- current_date: timestamp (nullable = false)
 |-- ngay_cap_nhat: timestamp (nullable = false)



In [12]:
# Remove the column whose name is literally "current_date()" (parentheses
# included) so it is not written out with the rest of the frame.
df = df.drop("current_date()")

In [13]:
# Save the frame as the Iceberg table iceberg.silver.sub_review, using the
# 'overwrite' save mode.
(
    df.write
    .format('iceberg')
    .mode('overwrite')
    .saveAsTable('iceberg.silver.sub_review')
)

                                                                                

In [14]:
# Stop the Spark session at the end of the notebook.
spark.stop()