In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Object-store connection settings. Credentials are taken from the standard
# AWS environment variables so real secrets never have to be committed with the
# notebook; the fallbacks are the values this notebook previously hardcoded.
S3_ENDPOINT = "http://minio:9000"
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")

# Build (or reuse) the Spark session: S3A access to the object store plus an
# Iceberg catalog named `iceberg` backed by the Hive metastore.
# NOTE(review): the catalog uses S3FileIO, which is configured through the
# catalog's own `s3.*` properties rather than `fs.s3a.*`; only the endpoint is
# set for it here — confirm credentials / path-style access reach it.
spark = (
    SparkSession.builder
    .appName("test_notebook")
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", S3_ENDPOINT)
    .getOrCreate()
)

# Only surface errors; the default level floods the cell output with warnings.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/05 06:53:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/05 06:53:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the bronze Amazon products table through the Iceberg data source.
df = spark.read.format("iceberg").load("iceberg.bronze.amazon_products")

In [4]:
# Print the column names, types and nullability of the loaded table.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- description: string (nullable = true)
 |-- initial_price: string (nullable = true)
 |-- final_price: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- reviews_count: integer (nullable = true)
 |-- categories: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- buybox_seller: string (nullable = true)
 |-- number_of_sellers: integer (nullable = true)
 |-- root_bs_rank: integer (nullable = true)
 |-- answered_questions: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- images_count: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- video_count: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- item_weight: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- product_dimensi

In [5]:
def _normalize_types(frame):
    """Return `frame` with prices parsed to doubles and columns cast to their target types.

    Price strings have every character other than digits and '.' removed
    before the cast to double. All other listed columns are cast in place.
    """
    target_types = {
        "reviews_count": IntegerType(),
        "number_of_sellers": IntegerType(),
        "root_bs_rank": IntegerType(),
        "answered_questions": IntegerType(),
        "images_count": IntegerType(),
        "video_count": IntegerType(),
        "rating": DoubleType(),
        "bought_past_month": IntegerType(),
        "is_available": BooleanType(),
        "amazon_choice": BooleanType(),
        "plus_content": BooleanType(),
        "bs_rank": IntegerType(),
    }

    typed = frame.withColumn("timestamp", F.to_timestamp("timestamp"))
    for price_name in ("initial_price", "final_price"):
        numeric_text = F.regexp_replace(price_name, "[^0-9.]", "")
        typed = typed.withColumn(price_name, numeric_text.cast(DoubleType()))
    for column_name, data_type in target_types.items():
        typed = typed.withColumn(column_name, F.col(column_name).cast(data_type))
    return typed


df_clean = _normalize_types(df)

In [None]:
# Replace nulls with default values.
df_clean = df_clean.fillna({
    "reviews_count": 0,
    "rating": 0.0,
    "is_available": False,
    "amazon_choice": False,
    "plus_content": False,
    "number_of_sellers": 1
})

# Normalise text: strip leading/trailing whitespace.
text_cols = ["title", "seller_name", "brand", "description", "categories", "asin", "buybox_seller"]
# The loop variable must not be called `col`: the notebook star-imports
# pyspark.sql.functions, so rebinding `col` to a string would break any later
# bare `col(...)` call in the kernel.
for text_col in text_cols:
    df_clean = df_clean.withColumn(text_col, F.trim(F.col(text_col)))


In [None]:
import json
from pyspark.sql.types import ArrayType, StringType

def _parse_json_list(raw):
    """Decode a JSON-array string into a Python list.

    Returns an empty list when `raw` is empty/None, is not valid JSON, or
    decodes to something other than a list, so a single malformed row cannot
    abort the whole Spark job.
    """
    if not raw:
        return []
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return []
    return parsed if isinstance(parsed, list) else []


def parse_json_column(col_name):
    """Build a UDF that parses a JSON-array string column into array<string>.

    Args:
        col_name: Name of the column the UDF is intended for. It is not used
            when building the UDF and is kept only so existing call sites of
            the form ``parse_json_column(name)(name)`` keep working.

    Returns:
        A Spark UDF with return type ``ArrayType(StringType())``; apply it to
        the column to parse.
    """
    return F.udf(_parse_json_list, ArrayType(StringType()))

# Add a parsed `<name>_list` column next to each JSON-encoded string column.
for json_source in ("categories", "features"):
    df_clean = df_clean.withColumn(f"{json_source}_list", parse_json_column(json_source)(json_source))


In [None]:
# Discount: absolute amount, and percentage of the initial price
# (0 whenever the initial price is not greater than zero).
initial_price_col = F.col("initial_price")
percent_off = (F.col("discount_amount") / initial_price_col) * 100
df_clean = (
    df_clean
    .withColumn("discount_amount", initial_price_col - F.col("final_price"))
    .withColumn("discount_percent", F.when(initial_price_col > 0, percent_off).otherwise(0))
)


In [None]:
# Persist the cleaned frame as the silver table, overwriting existing contents.
silver_writer = df_clean.write.format("iceberg").mode("overwrite")
silver_writer.saveAsTable("iceberg.silver.amazon_products_clean")

In [None]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()