In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *


In [2]:
# Build the Spark session with the Delta Lake SQL extension and catalog enabled.
#
# No explicit "spark.jars.packages" entry is set here on purpose:
# configure_spark_with_delta_pip() sets that option itself, using the Delta
# artifact that matches the installed delta-spark pip package. The previous
# hard-coded value ("io.delta:delta-core_2.12:3.1.0") was therefore always
# overwritten, and it named an artifact that was renamed to "delta-spark"
# starting with Delta 3.x.
builder = (
    SparkSession.builder
    .appName("Users to Silver")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.driver.memory", "6g")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [3]:
def load_user_data(spark, path):
    """Read the user JSON file at ``path`` with an explicit, fixed schema.

    Args:
        spark: Active SparkSession used to perform the read.
        path: Location of the JSON file to load.

    Returns:
        DataFrame with the 22 user columns declared below; every column is
        nullable.
    """
    # (column name, Spark type) pairs, in the order they appear in the schema.
    column_specs = [
        ("user_id", StringType()),
        ("name", StringType()),
        ("review_count", IntegerType()),
        ("yelping_since", StringType()),
        ("useful", IntegerType()),
        ("funny", IntegerType()),
        ("cool", IntegerType()),
        ("elite", StringType()),
        ("friends", StringType()),
        ("fans", IntegerType()),
        ("average_stars", DoubleType()),
        ("compliment_hot", IntegerType()),
        ("compliment_more", IntegerType()),
        ("compliment_profile", IntegerType()),
        ("compliment_cute", IntegerType()),
        ("compliment_list", IntegerType()),
        ("compliment_note", IntegerType()),
        ("compliment_plain", IntegerType()),
        ("compliment_cool", IntegerType()),
        ("compliment_funny", IntegerType()),
        ("compliment_writer", IntegerType()),
        ("compliment_photos", IntegerType()),
    ]
    user_schema = StructType(
        [StructField(field_name, field_type, True) for field_name, field_type in column_specs]
    )
    return spark.read.json(path, schema=user_schema)


# Load the bronze user file and preview the first rows.
df = load_user_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_user.json")
df.show(5)


+--------------------+------+------------+-------------------+------+-----+-----+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|             user_id|  name|review_count|      yelping_since|useful|funny| cool|               elite|             friends|fans|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_writer|compliment_photos|
+--------------------+------+------------+-------------------+------+-----+-----+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|qVc8ODYU5SZjKXVBg.

In [4]:
def handle_missing_values(df):
    """Fill missing values in the user DataFrame.

    Numeric columns (review_count, useful, funny, cool, fans, average_stars
    and every ``compliment_*`` column) are imputed with the column median.
    The imputed values are written back into the original columns so that
    later steps reading those column names actually see them; previously the
    medians only landed in separate ``<name>_imputed`` columns and the
    original columns kept their nulls.

    The ``<name>_imputed`` columns are still returned (now equal to the
    original columns) so the output schema stays a superset of what this
    function produced before.

    String columns are filled with fixed defaults: name -> "Unknown",
    yelping_since -> "1970-01-01", elite -> "", friends -> "".

    Args:
        df: User DataFrame containing the columns listed above.

    Returns:
        DataFrame with nulls replaced as described.
    """
    numeric_cols = ["review_count", "useful", "funny", "cool", "fans", "average_stars"] + [
        name for name in df.columns
        # Skip already-imputed copies so the function does not try to impute
        # its own output columns if it is applied to its own result.
        if name.startswith("compliment_") and not name.endswith("_imputed")
    ]
    imputed_cols = [f"{name}_imputed" for name in numeric_cols]

    imputer = Imputer(
        inputCols=numeric_cols,
        outputCols=imputed_cols
    ).setStrategy("median")

    df = imputer.fit(df).transform(df)

    # Replace each original numeric column with its imputed counterpart while
    # keeping the column order unchanged.
    imputed_for = dict(zip(numeric_cols, imputed_cols))
    df = df.select([
        F.col(imputed_for[name]).alias(name) if name in imputed_for else F.col(name)
        for name in df.columns
    ])

    df = df.na.fill({
        "name": "Unknown",
        "yelping_since": "1970-01-01",
        "elite": "",
        "friends": ""
    })
    return df

df = handle_missing_values(df)
df.show(5)


+--------------------+------+------------+-------------------+------+-----+-----+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+--------------------+--------------+-------------+------------+------------+---------------------+----------------------+-----------------------+--------------------------+-----------------------+-----------------------+-----------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+
|             user_id|  name|review_count|      yelping_since|useful|funny| cool|               elite|             friends|fans|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_wr

In [5]:
def _split_id_list(column):
    """Turn a comma-separated string column into an array of non-blank entries.

    Splitting an empty string yields a one-element array ``[""]``, which would
    make an empty list look like it has one entry. Blank entries are therefore
    removed, and whitespace around each entry is trimmed.
    """
    parts = F.split(F.trim(column), r"\s*,\s*")
    # NOTE(review): the literal "None" is also dropped on the assumption that
    # the source export uses it as an "empty list" placeholder - confirm
    # against the bronze data.
    return F.array_remove(F.array_remove(parts, ""), "None")


def transform_user_data(df):
    """Derive typed and structural columns from the raw user fields.

    Adds/overwrites:
        yelping_since      parsed to a timestamp
        account_age_days   days between yelping_since and the current date
        elite_years        array built from the comma-separated ``elite`` string
        elite_years_count  number of entries in elite_years
        friends_array      array built from the comma-separated ``friends`` string
        friends_count      number of entries in friends_array

    Empty ``elite`` / ``friends`` strings now produce empty arrays (count 0)
    instead of a single blank entry (count 1).

    Args:
        df: User DataFrame with string columns yelping_since, elite, friends.

    Returns:
        DataFrame with the columns above added.
    """
    df = df.withColumn("yelping_since", F.to_timestamp(F.col("yelping_since"))) \
        .withColumn("account_age_days", F.datediff(F.current_date(), F.col("yelping_since"))) \
        .withColumn("elite_years", _split_id_list(F.col("elite"))) \
        .withColumn("elite_years_count", F.size(F.col("elite_years"))) \
        .withColumn("friends_array", _split_id_list(F.col("friends"))) \
        .withColumn("friends_count", F.size(F.col("friends_array")))

    return df

df = transform_user_data(df)
df.show(5)


+--------------------+------+------------+-------------------+------+-----+-----+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+--------------------+--------------+-------------+------------+------------+---------------------+----------------------+-----------------------+--------------------------+-----------------------+-----------------------+-----------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+----------------+--------------------+-----------------+--------------------+-------------+
|             user_id|  name|review_count|      yelping_since|useful|funny| cool|               elite|             friends|fans|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|comp

In [6]:
def feature_engineering(df):
    """Add aggregate and categorical features to the user DataFrame.

    Adds:
        total_compliments  sum of all ``compliment_*`` source columns
        engagement_score   (review_count + fans + friends_count) / account_age_days
        user_status        "Elite" / "Popular" / "Active" / "Regular"
        rating_behavior    "Positive" / "Critical" / "Neutral"

    Args:
        df: User DataFrame that already has friends_count, account_age_days
            and elite_years.

    Returns:
        DataFrame with the four feature columns added.
    """
    # Total number of compliments, built without reduce().
    # Columns ending in "_imputed" are the copies created by
    # handle_missing_values; including them would count every compliment twice.
    compliment_cols = [
        name for name in df.columns
        if name.startswith("compliment_") and not name.endswith("_imputed")
    ]
    total_compliments_expr = F.lit(0)
    for name in compliment_cols:
        # A null in one column is treated as 0 so it does not null out the total.
        total_compliments_expr = total_compliments_expr + F.coalesce(F.col(name), F.lit(0))
    df = df.withColumn("total_compliments", total_compliments_expr)

    # Engagement score based on reviews, fans and number of friends, per day
    # of account age. Left null when the account age is not positive, which
    # avoids a division by zero.
    activity = F.col("review_count") + F.col("fans") + F.col("friends_count")
    df = df.withColumn(
        "engagement_score",
        F.when(F.col("account_age_days") > 0, activity / F.col("account_age_days"))
    )

    # User status, first matching rule wins. Blank placeholder entries are
    # ignored so an array like [""] is not mistaken for an elite year.
    # NOTE(review): "None" is also ignored on the assumption that it is an
    # "empty" placeholder in the source data - confirm.
    elite_entries = F.array_remove(F.array_remove(F.col("elite_years"), ""), "None")
    df = df.withColumn("user_status",
        F.when(F.size(elite_entries) > 0, "Elite")
        .when(F.col("fans") > 10, "Popular")
        .when(F.col("review_count") > 50, "Active")
        .otherwise("Regular"))

    # Classify rating behavior from the average star rating.
    df = df.withColumn("rating_behavior",
        F.when(F.col("average_stars") >= 4.0, "Positive")
        .when(F.col("average_stars") <= 2.0, "Critical")
        .otherwise("Neutral"))

    return df


df = feature_engineering(df)
df.show(5)


+--------------------+------+------------+-------------------+------+-----+-----+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+--------------------+--------------+-------------+------------+------------+---------------------+----------------------+-----------------------+--------------------------+-----------------------+-----------------------+-----------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+----------------+--------------------+-----------------+--------------------+-------------+-----------------+-------------------+-----------+---------------+
|             user_id|  name|review_count|      yelping_since|useful|funny| cool|               elite|             friends|fans|average_stars|comp

In [7]:
def validate_data(df):
    """Keep only rows that pass the basic sanity rules.

    A row is kept when average_stars is between 1 and 5 (inclusive),
    review_count and fans are >= 0, user_id is non-empty, and useful, funny,
    cool plus every column whose name starts with ``compliment_`` are >= 0.
    Rows where any of these comparisons evaluates to null are dropped as well.

    Args:
        df: User DataFrame to validate.

    Returns:
        Filtered DataFrame.
    """
    non_negative_cols = ["useful", "funny", "cool"]
    non_negative_cols += [name for name in df.columns if name.startswith("compliment_")]

    # Start from the core rules, then AND in one non-negativity check per column.
    is_valid = (
        col("average_stars").between(1, 5)
        & (col("review_count") >= 0)
        & (col("fans") >= 0)
        & (length(col("user_id")) > 0)
    )
    for name in non_negative_cols:
        is_valid = is_valid & (col(name) >= 0)

    return df.filter(is_valid)

df = validate_data(df)
df.show(5)


+--------------------+------+------------+-------------------+------+-----+-----+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+--------------------+--------------+-------------+------------+------------+---------------------+----------------------+-----------------------+--------------------------+-----------------------+-----------------------+-----------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+----------------+--------------------+-----------------+--------------------+-------------+-----------------+-------------------+-----------+---------------+
|             user_id|  name|review_count|      yelping_since|useful|funny| cool|               elite|             friends|fans|average_stars|comp

In [8]:
def quality_checks(df):
    """Print three quality reports for ``df`` and return it unchanged.

    Reports, in order:
        1. Null count per column.
        2. Means of review_count, fans, average_stars, total_compliments and
           friends_count.
        3. Number of users per user_status, ordered by status.

    Args:
        df: User DataFrame containing the columns referenced above.

    Returns:
        The same DataFrame that was passed in.
    """
    null_flag_sums = [
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
    null_counts = df.select(null_flag_sums)

    # (source column, output alias) for each averaged metric.
    averaged_metrics = [
        ("review_count", "avg_reviews"),
        ("fans", "avg_fans"),
        ("average_stars", "avg_rating"),
        ("total_compliments", "avg_compliments"),
        ("friends_count", "avg_friends"),
    ]
    user_stats = df.select(
        *[F.mean(source).alias(alias_name) for source, alias_name in averaged_metrics]
    )

    per_status = df.groupBy("user_status").agg(F.count("*").alias("user_count"))
    status_dist = per_status.orderBy("user_status")

    reports = (
        ("Null Counts:", null_counts),
        ("\nUser Statistics:", user_stats),
        ("\nUser Status Distribution:", status_dist),
    )
    for heading, report in reports:
        print(heading)
        report.show()

    return df

df = quality_checks(df)


Null Counts:
+-------+----+------------+-------------+------+-----+----+-----+-------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+--------------------+--------------+-------------+------------+------------+---------------------+----------------------+-----------------------+--------------------------+-----------------------+-----------------------+-----------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+----------------+-----------+-----------------+-------------+-------------+-----------------+----------------+-----------+---------------+
|user_id|name|review_count|yelping_since|useful|funny|cool|elite|friends|fans|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_p

In [9]:
# Redistribute the rows into 200 partitions before the write step.
df = df.repartition(numPartitions=200)

In [None]:
# Write the result as a Delta table, overwriting any existing data and
# partitioning the files by rating_behavior.
try:
    (
        df.write.format("delta")
        .option("mergeSchema", "true")
        .mode("overwrite")
        .partitionBy("rating_behavior")
        .save("D:/Project/delta_lake/silver/users")
    )
except Exception as e:
    print("Error while writing Delta table:", e)
    # Re-raise so a failed write is not mistaken for a successful run; the
    # cell (and any "Run All" execution) stops here instead of continuing
    # silently.
    raise
