In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from delta import *

In [None]:
# Build a Delta-enabled Spark session.
#
# No explicit "spark.jars.packages" entry is set here on purpose:
# configure_spark_with_delta_pip() sets that key itself, using the Delta
# artifact that matches the pip-installed delta-spark package, so any value
# configured beforehand is overwritten. The previously hard-coded coordinate
# ("io.delta:delta-core_2.12:3.1.0") was also wrong for Delta 3.x, where the
# artifact is published as "delta-spark" rather than "delta-core".
builder = (
    SparkSession.builder
    .appName("Users to Silver")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
def load_users_data(spark, path):
    """Read the user JSON file at ``path`` using a fixed, explicit schema.

    Args:
        spark: Spark session whose ``read`` interface is used.
        path: Location of the JSON input.

    Returns:
        A DataFrame with the columns listed below, all nullable, in this order.
    """
    # (column name, Spark type) pairs; order defines the resulting column order.
    field_specs = [
        ("user_id", StringType()),
        ("name", StringType()),
        ("review_count", IntegerType()),
        ("yelping_since", StringType()),
        ("useful", IntegerType()),
        ("funny", IntegerType()),
        ("cool", IntegerType()),
        ("elite", StringType()),
        ("friends", StringType()),
        ("fans", IntegerType()),
        ("average_stars", DoubleType()),
        ("compliment_hot", IntegerType()),
        ("compliment_more", IntegerType()),
        ("compliment_profile", IntegerType()),
        ("compliment_cute", IntegerType()),
        ("compliment_list", IntegerType()),
        ("compliment_note", IntegerType()),
        ("compliment_plain", IntegerType()),
        ("compliment_cool", IntegerType()),
        ("compliment_funny", IntegerType()),
        ("compliment_writer", IntegerType()),
        ("compliment_photos", IntegerType()),
    ]
    users_schema = StructType(
        [StructField(field_name, field_type, True) for field_name, field_type in field_specs]
    )

    reader = spark.read
    return reader.json(path, schema=users_schema)

# Load the raw user file from the bronze folder and preview the first rows.
bronze_users_path = "D:/Project/delta_lake/bronze/yelp_academic_dataset_user.json"
df = load_users_data(spark, bronze_users_path)
df.show(n=5)


In [None]:
def handle_missing_values(df):
    """Fill missing values in the user DataFrame.

    Numeric columns (the fixed counters below plus every column whose name
    starts with ``compliment_``) are imputed with their median. String columns
    get fixed defaults.

    The imputed values are written back onto the original column names.
    Previously they were left in separate ``<col>_imputed`` columns that no
    later step read, so the original columns kept their nulls, and the extra
    ``compliment_*_imputed`` columns were picked up by later
    ``startswith("compliment_")`` scans.

    Args:
        df: Input DataFrame containing the columns referenced below.

    Returns:
        A DataFrame with the same columns, in the same order, as ``df``.
    """
    numeric_cols = ["review_count", "useful", "funny", "cool", "fans", "average_stars"] + \
                   [c for c in df.columns if c.startswith("compliment_")]

    # Temporary output names for the imputer; they are dropped again below.
    imputed_names = {c: f"{c}_imputed" for c in numeric_cols}

    imputer = Imputer(
        inputCols=numeric_cols,
        outputCols=[imputed_names[c] for c in numeric_cols]
    ).setStrategy("median")

    original_columns = df.columns
    with_imputed = imputer.fit(df).transform(df)

    # Re-project to the original column list, substituting each numeric
    # column with its imputed counterpart. This also drops the temporaries.
    df = with_imputed.select([
        F.col(imputed_names[c]).alias(c) if c in imputed_names else F.col(c)
        for c in original_columns
    ])

    df = df.na.fill({
        "name": "Unknown",
        "yelping_since": "1970-01-01",
        "elite": "",
        "friends": ""
    })
    return df

# Rebind the working frame to its null-handled version and preview it.
df = handle_missing_values(df=df)
df.show(n=5)

In [None]:
def standardize_data(df):
    """Normalize text/date columns and derive list-style columns.

    Produces:
        name: lower-cased and trimmed.
        yelping_since: parsed to a timestamp.
        account_age_days: days between today and ``yelping_since``.
        friends_array / friends_count: entries of the comma-separated
            ``friends`` string and how many there are.
        elite_years: entries of the comma-separated ``elite`` string.

    Args:
        df: DataFrame with ``name``, ``yelping_since``, ``friends`` and
            ``elite`` string columns.

    Returns:
        ``df`` with the columns above replaced or added.
    """

    def _csv_to_array(column_name):
        # Splitting an empty string yields [""] (size 1), which would count a
        # user with no entries as having one. Empty tokens are therefore
        # removed, nulls are treated as empty, and whitespace around the
        # separators is not kept in the tokens.
        # NOTE(review): if the source encodes "no entries" with a sentinel
        # word instead of an empty string, it is still counted as one entry;
        # confirm against the data.
        cleaned = trim(coalesce(col(column_name), lit("")))
        return array_remove(split(cleaned, r"\s*,\s*"), "")

    return (
        df
        .withColumn("name", trim(lower(col("name"))))
        .withColumn("yelping_since", to_timestamp(col("yelping_since")))
        .withColumn("account_age_days", datediff(current_date(), col("yelping_since")))
        .withColumn("friends_array", _csv_to_array("friends"))
        .withColumn("friends_count", size(col("friends_array")))
        .withColumn("elite_years", _csv_to_array("elite"))
    )

# Rebind the working frame to its standardized version and preview it.
df = standardize_data(df=df)
df.show(n=5)

In [None]:
def feature_engineering(df):
    """Add derived user features.

    Adds:
        total_compliments: row-wise sum of all ``compliment_*`` columns, with
            nulls counted as 0.
        engagement_score: (review_count + fans + friends_count) divided by
            account_age_days; null when the account age is 0.
        user_status: "Elite", "Popular", "Active" or "Regular" (first match wins).
        rating_behavior: "Positive", "Critical" or "Neutral" based on
            average_stars.

    Args:
        df: DataFrame containing the ``compliment_*`` columns plus
            ``review_count``, ``fans``, ``friends_count``,
            ``account_age_days``, ``elite_years`` and ``average_stars``.

    Returns:
        ``df`` with the four columns above added.
    """
    from functools import reduce
    from operator import add

    # Skip "*_imputed" copies so a compliment is never counted twice.
    compliment_cols = [
        c for c in df.columns
        if c.startswith("compliment_") and not c.endswith("_imputed")
    ]

    # The star import from pyspark.sql.functions rebinds `sum` to Spark's
    # aggregate function, which rejects a list of columns. Fold with `+`.
    total_compliments = reduce(
        add,
        [coalesce(col(c), lit(0)) for c in compliment_cols],
        lit(0),
    )
    df = df.withColumn("total_compliments", total_compliments)

    # Guard the division so a zero account age gives null instead of an error.
    activity = col("review_count") + col("fans") + col("friends_count")
    df = df.withColumn("engagement_score",
        when(col("account_age_days") != 0, activity / col("account_age_days")))

    # Splitting an empty string yields [""], whose size is 1. Ignore empty
    # tokens so users without elite years are not labelled "Elite".
    has_elite_year = size(array_remove(col("elite_years"), "")) > 0

    df = df.withColumn("user_status",
        when(has_elite_year, "Elite")
        .when(col("fans") > 10, "Popular")
        .when(col("review_count") > 50, "Active")
        .otherwise("Regular"))

    df = df.withColumn("rating_behavior",
        when(col("average_stars") >= 4.0, "Positive")
        .when(col("average_stars") <= 2.0, "Critical")
        .otherwise("Neutral"))

    return df

# Rebind the working frame to its feature-enriched version and preview it.
df = feature_engineering(df=df)
df.show(n=5)


In [None]:
def validate_data(df):
    """Keep only rows that pass basic sanity checks.

    A row is kept when average_stars is between 1 and 5, review_count and
    fans are non-negative, user_id is non-empty, and ``useful``, ``funny``,
    ``cool`` and every ``compliment_*`` column are non-negative.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        The filtered DataFrame.
    """
    rating_in_range = col("average_stars").between(1, 5)
    reviews_ok = col("review_count") >= 0
    fans_ok = col("fans") >= 0
    has_user_id = length(col("user_id")) > 0
    df = df.filter(rating_in_range & reviews_ok & fans_ok & has_user_id)

    non_negative_cols = ["useful", "funny", "cool"]
    non_negative_cols.extend(
        name for name in df.columns if name.startswith("compliment_")
    )

    # Apply one non-negativity filter per column.
    for name in non_negative_cols:
        df = df.filter(col(name) >= 0)

    return df

# Rebind the working frame to its validated version and preview it.
df = validate_data(df=df)
df.show(n=5)


In [None]:
def quality_checks(df):
    """Print simple data-quality diagnostics and return ``df`` unchanged.

    Reports the per-column null counts, the number of fully duplicated rows,
    and the means of a few numeric columns.

    Args:
        df: DataFrame to inspect; must contain the columns averaged below.

    Returns:
        The same DataFrame that was passed in.
    """
    # One aggregate per column: number of rows where that column is null.
    null_exprs = [
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
    null_counts = df.select(null_exprs)

    total_rows = df.count()
    distinct_rows = df.dropDuplicates().count()
    duplicate_count = total_rows - distinct_rows

    # (source column, output alias) pairs for the mean summary.
    mean_specs = [
        ("review_count", "mean_reviews"),
        ("fans", "mean_fans"),
        ("average_stars", "mean_rating"),
        ("total_compliments", "mean_compliments"),
        ("engagement_score", "mean_engagement"),
    ]
    value_dist = df.select(
        *[F.mean(source).alias(alias) for source, alias in mean_specs]
    )

    print("Null Counts:")
    null_counts.show()
    print(f"\nDuplicate Count: {duplicate_count}")
    print("\nValue Distributions:")
    value_dist.show()

    return df

# Print the quality report; the returned frame is the same one passed in.
df = quality_checks(df=df)


In [None]:
# Persist the result as a Delta table, overwriting any existing data and
# partitioning the output by user_status.
silver_users_path = "D:/Project/delta_lake/silver/users"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("user_status")
    .save(silver_users_path)
)
