**Feature Engineering**


In [0]:
%run "/Workspace/Users/sundarasandeepteja@gmail.com/E-Commerce Analytics Medallion Architecture with GenAI/config/project_config"

In [0]:
# Databricks notebook source
# ======================================
# ML: FEATURE ENGINEERING FOR RECOMMENDATIONS
# ======================================

# MAGIC %run ../config/project_config

from pyspark.sql import functions as F
from pyspark.sql.window import Window

print("ü§ñ ML: Feature Engineering for Recommendation Engine")
print("=" * 60)

# ======================================
# STEP 1: LOAD SOURCE DATA
# ======================================
print("\nüì• Step 1: Loading source data...")


def _report_rows(label, frame):
    """Print the row count of `frame` under `label` and hand the frame back.

    Each call triggers one Spark count action.
    """
    print(f"  {label}: {frame.count():,}")
    return frame


# Only transactions whose status is exactly "Completed" are kept.
transactions = _report_rows(
    "Transactions",
    spark.table(SILVER_TRANSACTIONS_TABLE).where(F.col("status") == "Completed"),
)
ratings = _report_rows("Ratings", spark.table(f"{SILVER_DB}.ratings_validated"))
products = _report_rows("Products", spark.table(GOLD_DIM_PRODUCTS_TABLE))
customers = _report_rows("Customers", spark.table(GOLD_DIM_CUSTOMERS_TABLE))

# ======================================
# STEP 2: CREATE PURCHASE FEATURES
# ======================================
print("\nüõí Step 2: Creating purchase features...")

# One output row per (customer, product) pair, summarising all completed
# transactions for that pair.
purchase_aggregations = [
    F.count("transaction_id").alias("purchase_count"),
    F.sum("quantity").alias("total_quantity"),
    F.sum("final_amount").alias("total_spent"),
    F.avg("final_amount").alias("avg_order_value"),
    F.min("transaction_date").alias("first_purchase_date"),
    F.max("transaction_date").alias("last_purchase_date"),
    # Number of distinct transaction_date values for the pair.
    F.countDistinct("transaction_date").alias("purchase_days"),
]
purchase_features = (
    transactions
    .groupBy("customer_id", "product_id")
    .agg(*purchase_aggregations)
)
print(f"  Purchase pairs: {purchase_features.count():,}")

# ======================================
# STEP 3: CREATE RATING FEATURES
# ======================================
print("\n‚≠ê Step 3: Creating rating features...")

# Keep a single rating per (customer, product): the one with the latest
# rating_date.
# NOTE(review): when two ratings share the same rating_date, row_number()
# picks one of them arbitrarily - confirm whether a tie-breaker is needed.
rating_window = (
    Window
    .partitionBy("customer_id", "product_id")
    .orderBy(F.col("rating_date").desc())
)
ranked_ratings = ratings.withColumn("rn", F.row_number().over(rating_window))
latest_ratings = ranked_ratings.where(F.col("rn") == 1)
rating_features = latest_ratings.select(
    "customer_id",
    "product_id",
    "rating",
    "has_review",
    "helpful_votes",
    "sentiment",
    "rating_date",
)
print(f"  Rating pairs: {rating_features.count():,}")

# ======================================
# STEP 4: COMBINE INTERACTIONS
# ======================================
print("\nüîó Step 4: Combining purchase and rating data...")

# A full outer join keeps pairs that were purchased but never rated and
# pairs that were rated but never purchased. The missing side is then
# filled with neutral values; rating == 0 therefore means "no rating".
missing_side_defaults = {
    "purchase_count": 0,
    "total_quantity": 0,
    "total_spent": 0,
    "rating": 0,
    "has_review": False,
    "helpful_votes": 0,
}
interactions = purchase_features.join(
    rating_features,
    on=["customer_id", "product_id"],
    how="full_outer",
).fillna(missing_side_defaults)
print(f"  Total interactions: {interactions.count():,}")

# ======================================
# STEP 5: CREATE IMPLICIT SCORE
# ======================================
print("\nüìä Step 5: Creating implicit score...")


def _saturating_score(column_name, cap):
    """Scale a count column linearly onto 0-5, saturating once it reaches `cap`."""
    return F.least(F.col(column_name), F.lit(cap)) / cap * 5


# Days between today and the most recent purchase of the pair.
days_since_purchase = F.datediff(F.current_date(), F.col("last_purchase_date"))

# 5 = bought within 30 days ... 1 = more than a year ago.
recency_band = (
    F.when(days_since_purchase <= 30, 5)
    .when(days_since_purchase <= 90, 4)
    .when(days_since_purchase <= 180, 3)
    .when(days_since_purchase <= 365, 2)
    .otherwise(1)
)
# Pairs without any purchase date get a recency of 0.
recency_score = F.when(
    F.col("last_purchase_date").isNotNull(), recency_band
).otherwise(0)

interactions = interactions.withColumn(
    "purchase_score", _saturating_score("purchase_count", 10)
)
interactions = interactions.withColumn("recency_score", recency_score)
interactions = interactions.withColumn(
    "quantity_score", _saturating_score("total_quantity", 20)
)

# Weighted blend used when an explicit rating exists (rating > 0).
rated_blend = (
    F.col("rating") * 0.4 +
    F.col("purchase_score") * 0.3 +
    F.col("recency_score") * 0.2 +
    F.col("quantity_score") * 0.1
)
# Weighted blend used when the pair has no rating.
unrated_blend = (
    F.col("purchase_score") * 0.5 +
    F.col("recency_score") * 0.3 +
    F.col("quantity_score") * 0.2
)
interactions = interactions.withColumn(
    "implicit_score",
    F.when(F.col("rating") > 0, rated_blend).otherwise(unrated_blend),
)

# Clamp the blended score: capped at 5 from above and 0.5 from below.
capped_score = F.least(F.col("implicit_score"), F.lit(5))
interactions = interactions.withColumn(
    "implicit_score", F.greatest(capped_score, F.lit(0.5))
)
print("  ‚úÖ Implicit score calculated")

# ======================================
# STEP 6: CREATE NUMERIC IDS FOR ALS (HASH-BASED)
# ======================================
print("\nüî¢ Step 6: Creating numeric IDs for ALS...")

# ALS needs integer user/item columns, so the string IDs are hashed.
# The index values are unchanged from the previous implementation, which
# keeps already-saved mappings and models valid.
# NOTE(review): abs() cannot turn the most negative 32-bit hash into a
# positive int - confirm how the cluster's ANSI setting handles that case.
interactions = interactions.withColumn(
    "customer_idx", F.abs(F.hash(F.col("customer_id")))
).withColumn(
    "product_idx", F.abs(F.hash(F.col("product_id")))
)

# hash() is not injective: two different IDs can land on the same index, in
# which case ALS would silently train them as a single user/item and the
# saved mapping tables would be ambiguous. Detect that instead of
# continuing with corrupted training data.
for _id_col, _idx_col, _label in (
    ("customer_id", "customer_idx", "customers"),
    ("product_id", "product_idx", "products"),
):
    _distinct_ids = interactions.select(_id_col).distinct().count()
    _distinct_idx = interactions.select(_idx_col).distinct().count()
    print(f"  Unique {_label}: {_distinct_idx:,}")
    if _distinct_idx != _distinct_ids:
        raise ValueError(
            f"Hash collision in {_idx_col}: {_distinct_ids:,} distinct {_id_col} "
            f"values map to only {_distinct_idx:,} indices. Use a collision-free "
            f"index (e.g. a dense rank over distinct {_id_col}) before training ALS."
        )

ml_features = interactions

# ======================================
# STEP 7: ADD CONTEXTUAL FEATURES
# ======================================
print("\nüè∑Ô∏è Step 7: Adding contextual features...")

# Left joins: every interaction row is kept even when the customer or
# product is missing from its dimension table (context columns are then null).
# NOTE(review): assumes one row per customer_id / product_id in the
# dimension tables; duplicates there would multiply interaction rows - confirm.
customer_context = customers.select("customer_id", "clv_segment", "region", "age_group")
product_context = products.select("product_id", "category", "price_tier", "brand")

ml_features = (
    ml_features
    .join(customer_context, on="customer_id", how="left")
    .join(product_context, on="product_id", how="left")
)
print("  ‚úÖ Context features added")

# ======================================
# STEP 8: SAVE FEATURES AND MAPPINGS
# ======================================
print("\nüíæ Step 8: Saving features and mappings...")

# Columns persisted to the feature table, grouped by role.
feature_table_columns = [
    # identifiers and their ALS indices
    "customer_id", "product_id", "customer_idx", "product_idx",
    # scores
    "implicit_score", "rating", "purchase_score", "recency_score", "quantity_score",
    # raw purchase aggregates
    "purchase_count", "total_quantity", "total_spent",
    # customer / product context
    "clv_segment", "region", "age_group", "category", "price_tier", "brand",
    # purchase date range
    "first_purchase_date", "last_purchase_date",
]
final_features = ml_features.select(*feature_table_columns)

features_table = f"{SILVER_DB}.ml_interaction_features"
final_features.write.format("delta").mode("overwrite").saveAsTable(features_table)
print(f"  ‚úÖ Saved: {features_table}")


def _save_mapping(mapping, folder):
    """Overwrite the Delta mapping stored under ML_PATH/`folder` and report it."""
    target = f"{ML_PATH}/{folder}"
    mapping.write.format("delta").mode("overwrite").save(target)
    print(f"  ‚úÖ Saved: {target}")


# ID <-> index lookups, used to translate ALS indices back to the original IDs.
customer_mapping = ml_features.select("customer_id", "customer_idx").distinct()
_save_mapping(customer_mapping, "customer_mapping")

product_mapping = ml_features.select("product_id", "product_idx").distinct()
_save_mapping(product_mapping, "product_mapping")

# ======================================
# STEP 9: SUMMARY STATISTICS
# ======================================
print("\nüìä Step 9: Feature Summary...")


def _score_breakdown(dimension):
    """Interaction count and mean implicit score (2 decimals) per `dimension` value.

    Rows are ordered by interaction count, largest first.
    """
    grouped = final_features.groupBy(dimension).agg(
        F.count("*").alias("interactions"),
        F.round(F.avg("implicit_score"), 2).alias("avg_score"),
    )
    return grouped.orderBy(F.desc("interactions"))


# Distribution of the target score, then its breakdown by customer segment
# and by product category.
display(final_features.select("implicit_score").summary())
display(_score_breakdown("clv_segment"))
display(_score_breakdown("category"))

print("\n" + "=" * 60)
print("ü§ñ FEATURE ENGINEERING COMPLETE!")
print("=" * 60)

**Model Training with MLflow**

In [0]:
# Databricks notebook source
# ======================================
# ML: TRAIN ALS RECOMMENDATION MODEL
# ======================================

# MAGIC %run ../config/project_config

from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark
from datetime import datetime

print("ü§ñ ML: Training ALS Recommendation Model")
print("=" * 60)

# ======================================
# STEP 1: LOAD ML FEATURES
# ======================================
print("\nüì• Step 1: Loading ML features...")

ml_data = spark.table(f"{SILVER_DB}.ml_interaction_features")

# ALS consumes the two index columns and the score; the original string IDs
# are carried along as well. Rows missing any of the three ALS inputs are
# dropped.
als_inputs_present = (
    F.col("customer_idx").isNotNull()
    & F.col("product_idx").isNotNull()
    & F.col("implicit_score").isNotNull()
)
als_data = ml_data.select(
    "customer_idx", "product_idx", "implicit_score", "customer_id", "product_id"
).where(als_inputs_present)

print(f"  Total interactions: {als_data.count():,}")
for _label, _index_column in (("customers", "customer_idx"), ("products", "product_idx")):
    _distinct = als_data.select(_index_column).distinct().count()
    print(f"  Unique {_label}: {_distinct:,}")

# ======================================
# STEP 2: TRAIN/TEST SPLIT
# ======================================
print("\nüìä Step 2: Splitting data...")

# Seeded random split; TRAIN_TEST_SPLIT is the training fraction.
split_weights = [TRAIN_TEST_SPLIT, 1 - TRAIN_TEST_SPLIT]
train_data, test_data = als_data.randomSplit(split_weights, seed=RANDOM_SEED)

for _split_label, _split_frame in (("Training set", train_data), ("Test set", test_data)):
    print(f"  {_split_label}: {_split_frame.count():,}")

# The splits are deliberately not cached: cache() was removed for
# serverless compute.

# ======================================
# STEP 3: SETUP MLFLOW
# ======================================
print("\nüî¨ Step 3: Setting up MLflow...")

experiment_name = "/ecommerce-recommendation-engine"
mlflow.set_experiment(experiment_name)

print(f"  Experiment: {experiment_name}")

# ======================================
# STEP 4: TRAIN ALS MODEL
# ======================================
print("\nüèãÔ∏è Step 4: Training ALS model...")

# Close any run left active by an earlier execution of this cell so the new
# run does not collide with it.
if mlflow.active_run():
    mlflow.end_run()
run_name = f"als_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# Number of training rows pulled to the driver to infer the model signature.
# Only column names and dtypes matter for the signature, so a small sample
# is enough and avoids collecting the whole training set.
SIGNATURE_SAMPLE_ROWS = 100

with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params({
        "algorithm": "ALS",
        "max_iter": ALS_MAX_ITER,
        "reg_param": ALS_REG_PARAM,
        "rank": ALS_RANK,
        "train_size": train_data.count(),
        "test_size": test_data.count(),
        "implicit_prefs": False,
        "cold_start_strategy": "drop",
    })

    als = ALS(
        maxIter=ALS_MAX_ITER,
        regParam=ALS_REG_PARAM,
        rank=ALS_RANK,
        userCol="customer_idx",
        itemCol="product_idx",
        ratingCol="implicit_score",
        coldStartStrategy="drop",
        nonnegative=True,
        seed=RANDOM_SEED
    )

    print("  Training model...")
    model = als.fit(train_data)
    print("  ‚úÖ Model trained")

    print("  Generating predictions...")
    predictions = model.transform(test_data)
    predictions = predictions.filter(F.col("prediction").isNotNull())

    print("\nüìä Step 5: Evaluating model...")

    # Same label/prediction columns for every metric; only the metric differs.
    evaluation = {
        metric: RegressionEvaluator(
            metricName=metric,
            labelCol="implicit_score",
            predictionCol="prediction"
        ).evaluate(predictions)
        for metric in ("rmse", "mae", "r2")
    }
    rmse = evaluation["rmse"]
    mae = evaluation["mae"]
    r2 = evaluation["r2"]

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    print("\n  üìà Model Performance:")
    print(f"     RMSE: {rmse:.4f}")
    print(f"     MAE:  {mae:.4f}")
    print(f"     R¬≤:   {r2:.4f}")

    print("\nüíæ Step 6: Saving model...")

    from mlflow.models.signature import infer_signature

    # Infer the signature from a bounded sample rather than calling
    # toPandas() on the full training set and on a full re-scoring of it.
    signature_sample = train_data.select(
        "customer_idx", "product_idx", "implicit_score"
    ).limit(SIGNATURE_SAMPLE_ROWS)
    signature = infer_signature(
        signature_sample.toPandas(),
        model.transform(signature_sample).select("prediction").toPandas()
    )

    input_example = signature_sample.limit(1).toPandas()

    # Log and register the model with MLflow.
    mlflow.spark.log_model(
        model,
        "als_model",
        registered_model_name="workspace.ecommerce_silver.ecommerce_recommendation_als",
        dfs_tmpdir="/Volumes/workspace/ecommerce_silver/ml_models/tmp",
        signature=signature,
        input_example=input_example
    )

    # Also keep a plain Spark ML copy under ML_PATH, overwriting any previous one.
    model_path = f"{ML_PATH}/als_model"
    model.write().overwrite().save(model_path)
    mlflow.log_param("model_path", model_path)
    print(f"  ‚úÖ Model saved to: {model_path}")
    print("  ‚úÖ Model logged to MLflow")
    run_id = run.info.run_id
    print(f"  üìù MLflow Run ID: {run_id}")

print("\nüìã Step 7: Sample Predictions...")

# Top 20 test rows by actual score, with the absolute prediction error.
# The result must be assigned: DataFrame transformations return a new frame,
# so an unassigned chain is discarded and display() would show the full,
# unsorted predictions without the error column.
sample_predictions = predictions.select(
    "customer_idx", "product_idx", "implicit_score", "prediction"
).withColumn(
    "error", F.abs(F.col("implicit_score") - F.col("prediction"))
).orderBy(
    F.desc("implicit_score")
).limit(20)

display(sample_predictions)

# No unpersist() here: the splits are never cached (removed for serverless).

print("\n" + "=" * 60)
print("ü§ñ ALS MODEL TRAINING COMPLETE!")
print("=" * 60)

In [0]:
import os

# Point SPARKML_TEMP_DFS_PATH at the project's model volume.
# NOTE(review): presumably read by Spark ML / MLflow as the staging directory
# when saving models - confirm, and run this cell before any model is logged.
os.environ.update(
    {"SPARKML_TEMP_DFS_PATH": "/Volumes/workspace/ecommerce_silver/ml_models/tmp"}
)

In [0]:
# Databricks notebook source
# ======================================
# ML: HYPERPARAMETER TUNING
# ======================================

# MAGIC %run ../config/project_config

from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow

print("ü§ñ ML: Hyperparameter Tuning")
print("=" * 60)

# Load data: only the three columns ALS needs, skipping rows that have no
# customer index.
tuning_columns = ("customer_idx", "product_idx", "implicit_score")
ml_data = (
    spark.table(f"{SILVER_DB}.ml_interaction_features")
    .select(*tuning_columns)
    .where(F.col("customer_idx").isNotNull())
)

# Fixed 80/20 split with a fixed seed.
train_data, test_data = ml_data.randomSplit([0.8, 0.2], seed=42)

print(f"Training set: {train_data.count():,}")

# ======================================
# SETUP GRID SEARCH
# ======================================
print("\nüîß Setting up hyperparameter grid...")

# Base estimator; rank, maxIter and regParam are supplied by the grid below.
als = ALS(
    userCol="customer_idx",
    itemCol="product_idx",
    ratingCol="implicit_score",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)

# Parameter grid: 3 ranks x 2 iteration counts x 3 regularisation strengths.
search_space = {
    als.rank: [10, 20, 30],
    als.maxIter: [10, 15],
    als.regParam: [0.01, 0.1, 0.5],
}
grid_builder = ParamGridBuilder()
for _param, _candidates in search_space.items():
    grid_builder = grid_builder.addGrid(_param, _candidates)
param_grid = grid_builder.build()

print(f"  Total combinations: {len(param_grid)}")

# Evaluator: RMSE between the implicit score and the ALS prediction.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="implicit_score",
    predictionCol="prediction",
)

# Cross-validator: 3 folds, up to 4 candidate models fitted in parallel.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4,
    seed=42,
)

# ======================================
# RUN CROSS-VALIDATION
# ======================================
print("\nüèãÔ∏è Running cross-validation (this may take a while)...")

mlflow.set_experiment("/ecommerce-recommendation-tuning")

with mlflow.start_run(run_name="als_hyperparameter_tuning"):

    # Fit CV
    cv_model = cv.fit(train_data)

    # Best model
    best_model = cv_model.bestModel

    # Best parameters.
    # The fitted ALSModel exposes `rank` but has no regParam / maxIter
    # attributes, so the winning values are read from the param map whose
    # cross-validated average metric is best (lowest for RMSE).
    avg_metrics = cv_model.avgMetrics
    pick_best = max if evaluator.isLargerBetter() else min
    best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
    best_params = cv.getEstimatorParamMaps()[best_index]

    best_rank = best_params[als.rank]
    best_reg = best_params[als.regParam]
    best_iter = best_params[als.maxIter]

    print("\nüìä Best Parameters:")
    print(f"   Rank: {best_rank}")
    print(f"   RegParam: {best_reg}")
    print(f"   MaxIter: {best_iter}")

    # Evaluate best model on the held-out test split
    predictions = best_model.transform(test_data)
    rmse = evaluator.evaluate(predictions)

    print(f"\nüìà Best Model RMSE: {rmse:.4f}")

    # Log to MLflow
    mlflow.log_param("best_rank", best_rank)
    mlflow.log_param("best_reg_param", best_reg)
    mlflow.log_param("best_max_iter", best_iter)
    mlflow.log_metric("best_rmse", rmse)

    # Save best model
    best_model.write().overwrite().save(f"{ML_PATH}/als_model_tuned")
    mlflow.spark.log_model(best_model, "best_als_model")

    print(f"\n‚úÖ Best model saved to: {ML_PATH}/als_model_tuned")

# No unpersist() here: train_data is never cached in this cell, matching the
# training cell where cache/unpersist were removed for serverless compute.

print("\nü§ñ HYPERPARAMETER TUNING COMPLETE!")