### Initialize Libraries

In [0]:
# Standard Data Libraries
import pandas as pd
import numpy as np

# MLflow Experiment Tracking
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

# Scikit-Learn Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Spark ML Tools
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

# --- Configuration: Tell MLflow where to save our work ---
user_name = spark.sql("SELECT current_user()").collect()[0][0]
mlflow.set_experiment(f"/Users/{user_name}/product_performance_model_v1")

print("Environment and tools are ready.")

2026/01/21 16:07:10 INFO mlflow.tracking.fluent: Experiment with name '/Users/tbhavya054@gmail.com/product_performance_model_v1' does not exist. Creating a new experiment.


Environment and tools are ready.


### Prepare the Data

In [0]:
# Load the data from cleaned table
table_name = "ecommerce.fact_product_performance"
spark_df = spark.table(table_name)

# Prepare Pandas data for Scikit-Learn experimentation
# Define input features (X) and the target to predict (y)
# Split the data: 80% for training and 20% for testing

df = spark_df.toPandas()
X = df[["views", "revenue"]] 
y = df["purchases"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare distributed Spark data for pipeline
train_spark, test_spark = spark_df.randomSplit([0.8, 0.2], seed=42)

### Model Experimentation & Performance Tracking

In [0]:
# Create an empty list to store results for final comparison
results = []

# Define and compare 3 different models to test 
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5),
    "random_forest": RandomForestRegressor(n_estimators=100)
}

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_experiment"):
        mlflow.log_param("model_type", name)
        
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        
        # Persisting the metrics and model signature
        mlflow.log_metric("r2_score", score)
        signature = infer_signature(X_test, model.predict(X_test))
        mlflow.sklearn.log_model(model, "model", signature=signature)
        
        # Add the name and score to results list
        results.append((name, score))
        print(f"Successfully tracked {name}")

# Comparison and Selection of ml model
print("\n--- Model Performance Comparison ---")

# Loop through the results list and print each one
for name, score in results:
    print(f"Model: {name} | R2 Score: {score:.4f}")

# identify the model with the highest score
best_model_name, best_model_score = max(results, key=lambda x: x[1])
print(f"\nThe best model is '{best_model_name}' with an R2 of {best_model_score:.4f}")



Successfully tracked linear




Successfully tracked decision_tree




Successfully tracked random_forest

--- Model Performance Comparison ---
Model: linear | R2 Score: 0.9300
Model: decision_tree | R2 Score: 0.8162
Model: random_forest | R2 Score: 0.9332

The best model is 'random_forest' with an R2 of 0.9332


### Build the Spark Pipeline (Using Random Forest)

In [0]:
from pyspark.ml.regression import RandomForestRegressor as SparkRF

# Group 'views' and 'revenue' into a single features column
assembler = VectorAssembler(inputCols=["views", "revenue"], outputCol="features")

# Set up the Random Forest model (our winner)
rf_spark = SparkRF(featuresCol="features", labelCol="purchases")

# Create the pipeline (the assembly line)
pipeline = Pipeline(stages=[assembler, rf_spark])

# Train the model using Spark data
pipeline_model = pipeline.fit(train_spark)

print("Pipeline is trained using the best model (Random Forest).")

Pipeline is trained using the best model (Random Forest).


### Model Application & Evaluation

In [0]:
# Apply the pipeline to the test data to get results
final_predictions = pipeline_model.transform(test_spark)

# Show the real values vs. what the model calculated
display(final_predictions.select("views", "revenue", "purchases", "prediction").limit(5))

print("Model Predictions Completed Successfully")


views,revenue,purchases,prediction
19722,4129.620000000001,36,185.16164334143232
48774,119711.03999999994,318,482.6030897051977
1248,0.0,0,1.5253590910544628
36,0.0,0,0.0041111004055122
13236,0.0,0,163.68523446723057


Model Predictions Completed Successfully
