#### **Train 3 Different Models**

In [0]:
import os
import mlflow
import mlflow.spark

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor
)
from pyspark.ml.evaluation import RegressionEvaluator

# Scratch directory used by MLflow's Spark flavor when saving models
# (the original note marks this as required for Unity Catalog).
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/default/kaggle_volume/mlflow_tmp"

# Read the gold table and keep only the predictor and the target,
# discarding any row where either value is null.
daily_sales = spark.table("ecommerce.gold.daily_sales")
data = daily_sales.select("total_events", "total_revenue").na.drop()

# 80/20 train/test split with a fixed seed.
train_df, test_df = data.randomSplit([0.8, 0.2], seed=42)

# Pack the single predictor column into the vector column Spark ML expects.
assembler = VectorAssembler(inputCols=["total_events"], outputCol="features")

# Only the assembled features and the label are needed downstream.
model_columns = ("features", "total_revenue")
train_vec = assembler.transform(train_df).select(*model_columns)
test_vec = assembler.transform(test_df).select(*model_columns)

# Candidate regressors keyed by the display name used for the MLflow run.
# All of them predict the same label column.
label_column = "total_revenue"

models = {
    "LinearRegression": LinearRegression(labelCol=label_column),
    "DecisionTree": DecisionTreeRegressor(labelCol=label_column, maxDepth=5),
    "RandomForest": RandomForestRegressor(labelCol=label_column, numTrees=50),
}

# One shared evaluator: RMSE between the label and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=label_column,
    predictionCol="prediction",
)

# All comparison runs are recorded under one shared experiment.
mlflow.set_experiment("/Shared/mlflow_model_comparison")

# One MLflow run per candidate: fit on the training vectors, score the
# held-out vectors, then log the RMSE and the fitted model.
for model_name, estimator in models.items():
    with mlflow.start_run(run_name=model_name):
        mlflow.log_params({"model": model_name, "features": "total_events"})

        fitted_model = estimator.fit(train_vec)
        predictions = fitted_model.transform(test_vec)
        rmse = evaluator.evaluate(predictions)

        mlflow.log_metric("rmse", rmse)
        mlflow.spark.log_model(fitted_model, "model")

        print(f"{model_name} | RMSE = {rmse:.2f}")




LinearRegression | RMSE = 5108240.78




DecisionTree | RMSE = 5832868.43




RandomForest | RMSE = 5680350.66


#### **Compare Metrics in MLflow**

In [0]:
import mlflow

# Query the comparison experiment by name instead of relying on whichever
# experiment happens to be active in the kernel. With no arguments,
# search_runs() reads the active experiment, so on a fresh kernel or after
# another set_experiment() call it would return unrelated runs.
runs = mlflow.search_runs(
    experiment_names=["/Shared/mlflow_model_comparison"]
)

# Show only the run name, the logged RMSE and the logged model name.
runs[[
    "tags.mlflow.runName",
    "metrics.rmse",
    "params.model"
]]


Unnamed: 0,tags.mlflow.runName,metrics.rmse,params.model
0,RandomForest,5680351.0,RandomForest
1,DecisionTree,5832868.0,DecisionTree
2,LinearRegression,5108241.0,LinearRegression
3,linear_model,5680351.0,LinearRegression


In [0]:
# Map MLflow's namespaced column names onto short, analysis-friendly ones.
column_aliases = {
    "tags.mlflow.runName": "run_name",
    "metrics.rmse": "rmse",
    "params.model": "model",
}

# Keep only the aliased columns (in this order), then apply the short names.
clean_runs = runs[list(column_aliases)].rename(columns=column_aliases)

clean_runs


Unnamed: 0,run_name,rmse,model
0,RandomForest,5680351.0,RandomForest
1,DecisionTree,5832868.0,DecisionTree
2,LinearRegression,5108241.0,LinearRegression
3,linear_model,5680351.0,LinearRegression


#### **Build Spark ML Pipeline**

In [0]:
import os
import mlflow
import mlflow.spark

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Scratch directory used by MLflow's Spark flavor when saving models.
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/default/kaggle_volume/mlflow_tmp"

# Select the experiment explicitly so this cell logs next to the other
# comparison runs even when it is executed on its own; otherwise the run
# lands in whatever experiment happens to be active in the kernel.
mlflow.set_experiment("/Shared/mlflow_model_comparison")

# Load the predictor and target columns, dropping rows with nulls.
data = (
    spark.table("ecommerce.gold.daily_sales")
    .select("total_events", "total_revenue")
    .dropna()
)

# 80/20 train/test split with a fixed seed.
train, test = data.randomSplit([0.8, 0.2], seed=42)

# Stage 1: assemble the single predictor into a feature vector.
assembler = VectorAssembler(
    inputCols=["total_events"],
    outputCol="features"
)

# Stage 2: random forest regressor with 50 trees.
rf = RandomForestRegressor(
    labelCol="total_revenue",
    numTrees=50
)

# The pipeline takes the raw columns, so the fitted model can be applied
# directly to un-assembled data.
pipeline = Pipeline(stages=[assembler, rf])

# The evaluator does not depend on the run, so build it before the run starts.
evaluator = RegressionEvaluator(
    labelCol="total_revenue",
    predictionCol="prediction",
    metricName="rmse"
)

with mlflow.start_run(run_name="RandomForest_Pipeline"):
    # Fit on the raw training rows and score the held-out rows.
    pipeline_model = pipeline.fit(train)
    predictions = pipeline_model.transform(test)

    rmse = evaluator.evaluate(predictions)

    # Record what was trained, how well it did, and the fitted pipeline itself.
    mlflow.log_param("model", "RandomForestPipeline")
    mlflow.log_param("features", "total_events")
    mlflow.log_metric("rmse", rmse)
    mlflow.spark.log_model(pipeline_model, "pipeline_model")

    print("Pipeline RMSE:", rmse)




Pipeline RMSE: 5680350.65519515


#### **Select Best Model**

In [0]:
# Rank the collected runs by RMSE (lower is better) and take the top row.
ranked_runs = clean_runs.sort_values(by="rmse", ascending=True)
best_run = ranked_runs.iloc[0]

print("Best Model:", best_run.loc["model"])
print("Best RMSE:", best_run.loc["rmse"])


Best Model: LinearRegression
Best RMSE: 5108240.7764305165
