# 4. ML Model Training (Distributed)

**Purpose:** This notebook loads the pre-processed Gold ML table (`gold_ml_features_experimental`) and uses the distributed `pyspark.ml` library to train and evaluate classification models.

**Method:**
1.  Load the vectorized Gold table.
2.  Split the data into training and test sets.
3.  Define the ML models (Decision Tree, Random Forest, and Gradient-Boosted Trees) using `pyspark.ml`.
4.  Run MLflow experiments to track results.
5.  Save the best model.

In [0]:
# Scratch directory under /Volumes; later cells in this notebook pass it to
# mlflow.spark.log_model as `dfs_tmpdir`.
TEMP_VOLUME_PATH = "/Volumes/workspace/default/ds_capstone/_mlflow_temp"

# Best-effort creation: a failure is reported but does not stop the notebook.
try:
    dbutils.fs.mkdirs(TEMP_VOLUME_PATH)
except Exception as e:
    print(f"⚠️  Could not create directory (it may already exist): {e}")
else:
    print(f"✅ Created temporary directory: {TEMP_VOLUME_PATH}")

In [0]:
# --- 1. Load Your ML-Ready Gold Table ---

# The Gold table is expected to be already vectorized, with a 'features' and a
# 'label' column (this is verified right after loading).
gold_table_name = "default.gold_ml_features_experimental"

try:
    df_ml = spark.table(gold_table_name)
    print(f"✅ Successfully loaded ML-ready table: {gold_table_name}")
    print(f"Total rows: {df_ml.count():,}")

    print("\nSchema:")
    df_ml.printSchema()

    print("\nSample data:")
    df_ml.show(5)

except Exception as e:
    print(f"❌ ERROR: Could not load table '{gold_table_name}'.")
    print("   Did you run the Gold ETL notebook first?")
    print(f"   Error: {e}")
    # Re-raise so the cell stops here. Swallowing the error would let the split
    # below either die with a confusing NameError or, worse, silently reuse a
    # stale `df_ml` left in the kernel by an earlier run.
    raise

# Fail fast if the columns that the training cells reference are missing.
missing_columns = {"features", "label"} - set(df_ml.columns)
if missing_columns:
    raise ValueError(
        f"Table '{gold_table_name}' is missing required column(s): {sorted(missing_columns)}"
    )

# --- 2. Split the Data ---
# 80/20 random split; the fixed seed keeps the split parameters reproducible.
(training_data, test_data) = df_ml.randomSplit([0.8, 0.2], seed=42)

print("\nData split complete:")
print(f"Training set: {training_data.count():,} rows")
print(f"Test set:     {test_data.count():,} rows")

In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from datetime import datetime

# --- 1. Set up MLflow Experiment ---
# Use the "databricks" tracking URI and select the experiment that the runs
# started in the following cells are recorded under.
experiment_name = "/Users/kshitijmishra231@gmail.com/Flightmasters_Prediction_SparkML"
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment(experiment_name)

print(f"MLflow experiment set to: {experiment_name}")

# --- 2. Define our Evaluator ---
# A single evaluator instance is shared by every experiment. Each evaluate()
# call returns ONE metric ("f1" unless overridden); the experiment cells
# override `metricName` per call to get accuracy, precision and recall.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="label",
)






In [0]:
# --- 3. Experiment 1: Decision Tree ---
# Trains a depth-limited decision tree, evaluates it on the held-out test set
# and records parameters, metrics and the fitted model in one MLflow run.
print("\n=== Experiment 1: Decision Tree ===")
with mlflow.start_run(run_name=f"DecisionTree_{datetime.now().strftime('%Y%m%d_%H%M%S')}") as run:

    # --- a. Configure and Train ---
    # Pin the seed explicitly (same value as the train/test split) so reruns
    # do not depend on the estimator's default seed.
    dt = DecisionTreeClassifier(
        featuresCol="features",
        labelCol="label",
        maxDepth=5,
        seed=42,
    )

    # Log parameters read back from the estimator so the tracked values cannot
    # drift from what was actually trained.
    mlflow.log_params({
        "model_type": "DecisionTreeClassifier",
        "max_depth": dt.getMaxDepth(),
        "seed": dt.getSeed(),
    })

    # Train the model
    dt_model = dt.fit(training_data)

    # --- b. Evaluate ---
    predictions = dt_model.transform(test_data)

    # The shared evaluator returns one metric per call, so override
    # `metricName` for each value instead of building four evaluators.
    f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

    # Log all metrics in a single call
    mlflow.log_metrics({
        "f1_score": f1_score,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
    })

    # --- c. Log Model ---
    # `dfs_tmpdir` is where MLflow stages the Spark model before uploading it.
    mlflow.spark.log_model(dt_model, "spark-model", dfs_tmpdir=TEMP_VOLUME_PATH)

    print(f"Decision Tree F1 Score: {f1_score:.4f}")
    print(f"Run ID: {run.info.run_id}")


In [0]:
# --- 4. Experiment 2: Random Forest ---
# Trains a 100-tree random forest, evaluates it on the held-out test set and
# records parameters, metrics and the fitted model in one MLflow run.
print("\n=== Experiment 2: Random Forest ===")
with mlflow.start_run(run_name=f"RandomForest_{datetime.now().strftime('%Y%m%d_%H%M%S')}") as run:

    # --- a. Configure and Train ---
    # Random forests are stochastic (row and feature sampling), so pin the seed
    # explicitly (same value as the train/test split) to keep reruns comparable.
    rf = RandomForestClassifier(
        featuresCol="features",
        labelCol="label",
        numTrees=100,
        maxDepth=5,
        seed=42,
    )

    # Log parameters read back from the estimator so the tracked values cannot
    # drift from what was actually trained.
    mlflow.log_params({
        "model_type": "RandomForestClassifier",
        "num_trees": rf.getNumTrees(),
        "max_depth": rf.getMaxDepth(),
        "seed": rf.getSeed(),
    })

    # Train the model
    rf_model = rf.fit(training_data)

    # --- b. Evaluate ---
    predictions = rf_model.transform(test_data)

    # The shared evaluator returns one metric per call, so override
    # `metricName` for each value instead of building four evaluators.
    f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

    # Log all metrics in a single call
    mlflow.log_metrics({
        "f1_score": f1_score,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
    })

    # --- c. Log Model ---
    # `dfs_tmpdir` is where MLflow stages the Spark model before uploading it.
    mlflow.spark.log_model(rf_model, "spark-model", dfs_tmpdir=TEMP_VOLUME_PATH)

    print(f"Random Forest F1 Score: {f1_score:.4f}")
    print(f"Run ID: {run.info.run_id}")

In [0]:
# --- 5. Experiment 3: Gradient Boosting (GBT) ---
# Trains a 10-iteration gradient-boosted tree classifier, evaluates it on the
# held-out test set and records parameters, metrics and the fitted model in
# one MLflow run.
print("\n=== Experiment 3: Gradient Boosting (GBT) ===")
with mlflow.start_run(run_name=f"GBT_{datetime.now().strftime('%Y%m%d_%H%M%S')}") as run:

    # --- a. Configure and Train ---
    # Pin the seed explicitly (same value as the train/test split) so reruns
    # do not depend on the estimator's default seed.
    gbt = GBTClassifier(
        featuresCol="features",
        labelCol="label",
        maxIter=10,
        maxDepth=5,
        seed=42,
    )

    # Log parameters read back from the estimator so the tracked values cannot
    # drift from what was actually trained.
    mlflow.log_params({
        "model_type": "GBTClassifier",
        "max_iter": gbt.getMaxIter(),
        "max_depth": gbt.getMaxDepth(),
        "seed": gbt.getSeed(),
    })

    # Train the model
    gbt_model = gbt.fit(training_data)

    # --- b. Evaluate ---
    predictions = gbt_model.transform(test_data)

    # The shared evaluator returns one metric per call, so override
    # `metricName` for each value instead of building four evaluators.
    f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

    # Log all metrics in a single call
    mlflow.log_metrics({
        "f1_score": f1_score,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
    })

    # --- c. Log Model ---
    # `dfs_tmpdir` is where MLflow stages the Spark model before uploading it.
    mlflow.spark.log_model(gbt_model, "spark-model", dfs_tmpdir=TEMP_VOLUME_PATH)

    print(f"GBT F1 Score: {f1_score:.4f}")
    print(f"Run ID: {run.info.run_id}")

print("\n✅ All experiments complete. Check the MLflow UI!")