In [None]:
# Databricks notebook source
# ==================================================================================
# 🚀 TRAINING SCRIPT - CONFIG DRIVEN (FIXED VERSION)
# ==================================================================================
# Now reads from pipeline_config.yml - No hardcoding!
# Your config.yml remains unchanged - only experiment parameters
# ==================================================================================

%pip install xgboost

import mlflow
import yaml
import numpy as np
import pandas as pd
import warnings
import os
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

warnings.filterwarnings("ignore")

print("=" * 80)
print("üöÄ TRAINING PIPELINE STARTED (CONFIG-DRIVEN)")
print("=" * 80)

# ==================================================================================
# ✅ LOAD PIPELINE CONFIGURATION (REPLACES HARDCODED VALUES)
# ==================================================================================
# Reads pipeline_config.yml (relative path, so resolved against the current
# working directory) and exposes its values as the module-level constants that
# the functions further down read.
print("\nüìã Loading pipeline configuration from pipeline_config.yml...")

try:
    # Explicit UTF-8 so parsing does not depend on the platform's locale default.
    with open("pipeline_config.yml", "r", encoding="utf-8") as f:
        pipeline_cfg = yaml.safe_load(f)

    # Extract configuration values
    MODEL_TYPE = pipeline_cfg["model"]["type"]
    EXPERIMENT_NAME = pipeline_cfg["experiment"]["name"]
    MODEL_ARTIFACT_PATH = pipeline_cfg["experiment"]["artifact_path"]

    FEATURE_COLS = pipeline_cfg["data"]["features"]
    LABEL_COL = pipeline_cfg["data"]["label"]

    # Keep only the last dotted component of the table identifier
    # ("catalog.schema.table" -> "table"); a name without dots is used as-is.
    # NOTE(review): this discards any catalog/schema qualifier, so the table is
    # resolved against whatever catalog/schema is current at read time -
    # confirm that this is intended.
    input_table = pipeline_cfg["data"]["input_table"]
    DELTA_TABLE_NAME = input_table.rsplit(".", 1)[-1]

    TEST_SIZE = pipeline_cfg["data"]["test_size"]
    RANDOM_STATE = pipeline_cfg["data"]["random_state"]

    print("‚úÖ Pipeline configuration loaded successfully!")
    print("\nüìä Configuration Details:")
    print(f"   Model Type: {MODEL_TYPE.upper()}")
    print(f"   Experiment: {EXPERIMENT_NAME}")
    print(f"   Artifact Path: {MODEL_ARTIFACT_PATH}")
    print(f"   Delta Table: {DELTA_TABLE_NAME}")
    print(f"   Features: {FEATURE_COLS}")
    print(f"   Label: {LABEL_COL}")
    print(f"   Test Size: {TEST_SIZE}")
    print(f"   Random State: {RANDOM_STATE}")

except FileNotFoundError:
    print("‚ùå ERROR: pipeline_config.yml not found!")
    print("üí° Please create pipeline_config.yml in the same directory")
    raise
except Exception as e:
    # Covers malformed YAML as well as missing keys / wrong value types;
    # the error is reported and then propagated so the run stops here.
    print(f"‚ùå ERROR loading configuration: {e}")
    raise

print("=" * 80)

# ==================================================================================
# ✅ LOAD EXPERIMENT CONFIGURATIONS (YOUR EXISTING config.yml - NO CHANGE!)
# ==================================================================================
def load_config(path="config.yml"):
    """Load the experiment hyperparameter configurations from a YAML file.

    Args:
        path: Location of the YAML file. The document must have a top-level
            ``experiments`` sequence whose entries each carry a ``name`` key.

    Returns:
        The parsed YAML document, including its ``experiments`` entry.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Any other failure (malformed YAML, missing keys, ...) is
            reported on stdout and re-raised unchanged.
    """
    print(f"\nüìÑ Loading experiment configurations from: {path}")
    try:
        # Explicit UTF-8 so parsing does not depend on the locale default.
        with open(path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        experiments = config["experiments"]
        print(f"‚úÖ Found {len(experiments)} experiment configuration(s):")

        for i, exp in enumerate(experiments, 1):
            print(f"   {i}. {exp['name']}")

        return config

    except FileNotFoundError:
        print(f"‚ùå ERROR: {path} not found!")
        # Name the file that was actually requested rather than the default.
        print(f"üí° Please create {path} with experiment configurations")
        raise
    except Exception as e:
        print(f"‚ùå ERROR loading experiments: {e}")
        raise

# ==================================================================================
# ✅ READ DATA FROM DELTA
# ==================================================================================
def load_data(spark):
    """Fetch the training columns from the configured Delta table.

    Reads the table named by DELTA_TABLE_NAME, keeps only the FEATURE_COLS and
    LABEL_COL columns, and converts the result to pandas.

    Args:
        spark: Spark session object used to read the table.

    Returns:
        A ``(X, y)`` tuple: the FEATURE_COLS selection and the LABEL_COL
        selection of the resulting pandas frame.

    Raises:
        Exception: Any failure while reading, converting or summarising the
            data is reported on stdout and re-raised unchanged.
    """
    print(f"\nüì¶ Loading data from Delta table: {DELTA_TABLE_NAME}")

    try:
        wanted_columns = [*FEATURE_COLS, LABEL_COL]
        delta_reader = spark.read.format("delta")
        source_table = delta_reader.table(DELTA_TABLE_NAME)
        # toPandas() pulls the selected columns into a local pandas frame.
        frame = source_table.select(*wanted_columns).toPandas()

        features, target = frame[FEATURE_COLS], frame[LABEL_COL]

        summary_lines = (
            f"‚úÖ Data loaded successfully!",
            f"   Total rows: {len(frame):,}",
            f"   Features shape: {features.shape}",
            f"   Label shape: {target.shape}",
        )
        for line in summary_lines:
            print(line)

        return features, target

    except Exception as err:
        print(f"‚ùå Failed to load data from table '{DELTA_TABLE_NAME}': {err}")
        print(f"üí° Make sure the Delta table exists and contains required columns")
        raise

# ==================================================================================
# ✅ TRAIN ONE EXPERIMENT RUN
# ==================================================================================
def train_single_run(X, y, params, run_name):
    """Train, evaluate and log one model configuration as an MLflow run.

    The data is split with TEST_SIZE / RANDOM_STATE, an XGBRegressor is fitted
    on the training part, the RMSE on the held-out part is logged as
    ``test_rmse`` and the fitted model is logged under MODEL_ARTIFACT_PATH.

    Args:
        X: Feature data.
        y: Target data.
        params: Model hyperparameters from config.yml (mapping of XGBRegressor
            keyword arguments). ``None`` or an empty mapping means "use the
            defaults". Entries here take precedence over the built-in defaults
            (``objective``, ``random_state``, ``n_jobs``).
        run_name: Name for this MLflow run.

    Returns:
        run_id: MLflow run ID.
        rmse: Test RMSE score as a plain Python float.

    Raises:
        Exception: Errors from splitting, training, evaluation or MLflow
            logging propagate to the caller.
    """
    # Tolerate an experiment entry whose `params` is empty/None in the YAML.
    params = dict(params or {})

    print(f"\n{'='*70}")
    print(f"üîÅ Training: {run_name}")
    print(f"{'='*70}")
    print(f"Parameters: {params}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    print(f"   Train size: {len(X_train):,} samples")
    print(f"   Test size: {len(X_test):,} samples")

    # Defaults first, config second: a config that sets objective, random_state
    # or n_jobs overrides the default instead of raising
    # "got multiple values for keyword argument".
    model_kwargs = {
        "objective": 'reg:squarederror',
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        **params,
    }

    with mlflow.start_run(run_name=run_name) as run:
        run_id = run.info.run_id

        print(f"   MLflow Run ID: {run_id}")

        # Log parameters
        mlflow.log_param("model_type", MODEL_TYPE)
        for k, v in params.items():
            mlflow.log_param(k, v)

        # Train model
        # NOTE(review): MODEL_TYPE is only logged/printed; the estimator built
        # here is always XGBRegressor - confirm that is intended.
        print(f"   üèãÔ∏è Training {MODEL_TYPE.upper()} model...")

        model = XGBRegressor(**model_kwargs)

        model.fit(X_train, y_train)
        print(f"   ‚úì Training complete")

        # Evaluate
        print(f"   üìä Evaluating on test set...")
        preds = model.predict(X_test)
        # float(): hand callers a plain Python float rather than a NumPy scalar.
        rmse = float(np.sqrt(mean_squared_error(y_test, preds)))

        mlflow.log_metric("test_rmse", rmse)
        print(f"   ‚úÖ Test RMSE: {rmse:.4f}")

        # Signature (input schema from the training features, output schema
        # from the model's predictions on them)
        signature = infer_signature(X_train, model.predict(X_train))

        # Log the model (artifact_path from pipeline_config.yml)
        print(f"   üíæ Logging model to artifact path: {MODEL_ARTIFACT_PATH}")
        mlflow.xgboost.log_model(
            model,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=signature
        )

        print(f"   ‚úÖ Run '{run_name}' completed successfully!")

        return run_id, rmse

# ==================================================================================
# ✅ MAIN EXECUTION
# ==================================================================================
# Script entry point: set up MLflow and Spark, load the data and the experiment
# list, train every experiment, print a ranked summary and (best effort)
# publish a few task values for a surrounding workflow.
if __name__ == "__main__":

    print("\nüîß Step 1: Initializing MLflow...")
    try:
        mlflow.set_tracking_uri("databricks")
        mlflow.set_registry_uri("databricks-uc")
        mlflow.set_experiment(EXPERIMENT_NAME)
        print(f"‚úÖ MLflow experiment set: {EXPERIMENT_NAME}")
    except Exception as e:
        print(f"‚ùå Failed to initialize MLflow: {e}")
        raise

    print("\nüîß Step 2: Initializing Spark...")
    try:
        spark = SparkSession.builder.appName("ConfigDrivenTraining").getOrCreate()
        print("‚úÖ Spark session created")
    except Exception as e:
        print(f"‚ùå Failed to initialize Spark: {e}")
        raise

    print("\nüîß Step 3: Loading training data...")
    X, y = load_data(spark)

    print("\nüîß Step 4: Loading experiment configurations...")
    config = load_config()
    experiments = config["experiments"]
    total_experiments = len(experiments)

    print("\n" + "=" * 80)
    print("üöÄ STARTING TRAINING RUNS")
    print("=" * 80)

    # Store results of the runs that finished without raising
    run_results = []

    # Train all experiments; one failing experiment must not stop the others.
    for idx, exp in enumerate(experiments, 1):
        name = exp["name"]
        # An entry with a missing or empty `params` trains with defaults
        # instead of aborting the whole loop with a KeyError.
        params = exp.get("params") or {}

        print(f"\n[{idx}/{total_experiments}] Running experiment: {name}")

        try:
            run_id, rmse = train_single_run(X, y, params, run_name=name)
            run_results.append({
                'name': name,
                'run_id': run_id,
                'rmse': rmse,
                'params': params
            })
        except Exception as e:
            print(f"‚ùå Failed to train {name}: {e}")
            print(f"   Continuing with next experiment...")
            continue

    # Display summary
    print("\n" + "=" * 80)
    print("‚úÖ‚úÖ‚úÖ ALL TRAINING RUNS COMPLETED ‚úÖ‚úÖ‚úÖ")
    print("=" * 80)

    best = None
    if run_results:
        print(f"\nüìä Training Results Summary ({len(run_results)} successful runs):")
        print(f"{'Rank':<6} {'Experiment Name':<40} {'RMSE':<15} {'Run ID':<40}")
        print("-" * 101)

        # Sort by RMSE (best first)
        sorted_results = sorted(run_results, key=lambda x: x['rmse'])

        for rank, result in enumerate(sorted_results, 1):
            marker = "üèÜ" if rank == 1 else f"{rank}."
            print(
                f"{marker:<6} {result['name']:<40} "
                f"{result['rmse']:<15.4f} {result['run_id']}"
            )

        # Highlight best model
        best = sorted_results[0]
        print("\n" + "=" * 80)
        print("üèÜ BEST MODEL FROM THIS TRAINING SESSION")
        print("=" * 80)
        print(f"   Name: {best['name']}")
        print(f"   RMSE: {best['rmse']:.4f}")
        print(f"   Run ID: {best['run_id']}")
        print(f"   Parameters:")
        for k, v in best['params'].items():
            print(f"      {k}: {v}")
        print("=" * 80)

    else:
        print("\n‚ö†Ô∏è No successful training runs completed")
        print("üí° Check errors above and fix configuration")

    print("\nüìå Next Steps:")
    print("   1. Run model_evaluation_final_fixed.py to evaluate ALL models in experiment")
    print("   2. Best model will be automatically selected by metrics")
    print("   3. If approved, it will be registered to Unity Catalog")
    print("   4. Then UAT ‚Üí Production pipeline will execute")

    print("\nüí° Note:")
    print(f"   All {len(run_results)} models are now logged to experiment: {EXPERIMENT_NAME}")
    print("   Evaluation script will compare ALL models (including previous runs)")
    print("   and select the absolute best one based on test_rmse")

    print("\n" + "=" * 80)

    # Save metadata for workflow (optional, best effort)
    try:
        dbutils.jobs.taskValues.set(key="model_type", value=MODEL_TYPE)
        dbutils.jobs.taskValues.set(key="experiment_name", value=EXPERIMENT_NAME)
        dbutils.jobs.taskValues.set(key="num_experiments", value=len(run_results))
        if best is not None:
            # float(): a NumPy scalar may not be accepted as a task value.
            dbutils.jobs.taskValues.set(key="best_rmse", value=float(best['rmse']))
        print("‚úÖ Task values saved for workflow")
    except NameError:
        # `dbutils` is not imported in this file; if the name is undefined the
        # environment did not provide it, so there is no workflow to report to.
        print("‚ÑπÔ∏è Not running in Databricks workflow - skipping task values")
    except Exception as e:
        # Still non-fatal, but report the real cause instead of hiding it.
        print(f"‚ö†Ô∏è Could not save task values: {e}")

    # NOTE(review): the script still exits normally when every run failed;
    # only the closing message reflects it. Confirm whether the job should fail.
    if run_results:
        print("\nüéâ Training pipeline completed successfully!")
    else:
        print("\n‚ö†Ô∏è Training pipeline finished without any successful run")
    print("=" * 80)