In [None]:
# training_script.py
%pip install xgboost
import mlflow
import yaml
import numpy as np
import pandas as pd
import warnings
import os
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

warnings.filterwarnings("ignore")

# ==========================================================
# ✅ MLflow FIXED SETTINGS (Matches registration script)
# ==========================================================
# MLflow experiment that every training run in this script is recorded under.
EXPERIMENT_NAME = "/Shared/House_Price_Prediction_Config_Runs"
# Artifact path the trained model is logged at inside each run; per the header
# above, the registration script relies on this same value.
MODEL_ARTIFACT_PATH = "xgboost_model"

# Columns selected from the Delta table: model inputs and the regression target.
FEATURE_COLS = ['sq_feet', 'num_bedrooms', 'num_bathrooms', 'year_built', 'location_score']
LABEL_COL = 'price'
# Delta table that load_data reads the training data from.
DELTA_TABLE_NAME = "house_price_delta"

# Fraction of rows held out for evaluation, and the seed shared by the
# train/test split and the XGBoost model.
TEST_SIZE = 0.2
RANDOM_STATE = 42


# ==========================================================
# ✅ Load experiment config (config.yml)
# ==========================================================
def load_config(path="config.yml"):
    """Load the experiment configuration from a YAML file.

    Args:
        path: Location of the YAML file. Defaults to ``config.yml``, resolved
            against the current working directory.

    Returns:
        The parsed document exactly as ``yaml.safe_load`` returns it
        (``None`` for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    print(f"📄 Loading config file: {path}")
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (which would garble non-ASCII run names or values).
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


# ==========================================================
# ✅ Read Data From Delta
# ==========================================================
def load_data(spark, table_name=None):
    """Read the feature and label columns of a Delta table into pandas.

    Args:
        spark: SparkSession used to read the table.
        table_name: Name of the table to read. When omitted, the module-level
            ``DELTA_TABLE_NAME`` is used.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a pandas DataFrame holding the
        ``FEATURE_COLS`` columns and ``y`` is the ``LABEL_COL`` Series.
    """
    # Resolved here rather than in the signature so the default always tracks
    # the current value of the module constant.
    if table_name is None:
        table_name = DELTA_TABLE_NAME

    df = spark.read.format("delta").table(table_name)
    # Select only the needed columns before collecting, so nothing else is
    # transferred to the driver by toPandas().
    df_pd = df.select(*FEATURE_COLS, LABEL_COL).toPandas()
    X = df_pd[FEATURE_COLS]
    y = df_pd[LABEL_COL]
    print(f"✅ Loaded {len(df_pd)} rows from Delta Table")
    return X, y


# ==========================================================
# ✅ Train One Run
# ==========================================================
def train_single_run(X, y, params, run_name):
    """Train, evaluate and log one XGBoost regressor as a single MLflow run.

    The data is split with ``TEST_SIZE`` / ``RANDOM_STATE``, the model is fit
    on the training part, and the RMSE on the held-out part is logged as the
    ``test_rmse`` metric. The fitted model is logged under
    ``MODEL_ARTIFACT_PATH`` together with an inferred signature.

    Args:
        X: Feature matrix (pandas DataFrame).
        y: Target values aligned with ``X``.
        params: Mapping of XGBRegressor keyword arguments taken from the
            config. May be empty or ``None``. Keys given here override the
            built-in defaults (``objective``, ``random_state``, ``n_jobs``).
        run_name: Display name of the MLflow run.

    Returns:
        A tuple ``(run_id, rmse)`` for the completed run.
    """
    user_params = dict(params or {})

    # Defaults go first so the config can override them. Passing them as
    # explicit keywords next to **params raised
    # "TypeError: got multiple values for keyword argument" as soon as the
    # config specified objective, random_state or n_jobs.
    model_params = {
        "objective": "reg:squarederror",
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        **user_params,
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    with mlflow.start_run(run_name=run_name) as run:
        run_id = run.info.run_id

        # Log the config-supplied parameters in one batched call.
        if user_params:
            mlflow.log_params(user_params)

        # Train model
        model = XGBRegressor(**model_params)
        model.fit(X_train, y_train)

        # Evaluate on the held-out split
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))

        mlflow.log_metric("test_rmse", rmse)
        print(f"✅ Run '{run_name}' → RMSE: {rmse:.3f}")

        # Signature describing the model's input columns and output
        signature = infer_signature(X_train, model.predict(X_train))

        # The artifact path must stay exactly MODEL_ARTIFACT_PATH: the
        # registration script looks the model up under that path.
        mlflow.xgboost.log_model(
            model,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=signature,
        )

        return run_id, rmse


# ==========================================================
# ✅ Main: Multiple runs from config file
# ==========================================================
if __name__ == "__main__":
    # Script entry point: run one MLflow training run per experiment listed
    # in the config file, then print a summary of all runs.
    print("🚀 Starting Training Pipeline...")

    # Load and validate the experiment configurations first: this is cheap
    # and local, so a missing or malformed config fails before any MLflow
    # call or Spark table read is made.
    config = load_config()
    experiments = config.get("experiments") if isinstance(config, dict) else None
    if not isinstance(experiments, list):
        raise ValueError(
            "Config must contain an 'experiments' list whose entries have "
            "'name' and 'params' keys."
        )

    # MLflow Setup
    mlflow.set_tracking_uri("databricks")
    mlflow.set_registry_uri("databricks-uc")
    mlflow.set_experiment(EXPERIMENT_NAME)

    # Spark
    spark = SparkSession.builder.appName("TrainingPipeline").getOrCreate()

    # Load Data
    X, y = load_data(spark)

    # List of (name, run_id, rmse) tuples, one per completed run
    run_results = []

    # Run all experiments
    for exp in experiments:
        name = exp["name"]
        # A missing or empty `params:` entry means "train with the defaults".
        params = exp.get("params") or {}
        print(f"\n🔁 Running Experiment: {name}")
        run_id, rmse = train_single_run(X, y, params, run_name=name)
        run_results.append((name, run_id, rmse))

    print("\n✅✅✅ ALL RUNS COMPLETED ✅✅✅")

    # Show summary
    for name, run_id, rmse in run_results:
        print(f"{name} → RMSE = {rmse:.4f} (run_id={run_id})")

    print("\n📌 Next Step: Run Model_Registration.ipynb to register best model.")
