In [None]:
# model_train.py
import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from mlflow.models.signature import infer_signature # ‡§Æ‡•â‡§°‡§≤ ‡§∏‡§ø‡§ó‡•ç‡§®‡•á‡§ö‡§∞ ‡§ï‡•á ‡§≤‡§ø‡§è
import os
import sys
import warnings

# Ignore warnings of the FutureWarning and UserWarning categories.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# ==================== CONFIGURATION ====================
# Experiment name handed to mlflow.set_experiment().
EXPERIMENT_NAME = "/Shared/House_Price_Prediction_Delta_RF"
# artifact_path under which the trained model is logged in the MLflow run.
MODEL_ARTIFACT_PATH = "sklearn_rf_model"
# Name of the Delta table the training data is read from.
DELTA_TABLE_NAME = "house_price_delta" 

# Model and data configuration
# n_estimators / max_depth passed to RandomForestRegressor.
N_ESTIMATORS = 150
MAX_DEPTH = 12
# Seed shared by train_test_split and RandomForestRegressor.
RANDOM_STATE = 42
# test_size passed to train_test_split.
TEST_SIZE = 0.2
# Columns selected from the table as model inputs.
FEATURE_COLS = ['sq_feet', 'num_bedrooms', 'num_bathrooms', 'year_built', 'location_score']
# Column selected from the table as the regression target.
LABEL_COL = 'price'

# ==================== FUNCTIONS ====================

def setup_mlflow_experiment():
    """Configure MLflow tracking/registry for Databricks UC and select the experiment.

    When the ``DATABRICKS_RUNTIME_VERSION`` environment variable is present,
    the tracking URI is set to ``"databricks"`` and the registry URI to
    ``"databricks-uc"``. The experiment named by ``EXPERIMENT_NAME`` is then
    selected whether or not that variable is present. A failure in either
    step is reported with ``print`` instead of being re-raised, and the
    function returns ``None``.
    """
    running_on_databricks = "DATABRICKS_RUNTIME_VERSION" in os.environ
    if running_on_databricks:
        try:
            mlflow.set_tracking_uri("databricks")
            # Required for Unity Catalog (UC) model registration.
            mlflow.set_registry_uri("databricks-uc")
            print("‚úì MLflow configured for Databricks UC (Tracking & Registry).")
        except Exception as err:
            # Best effort: warn and carry on if the URIs cannot be configured.
            print(f"‚ö† Warning: MLflow Registry setup failed with: {err}")

    try:
        mlflow.set_experiment(EXPERIMENT_NAME)
        print(f"‚úì MLflow Experiment set to: {EXPERIMENT_NAME}")
    except Exception as err:
        # Reported only; execution continues in the caller after this message.
        print(f"‚ùå Critical: MLflow Experiment setup failed! Error: {err}")


def get_data_for_training(spark: SparkSession, table_name: str):
    """Load the training data from a Delta table as pandas objects.

    Reads ``table_name`` through ``spark``, keeps only ``FEATURE_COLS`` and
    ``LABEL_COL``, and converts the result to a pandas DataFrame.

    Args:
        spark: Session used to read the table.
        table_name: Name of the Delta table to read.

    Returns:
        ``(X, y)`` where ``X`` holds the ``FEATURE_COLS`` columns and ``y``
        the ``LABEL_COL`` column. If anything in the load raises, the error
        is printed and ``(None, None)`` is returned instead.
    """
    print(f"üíæ Loading data from Delta Table: {table_name}")
    try:
        # Read the Delta table and project it down to the columns we train on.
        projected = (
            spark.read.format("delta")
            .table(table_name)
            .select(*FEATURE_COLS, col(LABEL_COL))
        )

        # Convert the Spark result to a pandas DataFrame.
        frame = projected.toPandas()
        print(f"‚úì Data loaded. Total rows: {len(frame)}")

        return frame[FEATURE_COLS], frame[LABEL_COL]
    except Exception as err:
        # Callers detect failure by checking the first element for None.
        print(f"‚ùå Error loading data from Delta: {err}")
        return None, None


def train_and_log_model(X, y):
    """
    Train a random-forest regressor and record the run in MLflow.

    Splits ``X``/``y`` into training and hold-out sets, fits a
    ``RandomForestRegressor``, scores it on the hold-out set, and logs the
    hyperparameters, metrics and model (with an inferred signature) to a
    new MLflow run. The model is only logged, not registered
    (``registered_model_name=None``).

    Args:
        X: Feature table accepted by ``train_test_split`` and ``model.fit``.
        y: Target values aligned row-for-row with ``X``.

    Returns:
        The ID of the MLflow run holding the params, metrics and model.

    No exceptions are caught here; errors from scikit-learn or MLflow
    propagate to the caller.
    """

    # 1. Hold out TEST_SIZE of the rows for evaluation, with a seeded split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    with mlflow.start_run(run_name="RandomForest_House_Price_Model") as run:
        run_id = run.info.run_id
        print(f"üöÄ MLflow Run Started with ID: {run_id}")

        # 2. Log every setting that influences the result, including the seed
        #    and split ratio, so the run can be reproduced from tracking data.
        mlflow.log_params(
            {
                "n_estimators": N_ESTIMATORS,
                "max_depth": MAX_DEPTH,
                "random_state": RANDOM_STATE,
                "test_size": TEST_SIZE,
            }
        )

        # 3. Train the model (n_jobs=-1 is passed through to scikit-learn).
        model = RandomForestRegressor(
            n_estimators=N_ESTIMATORS,
            max_depth=MAX_DEPTH,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
        model.fit(X_train, y_train)
        print("‚úì Model trained successfully.")

        # 4. Predictions and metrics on the hold-out set.
        predictions = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)

        mlflow.log_metrics({"rmse": rmse, "r2_score": r2})
        print(f"‚úì Metrics Logged: RMSE={rmse:.2f}, R2={r2:.4f}")

        # 5. Build the model signature (the original author notes it is
        #    mandatory for UC). The hold-out features and the predictions
        #    already computed carry the same column/output schema as the
        #    training data, so the training set is not scored a second time.
        #    Arguments are positional: per the original note, older MLflow
        #    releases raise TypeError on the 'df=' keyword.
        model_signature = infer_signature(X_test, predictions)
        print("‚úì Model Signature created successfully.")

        # 6. Log the model together with its signature.
        #    registered_model_name=None keeps registration a separate step.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=model_signature,
            registered_model_name=None,
        )
        print(f"‚úì Model logged with required signature to artifact path: {MODEL_ARTIFACT_PATH}")

        return run_id

# ==================== EXECUTION ====================

if __name__ == "__main__":
    # Script entry point: get a Spark session, configure MLflow, load the
    # data, then train and log the model. Every failure path exits with
    # status 1.
    try:
        # Create a SparkSession, or reuse the one that already exists.
        spark = SparkSession.builder.appName("ModelTrain").getOrCreate()
    except Exception as exc:
        print(f"‚ùå SparkSession creation failed: {exc}")
        sys.exit(1)

    setup_mlflow_experiment()

    # Load the data; the loader signals failure by returning None for X.
    X, y = get_data_for_training(spark, DELTA_TABLE_NAME)
    if X is None:
        sys.exit(1)

    # Train and log the model; a falsy run ID is treated as failure.
    training_run_id = train_and_log_model(X, y)
    if not training_run_id:
        sys.exit(1)

    print("\n" + "=" * 60)
    print(f"‚úÖ TRAINING & LOGGING COMPLETE! New Run ID: {training_run_id}")
    print(f"‡§Ö‡§ó‡§≤‡§æ ‡§ï‡§¶‡§Æ: 'model_register.py' ‡§ï‡•ã ‡§á‡§∏ Run ID ‡§ï‡•á ‡§∏‡§æ‡§• ‡§ö‡§≤‡§æ‡§è‡§Å‡•§")
    print("=" * 60)