In [None]:
# Databricks notebook source
# model_train.py
import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from mlflow.models.signature import infer_signature  # for the model signature
import os
import sys
import warnings

# Ignore FutureWarning and UserWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)

# ==================== CONFIGURATION ====================
# MLflow experiment, artifact path used when logging the model, and the
# table that get_data_for_training() reads.
EXPERIMENT_NAME = "/Shared/House_Price_Prediction_Delta_RF"
MODEL_ARTIFACT_PATH = "sklearn_rf_model"
DELTA_TABLE_NAME = "house_price_delta"

# Model and data configuration.
RANDOM_STATE = 42
TEST_SIZE = 0.2
# Must stay a list: it is used directly for DataFrame column selection.
FEATURE_COLS = [
    "sq_feet",
    "num_bedrooms",
    "num_bathrooms",
    "year_built",
    "location_score",
]
LABEL_COL = "price"

# Parameter grid explored during cross-validation.
PARAM_GRID = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 12, 15],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
CV_FOLDS = 3

# ==================== FUNCTIONS ====================

def setup_mlflow_experiment():
    """Configure MLflow tracking and registry for Databricks UC, then select the experiment.

    The tracking and registry URIs are only set when the
    DATABRICKS_RUNTIME_VERSION environment variable is present. An
    ``Exception`` raised in either step is caught and reported with
    ``print``; it is not re-raised.
    """
    running_on_databricks = "DATABRICKS_RUNTIME_VERSION" in os.environ
    if running_on_databricks:
        try:
            mlflow.set_tracking_uri("databricks")
            # This registry URI is required for UC registration.
            mlflow.set_registry_uri("databricks-uc")
            print("‚úì MLflow configured for Databricks UC (Tracking & Registry).")
        except Exception as uri_error:
            # If configuration fails, warn but keep going.
            print(f"‚ö† Warning: MLflow Registry setup failed with: {uri_error}")

    try:
        mlflow.set_experiment(EXPERIMENT_NAME)
        print(f"‚úì MLflow Experiment set to: {EXPERIMENT_NAME}")
    except Exception as experiment_error:
        # Reported only; execution continues after this function returns.
        print(f"‚ùå Critical: MLflow Experiment setup failed! Error: {experiment_error}")


def get_data_for_training(spark: SparkSession, table_name: str):
    """Load training data from a Delta table and convert it to pandas.

    Args:
        spark: SparkSession used to read the table.
        table_name: Name of the table to load.

    Returns:
        A ``(X, y)`` pair: the FEATURE_COLS columns and the LABEL_COL column
        of the resulting pandas DataFrame. If any ``Exception`` occurs while
        loading, the error is printed and ``(None, None)`` is returned.
    """
    print(f"üíæ Loading data from Delta Table: {table_name}")
    try:
        # Load the table as a Spark DataFrame.
        source_df = spark.read.format("delta").table(table_name)

        # Keep only the required columns, then collect to a pandas DataFrame.
        wanted_columns = [*FEATURE_COLS, col(LABEL_COL)]
        pandas_df = source_df.select(*wanted_columns).toPandas()

        row_count = len(pandas_df)
        print(f"‚úì Data loaded. Total rows: {row_count}")

        return pandas_df[FEATURE_COLS], pandas_df[LABEL_COL]

    except Exception as load_error:
        print(f"‚ùå Error loading data from Delta: {load_error}")
        return None, None


def train_and_log_model(X, y):
    """
    Train a RandomForestRegressor with grid-search cross-validation and log it to MLflow.

    The data is split into train/test partitions (TEST_SIZE, RANDOM_STATE).
    GridSearchCV searches PARAM_GRID with CV_FOLDS folds on the training
    partition, and the resulting best estimator is evaluated on the test
    partition. The best parameters, the CV RMSE, the test RMSE / R2 and the
    model itself (with an inferred signature, needed for UC registration) are
    logged to a new MLflow run. The model is logged but not registered.

    Args:
        X: Feature data, as returned by get_data_for_training().
        y: Target values aligned with X.

    Returns:
        The MLflow run ID of the training run, or None when the input is
        missing, cannot be split, or leaves fewer training rows than
        CV_FOLDS. In those cases a message is printed and no MLflow run is
        started, so the caller's falsy check can handle the failure.
    """
    if X is None or y is None:
        print("Error: no training data was provided; skipping training.")
        return None

    # 1. Split the data. train_test_split raises ValueError for empty or
    #    mismatched inputs; report that instead of crashing with a traceback.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )
    except ValueError as err:
        print(f"Error: could not split data into train/test sets: {err}")
        return None

    # K-fold CV needs at least one training row per fold. Check this before
    # opening a run so a doomed training does not leave a failed run behind.
    if len(X_train) < CV_FOLDS:
        print(
            f"Error: only {len(X_train)} training rows available; "
            f"at least {CV_FOLDS} are needed for {CV_FOLDS}-fold cross-validation."
        )
        return None

    with mlflow.start_run(run_name="RandomForest_House_Price_Model_CV") as run:
        run_id = run.info.run_id
        print(f"üöÄ MLflow Run Started with ID: {run_id}")

        # 2. Cross-validation setup
        print(f"üîç Starting GridSearchCV with {CV_FOLDS}-fold cross-validation...")
        print(f"üìä Parameter Grid: {PARAM_GRID}")

        base_model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)

        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=PARAM_GRID,
            cv=CV_FOLDS,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )

        # 3. Train the model with cross-validation
        grid_search.fit(X_train, y_train)
        print("‚úì GridSearchCV completed successfully.")

        # 4. Log the best parameters (single batched call)
        best_params = grid_search.best_params_
        print(f"‚úì Best Parameters found: {best_params}")
        mlflow.log_params(
            {f"best_{param_name}": param_value
             for param_name, param_value in best_params.items()}
        )

        # Also log the CV score. best_score_ is a negated MSE because of the
        # 'neg_mean_squared_error' scoring, so flip the sign before the root.
        best_cv_rmse = np.sqrt(-grid_search.best_score_)
        mlflow.log_metric("best_cv_rmse", best_cv_rmse)
        print(f"‚úì Best CV RMSE: {best_cv_rmse:.2f}")

        # 5. Evaluate the best model on the test set
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)

        mlflow.log_metric("test_rmse", rmse)
        mlflow.log_metric("test_r2_score", r2)
        print(f"‚úì Test Metrics Logged: RMSE={rmse:.2f}, R2={r2:.4f}")

        # 6. Create the model signature (mandatory for UC)
        model_signature = infer_signature(
            X_train,
            best_model.predict(X_train)
        )
        print("‚úì Model Signature created successfully.")

        # 7. Log the best model (including the signature); registration is
        #    deliberately skipped here (registered_model_name=None).
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=model_signature,
            registered_model_name=None
        )
        print(f"‚úì Best model logged with signature to artifact path: {MODEL_ARTIFACT_PATH}")

        return run_id

# ==================== EXECUTION ====================

if __name__ == "__main__":
    # Initialize or obtain the SparkSession; abort if it is unavailable.
    try:
        spark = SparkSession.builder.appName("ModelTrain").getOrCreate()
    except Exception as err:
        print(f"‚ùå SparkSession creation failed: {err}")
        sys.exit(1)

    setup_mlflow_experiment()

    # Load the data; a None feature set signals a loading failure.
    X, y = get_data_for_training(spark, DELTA_TABLE_NAME)
    if X is None:
        sys.exit(1)

    # Train and log the model; a falsy run ID is treated as failure.
    training_run_id = train_and_log_model(X, y)
    if not training_run_id:
        sys.exit(1)

    print("\n" + "=" * 60)
    print(f"‚úÖ TRAINING & LOGGING COMPLETE! New Run ID: {training_run_id}")
    print(f"‡§Ö‡§ó‡§≤‡§æ ‡§ï‡§¶‡§Æ: 'model_register.py' ‡§ï‡•ã ‡§á‡§∏ Run ID ‡§ï‡•á ‡§∏‡§æ‡§• ‡§ö‡§≤‡§æ‡§è‡§Å‡•§")
    print("=" * 60)