# In [None]:
# model_train.py
import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from mlflow.models.signature import infer_signature
import os
import sys
import warnings

# Ignore FutureWarning and UserWarning messages
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)

# ==================== CONFIGURATION ====================
# MLflow experiment path and the artifact path the model is logged under.
EXPERIMENT_NAME = "/Shared/House_Price_Prediction_Delta_RF"
MODEL_ARTIFACT_PATH = "sklearn_rf_model"

# Source Delta table: the scaled table is the one to use.
DELTA_TABLE_NAME = "house_price_scaled_delta"

# Model and data configuration.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Feature columns (the scaled variants).
FEATURE_COLS = [
    "sq_feet_scaled",
    "num_bedrooms_scaled",
    "num_bathrooms_scaled",
    "year_built_scaled",
    "location_score_scaled",
]

# Name of the label column as it exists after the scaling script has run.
LABEL_COL = "label"

# Parameter grid searched during cross-validation.
PARAM_GRID = {
    "n_estimators": [100, 150, 200],
    "max_depth": [10, 12, 15],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}
CV_FOLDS = 3

# ==================== FUNCTIONS ====================

def setup_mlflow_experiment():
    """Configure MLflow tracking/registry for Databricks UC and pick the experiment.

    When the ``DATABRICKS_RUNTIME_VERSION`` environment variable is present,
    the tracking URI is set to ``databricks`` and the registry URI to
    ``databricks-uc``. Afterwards ``EXPERIMENT_NAME`` is made the active
    experiment. A failure in either step is printed rather than raised, so
    the function always returns ``None``.
    """
    running_on_databricks = "DATABRICKS_RUNTIME_VERSION" in os.environ

    if running_on_databricks:
        try:
            mlflow.set_tracking_uri("databricks")
            mlflow.set_registry_uri("databricks-uc")
            print("‚úì MLflow configured for Databricks UC (Tracking & Registry).")
        except Exception as e:
            print(f"‚ö† Warning: MLflow Registry setup failed with: {e}")

    try:
        mlflow.set_experiment(EXPERIMENT_NAME)
        print(f"‚úì MLflow Experiment set to: {EXPERIMENT_NAME}")
    except Exception as e:
        # Best effort: the failure is reported and execution continues.
        print(f"‚ùå Critical: MLflow Experiment setup failed! Error: {e}")


def get_data_for_training(spark: SparkSession, table_name: str):
    """Load the scaled training data from a Delta table into pandas objects.

    Only the ``FEATURE_COLS`` columns and the ``LABEL_COL`` column are
    selected before the Spark DataFrame is converted with ``toPandas()``.

    Args:
        spark: Spark session used to read the table.
        table_name: Name of the Delta table to read.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the pandas DataFrame of feature
        columns and ``y`` is the label Series. ``(None, None)`` is returned
        when the table cannot be read or holds no rows; the problem is
        printed instead of being raised.
    """
    print(f"üíæ Loading data from Delta Table: {table_name}")
    try:
        df_spark = spark.read.format("delta").table(table_name)

        # Select the scaled feature columns plus the label, then convert.
        df_pd = df_spark.select(*FEATURE_COLS, col(LABEL_COL)).toPandas()

        # An empty table is a load failure: report it here instead of letting
        # the caller hand zero rows to the train/test split.
        if df_pd.empty:
            print(
                f"‚ùå Error loading data from Delta: "
                f"table '{table_name}' returned no rows"
            )
            return None, None

        print(f"‚úì Data loaded from scaled Delta table. Total rows: {len(df_pd)}")

        X = df_pd[FEATURE_COLS]
        y = df_pd[LABEL_COL]

        return X, y

    except Exception as e:
        print(f"‚ùå Error loading data from Delta: {e}")
        return None, None


def train_and_log_model(X, y):
    """Train the model with cross-validation and log the outcome to MLflow.

    The data is split into train and test parts using ``TEST_SIZE`` and
    ``RANDOM_STATE``. A ``GridSearchCV`` over ``PARAM_GRID`` is fitted on the
    training part inside a new MLflow run. The best parameters, the CV score,
    the test-set RMSE / R2 and the best estimator (with an inferred
    signature) are logged to that run.

    Args:
        X: Feature data.
        y: Label values matching ``X``.

    Returns:
        The run ID of the MLflow run created for this training.
    """
    split_parts = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    X_train, X_test, y_train, y_test = split_parts

    with mlflow.start_run(run_name="RandomForest_House_Price_Model_CV") as run:
        run_id = run.info.run_id
        print(f"üöÄ MLflow Run Started with ID: {run_id}")

        grid_search = GridSearchCV(
            estimator=RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
            param_grid=PARAM_GRID,
            cv=CV_FOLDS,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)
        print("‚úì GridSearchCV completed successfully.")

        # Record the winning hyper-parameters under a "best_" prefix.
        for param_name, param_value in grid_search.best_params_.items():
            mlflow.log_param(f"best_{param_name}", param_value)

        # The scorer is the negated MSE, so flip the sign before the root.
        cv_mse = -grid_search.best_score_
        mlflow.log_metric("best_cv_rmse", np.sqrt(cv_mse))

        # Evaluate the refitted best estimator on the held-out split.
        tuned_model = grid_search.best_estimator_
        test_predictions = tuned_model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
        r2 = r2_score(y_test, test_predictions)

        mlflow.log_metric("test_rmse", rmse)
        mlflow.log_metric("test_r2_score", r2)
        print(f"‚úì Test Metrics Logged: RMSE={rmse:.2f}, R2={r2:.4f}")

        # The signature is inferred from the training inputs and their predictions.
        train_predictions = tuned_model.predict(X_train)
        mlflow.sklearn.log_model(
            sk_model=tuned_model,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=infer_signature(X_train, train_predictions),
            registered_model_name=None,
        )
        print(f"‚úì Best model logged with signature to artifact path: {MODEL_ARTIFACT_PATH}")

    return run_id


# ==================== EXECUTION ====================

if __name__ == "__main__":
    # Script entry point: create the Spark session, configure MLflow, load
    # the data, train, and exit with status 1 as soon as any step fails.
    try:
        spark = SparkSession.builder.appName("ModelTrain").getOrCreate()
    except Exception as e:
        print(f"‚ùå SparkSession creation failed: {e}")
        sys.exit(1)

    setup_mlflow_experiment()

    X, y = get_data_for_training(spark, DELTA_TABLE_NAME)
    if X is None:
        # Loading failed; the loader has already printed the reason.
        sys.exit(1)

    training_run_id = train_and_log_model(X, y)
    if not training_run_id:
        sys.exit(1)

    separator = "=" * 60
    print("\n" + separator)
    print(f"‚úÖ TRAINING & LOGGING COMPLETE! New Run ID: {training_run_id}")
    print(f"‡§Ö‡§ó‡§≤‡§æ ‡§ï‡§¶‡§Æ: 'model_register.py' ‡§ï‡•ã ‡§á‡§∏ Run ID ‡§ï‡•á ‡§∏‡§æ‡§• ‡§ö‡§≤‡§æ‡§è‡§Å‡•§")
    print(separator)
