In [0]:
# import mlflow
# import mlflow.sklearn
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# from mlflow.models.signature import infer_signature
# from mlflow.tracking import MlflowClient

# # Initialize MLflow client
# client = MlflowClient()

# # Load data from Delta table
# df = spark.read.format("delta").table("house_price_delta").toPandas()

# # Features and target
# X = df[["sq_feet", "num_bedrooms", "num_bathrooms", "year_built", "location_score"]]
# y = df["price"]

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# # Set experiment path (workspace-level)
# mlflow.set_experiment("/mlops_house_price_prediction_experiment")

# with mlflow.start_run() as run:
#     # Train model
#     model = RandomForestRegressor(n_estimators=100, random_state=42)
#     model.fit(X_train, y_train)

#     # Predictions
#     predictions = model.predict(X_test)

#     # Metrics
#     mse = mean_squared_error(y_test, predictions)
#     r2 = r2_score(y_test, predictions)

#     # Log params and metrics
#     mlflow.log_param("n_estimators", 100)
#     mlflow.log_metric("mse", mse)
#     mlflow.log_metric("r2_score", r2)

#     # Infer model signature
#     signature = infer_signature(X_train, predictions)
    
#     # ----------------------------------------------------
#     # सही Unity Catalog format का उपयोग करें
#     # ----------------------------------------------------
#     # Unity Catalog में मॉडल का नाम: catalog.schema.model_name
#     registered_model_name = "workspace.ml.house_price_model"
#     current_params = model.get_params()
    
#     should_log = True
#     try:
#         # नवीनतम मॉडल वर्जन प्राप्त करें
#         latest_versions = client.get_latest_versions(registered_model_name, stages=["None"])
#         if latest_versions:
#             latest_version = latest_versions[0]
            
#             # नवीनतम वर्जन के पैरामीटर प्राप्त करें
#             run_info = client.get_run(latest_version.run_id)
#             latest_params_dict = run_info.data.params
            
#             # यदि पैरामीटर समान हैं तो लॉग न करें
#             if all(latest_params_dict.get(key) == str(value) for key, value in current_params.items()):
#                 print("✅ Parameters are the same. Skipping new model version registration.")
#                 should_log = False
#         else:
#             print("No existing versions found. Registering the first version.")
            
#     except Exception as e:
#         # यदि कोई मॉडल मौजूद नहीं है, तो पहली बार लॉग करें
#         print(f"Model not found or error occurred: {e}")
#         print("Registering the first version of the model.")
        
#     if should_log:
#         try:
#             # मॉडल को यूनिटी कैटलॉग में लॉग और रजिस्टर करें
#             model_info = mlflow.sklearn.log_model(
#                 sk_model=model,
#                 artifact_path="house_price_model",
#                 signature=signature,
#                 registered_model_name=registered_model_name
#             )
#             print(f"✅ Model registered successfully: {model_info.model_uri}")
            
#         except Exception as reg_error:
#             print(f"❌ Error during model registration: {reg_error}")
#             # फिर भी मॉडल को लॉग करें (बिना registration के)
#             model_info = mlflow.sklearn.log_model(
#                 sk_model=model,
#                 artifact_path="house_price_model",
#                 signature=signature
#             )
#             print(f"✅ Model logged (without registration): {model_info.model_uri}")
#     else:
#         print("Model registration skipped due to identical parameters.")
        
#     print(f"Run ID: {run.info.run_id}")
#     print(f"MSE: {mse:.2f}")
#     print(f"R² Score: {r2:.2f}")

import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from pyspark.sql import SparkSession
import os

def setup_mlflow_config():
    """Configure MLflow to use the Unity Catalog model registry.

    Sets the registry URI to ``databricks-uc`` and, when the
    ``DATABRICKS_RUNTIME_VERSION`` environment variable is present, sets the
    tracking URI to ``databricks``.

    Returns:
        bool: True when every step completed, False when any step raised
        (the error is printed, not propagated).
    """
    configured = False
    try:
        # Send model-registry calls to Unity Catalog.
        mlflow.set_registry_uri("databricks-uc")

        # Only switch the tracking URI when the Databricks runtime marker is set.
        runtime_version = os.environ.get("DATABRICKS_RUNTIME_VERSION")
        if runtime_version is not None:
            mlflow.set_tracking_uri("databricks")

        print("✅ MLflow configuration setup completed")
        configured = True
    except Exception as config_error:
        print(f"⚠️ MLflow config setup failed: {config_error}")

    return configured

# Apply the registry/tracking configuration; the result gates model
# registration later in the script.
config_success = setup_mlflow_config()

# Create the MLflow client. If construction raises, `client` is set to None so
# later code can detect the failure and skip the registry lookups.
try:
    client = MlflowClient()
except Exception as client_error:
    print(f"❌ MLflow client initialization failed: {client_error}")
    client = None
else:
    print("✅ MLflow client initialized successfully")

# Get (or create) a Spark session and read the Delta table into pandas.
spark = SparkSession.builder.appName("ModelTraining").getOrCreate()
delta_reader = spark.read.format("delta")
df = delta_reader.table("house_price_delta").toPandas()
print(f"✅ Data loaded: {df.shape}")

# Feature columns fed to the model, and the target column it predicts.
feature_columns = [
    "sq_feet",
    "num_bedrooms",
    "num_bathrooms",
    "year_built",
    "location_score",
]
X = df[feature_columns]
y = df["price"]

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select the experiment for the upcoming run: try the /Shared path first and,
# if that call raises, fall back to the bare experiment name.
shared_experiment_path = "/Shared/mlops_house_price_prediction_experiment"
fallback_experiment_name = "mlops_house_price_prediction_experiment"

try:
    mlflow.set_experiment(shared_experiment_path)
    print("✅ Experiment set successfully")
except Exception as experiment_error:
    print(f"⚠️ Experiment setup warning: {experiment_error}")
    mlflow.set_experiment(fallback_experiment_name)

def _latest_registered_params(mlflow_client, model_name):
    """Return the params logged by the run behind the newest registered version.

    Versions are enumerated with ``search_model_versions`` instead of
    ``get_latest_versions``. The latter is stage-based, and stages are not
    supported for models in Unity Catalog, so with the ``databricks-uc``
    registry it raised every time and the duplicate-parameter check could
    never skip a registration.

    Args:
        mlflow_client: An ``MlflowClient`` used for registry and run lookups.
        model_name: Name of the registered model to inspect.

    Returns:
        The ``params`` mapping of the source run of the highest-numbered
        version, or ``None`` when the model has no versions.
    """
    versions = mlflow_client.search_model_versions(f"name='{model_name}'")
    if not versions:
        return None

    # Choose the newest version client-side; int() normalises the version
    # number regardless of whether it is returned as a string or an int.
    newest_number = max(int(mv.version) for mv in versions)

    # Fetch the full version record before reading its run_id.
    newest = mlflow_client.get_model_version(model_name, str(newest_number))
    return mlflow_client.get_run(newest.run_id).data.params


def _log_without_registration(fitted_model, model_signature):
    """Log the model as a run artifact only, without creating a registry version.

    Args:
        fitted_model: The trained scikit-learn estimator.
        model_signature: Signature produced by ``infer_signature``.

    Returns:
        The value returned by ``mlflow.sklearn.log_model``.
    """
    return mlflow.sklearn.log_model(
        sk_model=fitted_model,
        artifact_path="house_price_model",
        signature=model_signature,
    )


with mlflow.start_run() as run:
    print(f"🚀 MLflow run started: {run.info.run_id}")

    # Single source of truth for the hyperparameters: the same dict builds the
    # model, is logged to the run, and is compared against the previously
    # registered version, so the three can no longer drift apart.
    current_params = {"n_estimators": 100, "random_state": 42}

    # Train model
    model = RandomForestRegressor(**current_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out split
    predictions = model.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Log params and metrics
    mlflow.log_params(current_params)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2_score", r2)

    print(f"📊 Metrics - MSE: {mse:.2f}, R²: {r2:.4f}")

    # Infer the model signature from the inputs that actually produced
    # `predictions`, so the input and output examples correspond.
    signature = infer_signature(X_test, predictions)

    # Model registration logic
    if config_success and client:
        # Registered model name in Unity Catalog: catalog.schema.model_name
        registered_model_name = "workspace.ml.house_price_model"

        should_register = True

        try:
            # Compare against the newest registered version, if there is one.
            latest_params = _latest_registered_params(client, registered_model_name)

            if latest_params is not None:
                # Run params are stored as strings, hence the str() on our side.
                params_match = all(
                    latest_params.get(key) == str(value)
                    for key, value in current_params.items()
                )

                if params_match:
                    print("⏭️ Same parameters detected. Skipping registration.")
                    should_register = False

        except Exception as e:
            # Best-effort check: a missing model or any lookup error means we
            # go ahead and register.
            print(f"📝 First time registration (or error checking): {e}")
            should_register = True

        # Log the model, registering it when required
        try:
            if should_register:
                # Log and register the model in Unity Catalog
                model_info = mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="house_price_model",
                    signature=signature,
                    registered_model_name=registered_model_name,
                    metadata={"training_dataset": "house_price_delta"},
                )
                print(f"✅ Model registered in Unity Catalog: {model_info.model_uri}")

            else:
                # Log only; do not create a new registry version
                model_info = _log_without_registration(model, signature)
                print(f"📝 Model logged (without registration): {model_info.model_uri}")

        except Exception as reg_error:
            print(f"⚠️ Registration failed, trying without Unity Catalog: {reg_error}")

            # Fallback: log without registration
            try:
                model_info = _log_without_registration(model, signature)
                print(f"✅ Model logged successfully (fallback): {model_info.model_uri}")

            except Exception as log_error:
                print(f"❌ Model logging also failed: {log_error}")

    else:
        # Client creation or MLflow configuration failed earlier
        print("⚠️ Using fallback logging method...")
        try:
            model_info = _log_without_registration(model, signature)
            print(f"✅ Model logged (fallback method): {model_info.model_uri}")

        except Exception as fallback_error:
            print(f"❌ Fallback logging failed: {fallback_error}")

# Final report, printed after the MLflow run context has closed.
divider = "=" * 50
performance_label = "Good" if r2 > 0.7 else "Needs Improvement"

summary_lines = [
    "\n" + divider,
    "🎯 TRAINING SUMMARY",
    divider,
    f"Run ID: {run.info.run_id}",
    f"MSE: {mse:.2f}",
    f"R² Score: {r2:.4f}",
    f"Model Performance: {performance_label}",
    divider,
]
for summary_line in summary_lines:
    print(summary_line)
