# In [None]:
import mlflow
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import pickle # DataFrames ko serialize karne ke liye

# Configuration
# Experiment path handed to mlflow.set_experiment() in setup_mlflow().
MLFLOW_EXPERIMENT_PATH = "/Shared/mlops_house_price_prediction_experiment"
# Table name read through spark.read.format("delta").table(...) in
# run_data_preparation().
DELTA_TABLE_NAME = "workspace.default.house_price_delta"
# Artifact sub-path under which the pickled train/test splits are logged.
DATA_ARTIFACT_PATH = "prepared_data"

def setup_mlflow():
    """Configure the MLflow tracking/registry URIs and select the experiment.

    When the ``DATABRICKS_RUNTIME_VERSION`` environment variable is present,
    the tracking URI is set to ``"databricks"`` and the registry URI to
    ``"databricks-uc"`` (Unity Catalog). Afterwards the experiment at
    ``MLFLOW_EXPERIMENT_PATH`` is selected; if that raises, a warning is
    printed and the experiment named
    ``"mlops_house_price_prediction_experiment"`` is selected instead.
    """
    fallback_experiment = "mlops_house_price_prediction_experiment"

    # The runtime-version variable is used as the "running on Databricks" marker.
    if os.environ.get("DATABRICKS_RUNTIME_VERSION") is not None:
        mlflow.set_tracking_uri("databricks")
        # Register models through Unity Catalog.
        mlflow.set_registry_uri("databricks-uc")
        print("✅ MLflow set up for Databricks Unity Catalog.")

    try:
        mlflow.set_experiment(MLFLOW_EXPERIMENT_PATH)
        print(f"✅ Experiment set to: {MLFLOW_EXPERIMENT_PATH}")
    except Exception as err:
        # Best effort: report the problem and retry with the bare experiment name.
        print(f"⚠️ Could not set experiment path. Falling back to default: {err}")
        mlflow.set_experiment(fallback_experiment)

def run_data_preparation():
    """Load the house-price table, split it, and log the splits to MLflow.

    Steps performed:
      1. Configure MLflow via ``setup_mlflow()`` and get/create a Spark session.
      2. Read ``DELTA_TABLE_NAME`` and convert it to a pandas DataFrame.
      3. Select the feature columns and the ``price`` target.
      4. Split into train/test sets (``test_size=0.2``, ``random_state=42``).
      5. Pickle ``X_train``, ``X_test``, ``y_train`` and ``y_test`` and log each
         as ``<name>.pkl`` under ``DATA_ARTIFACT_PATH`` in a run named
         ``01_Data_Prep``.

    Returns:
        The ID of the MLflow run (``run.info.run_id``); the next pipeline
        step needs it to locate the logged artifacts.

    Raises:
        Exception: any error raised while reading the table is printed and
            re-raised unchanged.
    """
    # Local import keeps this block self-contained; tempfile is stdlib.
    import tempfile

    setup_mlflow()

    spark = SparkSession.builder.appName("DataPreparation").getOrCreate()

    feature_columns = [
        "sq_feet",
        "num_bedrooms",
        "num_bathrooms",
        "year_built",
        "location_score",
    ]
    target_column = "price"

    with mlflow.start_run(run_name="01_Data_Prep") as run:
        print(f"🚀 MLflow Data Preparation run started: {run.info.run_id}")

        # 1. Load the data
        try:
            df = spark.read.format("delta").table(DELTA_TABLE_NAME).toPandas()
            print(f"✅ Data loaded from {DELTA_TABLE_NAME}. Shape: {df.shape}")
        except Exception as e:
            print(f"❌ Error loading Delta table: {e}")
            raise

        # 2. Features and target
        X = df[feature_columns]
        y = df[target_column]

        # 3. Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # 4. Save the splits as artifacts
        data_to_save = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }

        # Pickle into a private temporary directory instead of the current
        # working directory: nothing pre-existing can be overwritten, and the
        # files are removed even if dumping or logging fails part-way.
        # NOTE: consumers must only unpickle these artifacts from trusted runs.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for name, data in data_to_save.items():
                # The basename is kept as "<name>.pkl" because that is the
                # file name the artifact gets inside DATA_ARTIFACT_PATH.
                temp_file = os.path.join(tmp_dir, f"{name}.pkl")
                with open(temp_file, "wb") as f:
                    pickle.dump(data, f)

                mlflow.log_artifact(temp_file, DATA_ARTIFACT_PATH)
                print(f"📦 Logged {name} to MLflow artifact path: {DATA_ARTIFACT_PATH}/{name}.pkl")

        print("\n--- Data Preparation Complete ---")
        print(f"Data Prep Run ID: {run.info.run_id}")

        # This run ID is required by the next step.
        return run.info.run_id

# Script entry point: run the data preparation step and print the resulting
# MLflow run ID.
if __name__ == "__main__":
    prep_run_id = run_data_preparation()
    # In a Databricks Job, this ID is meant to be passed on as a parameter to the next job.
    print(f"💡 The Data Preparation Run ID for next step is: {prep_run_id}")
