# In [None]:
# Databricks notebook source
# data_ingest_and_preparation.py

from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

# ==================== CONFIGURATION ====================
DELTA_TABLE_NAME = "house_price_delta"
OUTPUT_DELTA_TABLE = "house_price_scaled_delta"

# ==================== FUNCTIONS ====================

def ingest_data(table_name):
    """Load a Delta table and project it down to model-ready columns.

    The table is read through the global ``spark`` session. The five feature
    columns and the ``price`` column are cast to double (the scaler expects
    double inputs), and ``price`` is exposed under the name ``label``.

    Args:
        table_name: Name of the Delta table to read.

    Returns:
        A ``(df, feature_cols)`` tuple: the projected DataFrame (features
        followed by ``label``) and the list of feature column names.

    Raises:
        Exception: Any failure while reading or projecting the table is
            printed and then re-raised unchanged.
    """
    print(f"üì• Loading data from Delta table '{table_name}' ...")
    try:
        # `spark` is not defined in this file; it is expected to exist as a
        # global, as it does inside a Databricks notebook.
        source_df = spark.read.format("delta").table(table_name)

        label_col = "price"
        feature_cols = [
            "sq_feet",
            "num_bedrooms",
            "num_bathrooms",
            "year_built",
            "location_score",
        ]

        # Cast and project in a single select: every feature keeps its name,
        # the price column is renamed to "label".
        projection = [
            col(name).cast("double").alias(name) for name in feature_cols
        ]
        projection.append(col(label_col).cast("double").alias("label"))
        df = source_df.select(*projection)

        row_count = df.count()
        print(f"‚úÖ Data successfully ingested. Total rows: {row_count}")
        df.printSchema()
        return df, feature_cols
    except Exception as exc:
        print(f"‚ùå Data ingestion failed: {exc}")
        raise

def prepare_data(df, feature_cols):
    """Standardize the feature columns and save the result as a Delta table.

    Steps, in order: fill missing values with 0, assemble ``feature_cols``
    into a vector, fit and apply a ``StandardScaler`` (``withMean=True``,
    ``withStd=True``), unpack the scaled vector into one ``<name>_scaled``
    column per feature, and overwrite the table named by
    ``OUTPUT_DELTA_TABLE`` with those columns plus ``label``.

    Args:
        df: Input DataFrame; must contain every column in ``feature_cols``
            and a ``label`` column.
        feature_cols: Names of the columns to scale, in output order.

    Returns:
        The DataFrame that was written: ``<name>_scaled`` for each feature,
        followed by ``label``.

    Raises:
        Exception: Failures while fitting/applying the pipeline or while
            writing the table are printed and re-raised unchanged.
    """
    print("\n‚öôÔ∏è Feature scaling started...")

    print("üßπ Checking and filling missing values...")
    # NOTE(review): fill(0) is called without a subset, so it is not limited
    # to the feature columns -- a null `label` would presumably be replaced
    # by 0 as well. Confirm that this is intended.
    filled_df = df.na.fill(0)

    # Assembler and scaler are chained so they are fitted and applied as one.
    scaling_pipeline = Pipeline(
        stages=[
            VectorAssembler(
                inputCols=feature_cols,
                outputCol="features_vector",
            ),
            StandardScaler(
                inputCol="features_vector",
                outputCol="scaled_features",
                withMean=True,
                withStd=True,
            ),
        ]
    )

    try:
        print("üîÑ Fitting scaling pipeline...")
        scaled_data = scaling_pipeline.fit(filled_df).transform(filled_df)
        print("‚úÖ Pipeline fitted and transformed successfully")
    except Exception as exc:
        print(f"‚ùå Scaling pipeline failed: {exc}")
        import traceback
        traceback.print_exc()
        raise

    # Convert the ML vector to a plain array so that individual elements can
    # be pulled out by position.
    from pyspark.ml.functions import vector_to_array
    with_array = scaled_data.withColumn(
        "scaled_array", vector_to_array(col("scaled_features"))
    )

    # Element i of the array corresponds to feature_cols[i].
    scaled_columns = [
        col("scaled_array")[position].alias(f"{name}_scaled")
        for position, name in enumerate(feature_cols)
    ]
    scaled_final_df = with_array.select(*scaled_columns, col("label"))

    print("‚úÖ Feature scaling completed successfully.")
    scaled_final_df.show(5)

    # Persist the result, replacing any existing contents of the table.
    try:
        print(f"üíæ Saving scaled data to Delta table '{OUTPUT_DELTA_TABLE}'...")
        writer = scaled_final_df.write.format("delta").mode("overwrite")
        writer.saveAsTable(OUTPUT_DELTA_TABLE)
        print(f"‚úÖ Scaled data successfully saved to Delta table '{OUTPUT_DELTA_TABLE}'")
    except Exception as exc:
        print(f"‚ùå Failed to save scaled data to Delta: {exc}")
        raise

    return scaled_final_df

# ==================== EXECUTION ====================
def main():
    """Run ingestion and scaling end to end and print a run summary.

    Checks that the global ``spark`` session exists, ingests
    ``DELTA_TABLE_NAME``, scales it via ``prepare_data`` (which also writes
    ``OUTPUT_DELTA_TABLE``), and prints the number of records processed.

    Raises:
        Exception: ``"SparkSession not available"`` when ``spark`` is not
            defined; any error from ingestion or preparation propagates.
    """
    rule = "=" * 70

    print(rule)
    print("üöÄ DATA INGESTION AND PREPARATION PIPELINE")
    print(rule)

    # Reading spark.version doubles as the check that `spark` is defined.
    try:
        print(f"‚úÖ Using Databricks SparkSession: {spark.version}")
    except NameError:
        print("‚ùå Error: 'spark' variable not found!")
        print("This script must be run in a Databricks notebook")
        raise Exception("SparkSession not available")

    # Ingest, then scale and save; ingest_data returns (df, feature_cols),
    # which is exactly the argument list of prepare_data.
    scaled_df = prepare_data(*ingest_data(DELTA_TABLE_NAME))

    print("\n" + rule)
    print("üéâ DATA PIPELINE COMPLETED SUCCESSFULLY")
    print(rule)
    print(f"‚úÖ Scaled data available in: {OUTPUT_DELTA_TABLE}")
    print(f"‚úÖ Total records processed: {scaled_df.count()}")
    print(rule)

# ==================== MAIN EXECUTION ====================
# Run the pipeline when the notebook/script is executed. On failure, print a
# failure banner with the error and traceback, then re-raise so the run ends
# in a failed state.
try:
    main()
except Exception as e:
    print("\n" + "=" * 70)
    print("‚ùå PIPELINE FAILED")
    print("=" * 70)
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

    # Fail the Databricks job by propagating the original exception.
    # Per the Databricks documentation, dbutils.notebook.exit() completes the
    # notebook run successfully (its argument is only the exit value), so
    # calling exit("FAILED") would report a failed pipeline as a succeeded
    # job. Re-raising works the same way inside and outside a notebook.
    raise