In [None]:
# data_ingest_and_preparation.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

# ==================== CONFIGURATION ====================
# Source Delta table; main() passes this name to ingest_data().
DELTA_TABLE_NAME = "house_price_delta"
# Destination Delta table; prepare_data() writes the scaled output here in overwrite mode.
OUTPUT_DELTA_TABLE = "house_price_scaled_delta"

# ==================== FUNCTIONS ====================

def initialize_spark():
    """Obtain the SparkSession used by this job.

    The session is requested through ``SparkSession.builder`` with the
    application name ``DataIngestAndPreparation`` and ``getOrCreate()``.

    Returns:
        The SparkSession handed back by the builder.

    Raises:
        Exception: Any error raised while obtaining the session. The failure
            is printed and then re-raised unchanged.
    """
    try:
        builder = SparkSession.builder
        named_builder = builder.appName("DataIngestAndPreparation")
        session = named_builder.getOrCreate()
        print("‚úÖ SparkSession initialized successfully")
        return session
    except Exception as e:
        # Report the failure, then let the caller see the original error.
        print(f"‚ùå SparkSession initialization failed: {e}")
        raise

def ingest_data(spark, table_name):
    """Load a Delta table and project it down to the model's input columns.

    The five feature columns and the ``price`` column are cast to ``double``,
    and ``price`` is renamed to ``label``. All other columns of the source
    table are dropped.

    Args:
        spark: SparkSession used to read the table.
        table_name: Name of the Delta table to read.

    Returns:
        A ``(df, feature_cols)`` tuple: the projected DataFrame (feature
        columns followed by ``label``) and the list of feature column names.

    Raises:
        Exception: Any error raised while reading or projecting the table.
            The failure is printed and then re-raised unchanged.
    """
    print(f"üì• Loading data from Delta table '{table_name}' ...")
    try:
        feature_cols = ["sq_feet", "num_bedrooms", "num_bathrooms", "year_built", "location_score"]
        label_col = "price"

        # Cast and rename in a single projection. Spark's documentation advises
        # against calling withColumn() in a loop, since every call adds another
        # projection to the query plan; one select() produces the same columns.
        projection = [col(c).cast("double").alias(c) for c in feature_cols]
        projection.append(col(label_col).cast("double").alias("label"))

        df = spark.read.format("delta").table(table_name).select(*projection)

        # count() is evaluated here only to report the row total in the log line.
        print(f"‚úÖ Data successfully ingested. Total rows: {df.count()}")
        df.printSchema()
        return df, feature_cols
    except Exception as e:
        print(f"‚ùå Data ingestion failed: {e}")
        raise

def prepare_data(spark, df, feature_cols, output_table=None):
    """Standardize the feature columns and save the result to a Delta table.

    Steps: fill missing values with 0, assemble ``feature_cols`` into a
    vector, fit and apply a ``StandardScaler`` (``withMean=True``,
    ``withStd=True``), expand the scaled vector back into one
    ``<name>_scaled`` column per feature, then overwrite the output table.

    Args:
        spark: SparkSession. Not used inside this function; kept so existing
            callers keep working.
        df: Input DataFrame. Must contain every column in ``feature_cols``
            and a ``label`` column.
        feature_cols: Names of the columns to scale, in output order.
        output_table: Name of the Delta table to overwrite. Defaults to the
            module-level ``OUTPUT_DELTA_TABLE`` when omitted.

    Returns:
        DataFrame with the ``<name>_scaled`` columns followed by ``label``.

    Raises:
        Exception: Any error raised while fitting/applying the pipeline or
            while writing the table. The failure is printed and re-raised.
    """
    # Imported up front so a missing vector_to_array fails before the
    # pipeline is fitted rather than after.
    from pyspark.ml.functions import vector_to_array

    target_table = OUTPUT_DELTA_TABLE if output_table is None else output_table

    print("\n‚öôÔ∏è Feature scaling started...")

    # Step 0: Handle missing / NaN values
    # NOTE(review): fill(0) is applied without a column subset, so it is not
    # limited to the feature columns (a numeric label would be zero-filled
    # too) -- confirm that 0 is an acceptable placeholder for these fields.
    print("üßπ Checking and filling missing values...")
    df = df.na.fill(0)

    # Steps 1-3: assemble the features into one vector, then standardize it.
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vector")
    scaler = StandardScaler(
        inputCol="features_vector", outputCol="scaled_features", withMean=True, withStd=True
    )
    pipeline = Pipeline(stages=[assembler, scaler])

    try:
        model = pipeline.fit(df)
        scaled_data = model.transform(df)
    except Exception as e:
        print(f"‚ùå Scaling pipeline failed: {e}")
        raise

    # Steps 4-5: convert the scaled vector to an array once, then pull every
    # element out in a single select(). Spark's documentation advises against
    # calling withColumn() in a loop because each call adds a projection.
    with_array = scaled_data.withColumn(
        "scaled_array", vector_to_array(col("scaled_features"))
    )
    scaled_columns = [
        col("scaled_array")[i].alias(f"{c}_scaled") for i, c in enumerate(feature_cols)
    ]
    scaled_final_df = with_array.select(*scaled_columns, col("label"))

    print("‚úÖ Feature scaling completed successfully.")
    scaled_final_df.show(5)

    # Step 6: Save to Delta table (overwrite mode)
    try:
        scaled_final_df.write.format("delta").mode("overwrite").saveAsTable(target_table)
        print(f"‚úÖ Scaled data successfully saved to Delta table '{target_table}'")
    except Exception as e:
        print(f"‚ùå Failed to save scaled data to Delta: {e}")
        raise

    return scaled_final_df

# ==================== EXECUTION ====================
def main():
    """Run the job end to end: start Spark, ingest the source table, scale and save."""
    print("üöÄ Starting data ingestion and preparation job...")

    # The session is created before any other step runs.
    spark = initialize_spark()

    # Read the source table and get the list of feature column names.
    raw_df, features = ingest_data(spark, DELTA_TABLE_NAME)

    # Scale the features and write the result; the returned DataFrame is not needed here.
    prepare_data(spark, raw_df, features)

    print("üéØ Data ingestion and preparation pipeline completed successfully.")

# Run the job only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
