# 🥇 Gold Table: Enhanced ML Features with Temporal Intelligence

**Purpose:** Transform Enhanced Silver data into ML-ready feature vectors with temporal features for superior predictive performance.

**Key Enhancements:**
- ✅ Uses Enhanced Silver with 15 columns (temporal features included)
- ✅ Temporal feature encoding in ML pipeline
- ✅ Improved target classification (binary: on-time vs. delayed ≥15 min)
- ✅ ~1200+ feature dimensions (vs ~800 original)

**Pipeline:** Bronze → Enhanced Silver (15 cols) → **Enhanced Gold (ML-ready)**

**Source:** `default.silver_flights_processed` (Enhanced with temporal features)
**Output:** `default.gold_ml_features_experimental` (Enhanced ML vectors)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler
from pyspark.sql.functions import col, when

### Load Silver Table

In [0]:
print("\nüì• Loading Silver table...")
df_silver = spark.table("default.silver_flights_processed")

print(f"‚úÖ Loaded {df_silver.count():,} records")
print(f"Columns: {len(df_silver.columns)}")

### TARGET VARIABLE CREATION

In [0]:
print("\nüéØ Creating binary target (delayed ‚â•15 min)...")

# Binary classification: 0 = on-time (<15 min), 1 = delayed (‚â•15 min)
# Exclude cancelled flights (null arrival_delay)
df_with_target = df_silver.filter(col("arrival_delay").isNotNull()).withColumn(
    "label",
    when(col("arrival_delay") >= 15, 1.0).otherwise(0.0)
)

print("üìä Target Distribution:")
df_with_target.groupBy("label").count().orderBy("label").show()

total = df_with_target.count()
delayed = df_with_target.filter(col("label") == 1.0).count()
print(f"Total flights: {total:,}")
print(f"Delayed (‚â•15 min): {delayed:,} ({delayed/total*100:.1f}%)")
print(f"On-time (<15 min): {total - delayed:,} ({(total - delayed)/total*100:.1f}%)")


### FEATURE ORGANIZATION

In [0]:
print("\nüìã Organizing features...")

# Categorical features (need encoding)
categorical_features = [
    "airline_name",
    "airline_code",
    "origin_airport_code",
    "destination_airport_code",
    "season"
]

# Numerical features
numerical_features = [
    "flight_month",
    "flight_year",
    "day_of_week",
    "week_of_year",
    "day_of_month",
    "quarter",
    "fl_number",
    "crs_dep_time",
    "crs_arr_time",
    "crs_elapsed_time",
    "distance",
    "dep_delay"  # At index 11 - will be removed for pre-departure model in experiments
]

# Boolean features (already 0/1 in Silver)
boolean_features = [
    "is_weekend",
    "is_holiday",
    "is_near_holiday",
    "is_holiday_period"
]

print(f"‚úÖ Feature counts:")
print(f"   Categorical: {len(categorical_features)}")
print(f"   Numerical: {len(numerical_features)} (includes dep_delay at index 11)")
print(f"   Boolean: {len(boolean_features)}")
print(f"   Total: {len(categorical_features + numerical_features + boolean_features)}")


### HANDLE TIME FEATURES (Convert HHMM format to hour of day)

In [0]:
print("\n‚è∞ Converting time features to hour of day...")

# Scheduled times are in HHMM form (e.g. 1430); dividing by 100 and casting to
# int keeps only the hour (e.g. 14).
# NOTE(review): a value of 2400, if it occurs in Silver, would yield hour 24,
# outside the 0-23 range printed below - confirm against the data.
time_to_hour = {"crs_dep_time": "dep_hour", "crs_arr_time": "arr_hour"}

df_ml_ready = df_with_target
for hhmm_col, hour_col in time_to_hour.items():
    df_ml_ready = df_ml_ready.withColumn(hour_col, (col(hhmm_col) / 100).cast("int"))

# Replace the original time columns with the hour columns in the feature list.
# The list is rebuilt rather than edited with list.remove()/extend() so this
# cell can be re-run safely: a second run neither raises ValueError nor
# appends duplicates. Slice assignment updates the existing list object, and
# the resulting order (remaining columns, then dep_hour, arr_hour) is the same
# as before.
hour_cols = list(time_to_hour.values())
numerical_features[:] = [
    name
    for name in numerical_features
    if name not in time_to_hour and name not in hour_cols
] + hour_cols

print("‚úÖ Converted crs_dep_time ‚Üí dep_hour (0-23)")
print("‚úÖ Converted crs_arr_time ‚Üí arr_hour (0-23)")

### HANDLE MISSING VALUES

In [0]:
print("\nüîç Checking for missing values...")

from pyspark.sql.functions import count, isnan

all_feature_cols = categorical_features + numerical_features + boolean_features
missing_found = False

for col_name in all_feature_cols:
    null_count = df_ml_ready.filter(col(col_name).isNull()).count()
    if null_count > 0:
        print(f"  ‚ö†Ô∏è {col_name}: {null_count:,} nulls")
        missing_found = True

if missing_found:
    print("\nüîß Filling missing values...")
    # Fill numerical nulls with 0
    for col_name in numerical_features:
        df_ml_ready = df_ml_ready.fillna({col_name: 0})
    print("‚úÖ Missing values handled")
else:
    print("‚úÖ No missing values found")


### ML PIPELINE CONSTRUCTION

In [0]:
print("\n‚ö° Building feature engineering pipeline...")

# Stage 1: one StringIndexer per categorical column, writing "<column>_index".
categorical_indexed = [f"{name}_index" for name in categorical_features]
indexers = [
    StringIndexer(inputCol=source, outputCol=target, handleInvalid="keep")
    for source, target in zip(categorical_features, categorical_indexed)
]
print(f"‚úÖ Created {len(indexers)} string indexers")

# Stage 2: a single OneHotEncoder turning every "<column>_index" into "<column>_ohe".
categorical_encoded = [f"{name}_ohe" for name in categorical_features]
encoder = OneHotEncoder(
    handleInvalid="keep",
    inputCols=categorical_indexed,
    outputCols=categorical_encoded,
)
print(f"‚úÖ Created one-hot encoder")

# Stage 3: assemble one vector. Order matters for downstream index-based
# selection: numerical columns first, then boolean, then the one-hot blocks.
all_features = [*numerical_features, *boolean_features, *categorical_encoded]
assembler = VectorAssembler(
    handleInvalid="skip",
    inputCols=all_features,
    outputCol="unscaled_features",
)
print(f"‚úÖ Created feature assembler ({len(all_features)} input columns)")

# Stage 4: standardize the assembled vector into the final "features" column.
# NOTE(review): per the Spark StandardScaler docs, withMean=True produces dense
# output; with a wide one-hot vector this may be memory-heavy - confirm acceptable.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="unscaled_features",
    outputCol="features",
)
print(f"‚úÖ Created standard scaler")

# Chain everything: all indexers, then encoder, assembler and scaler.
pipeline = Pipeline(stages=[*indexers, encoder, assembler, scaler])
print(f"\nüöÄ Pipeline ready with {len(indexers) + 3} stages")

### FIT AND TRANSFORM PIPELINE

In [0]:
# Fit every pipeline stage on the prepared data.
print("\n‚ö° Fitting pipeline (this may take a few minutes)...")
fitted_pipeline = pipeline.fit(df_ml_ready)
print("‚úÖ Pipeline fitted")

# Apply the fitted stages to the same data to produce the feature vectors.
print("\nüîÑ Transforming data...")
df_gold = fitted_pipeline.transform(df_ml_ready)
print("‚úÖ Transformation complete")

# The Gold output keeps only the scaled feature vector and the target.
df_gold_final = df_gold.select("features", "label")

# Read one row to find out how long the assembled vector is.
sample = df_gold_final.select("features").first()
feature_size = sample["features"].size

print(
    f"\nüìè Feature vector dimensions: {feature_size}",
    f"   Note: High dimensionality due to one-hot encoding of airports/airlines",
    f"   Feature selection will be performed in experiments notebook",
    sep="\n",
)


### FINAL VALIDATION

In [0]:
# Print the schema, label balance and a small sample of the Gold DataFrame.
print("GOLD TABLE VALIDATION")

print("\nüìã Schema:")
df_gold_final.printSchema()

print("\nüìä Final Target Distribution:")
final_label_counts = df_gold_final.groupBy("label").count()
final_label_counts.orderBy("label").show()

print("\nüîé Sample (features truncated):")
df_gold_final.show(n=3)

# record_count is reused by the summary at the end of the notebook.
record_count = df_gold_final.count()
print(
    f"\n‚úÖ Gold table ready:",
    f"   Records: {record_count:,}",
    f"   Features: {feature_size}",
    f"   Target: Binary (0=on-time, 1=delayed)",
    sep="\n",
)

### CREATE FINAL GOLD TABLE

In [0]:
# Where the Gold output is stored and the name it is registered under.
DATABASE_NAME = "default"
GOLD_TABLE_NAME = f"{DATABASE_NAME}.gold_ml_features_experimental"
GOLD_PATH = "/Volumes/workspace/default/ds-capstone/gold/ml_features_experimental"

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "SAVING GOLD TABLE", "=" * 80, sep="\n")

# Helper functions
def path_exists(path):
    """Return True if ``dbutils.fs.ls`` can list ``path``, otherwise False.

    Any ordinary error raised by the listing call is treated as "the path
    does not exist". Only ``Exception`` subclasses are caught, so
    KeyboardInterrupt and SystemExit still propagate instead of being
    silently reported as a missing path.
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        return False
    return True

def create_directory_if_not_exists(path):
    """Create ``path`` with ``dbutils.fs.mkdirs`` unless ``path_exists`` reports it.

    Prints which of the two cases applied; returns nothing.
    """
    if path_exists(path):
        print(f"‚ÑπÔ∏è  Directory exists: {path}")
        return
    dbutils.fs.mkdirs(path)
    print(f"‚úÖ Created directory: {path}")

# Check and clean if needed
print(f"\nüìÅ Checking path: {GOLD_PATH}")
if path_exists(GOLD_PATH):
    print(f"‚ö†Ô∏è  Path exists, will overwrite")
    try:
        test_df = spark.read.format("delta").load(GOLD_PATH)
        print(f"   Found existing table with {test_df.count()} records")
    except Exception as e:
        # The path could not be read as a Delta table, so it is removed before
        # the write. Catch Exception (not a bare except) so an interrupt does
        # not trigger the recursive delete, and print the reason so a transient
        # read failure is not mistaken for invalid data.
        print(f"   Cleaning invalid data...")
        print(f"   Reason: {e}")
        dbutils.fs.rm(GOLD_PATH, recurse=True)

# Create parent directory (GOLD_PATH without its last path component)
gold_parent = "/".join(GOLD_PATH.split("/")[:-1])
create_directory_if_not_exists(gold_parent)

# Write Delta table
print(f"\nüíæ Writing Delta table...")
try:
    # Keep the try body to the write itself: only a failed write should
    # trigger the destructive clean-and-retry path below.
    df_gold_final.write.format("delta").mode("overwrite").save(GOLD_PATH)
except Exception as e:
    print(f"‚ùå Error: {e}")
    print(f"   Cleaning and retrying...")
    dbutils.fs.rm(GOLD_PATH, recurse=True)
    # A failure of this second attempt is not caught and propagates.
    df_gold_final.write.format("delta").mode("overwrite").save(GOLD_PATH)
    print(f"‚úÖ Succeeded after cleanup")
else:
    print(f"‚úÖ Delta table written to: {GOLD_PATH}")

# Report the row count of what was actually written by reading the Delta table
# back, rather than re-running the whole feature pipeline via
# df_gold_final.count(). Doing this outside the try also means a failed count
# can no longer delete a table that was written successfully.
written_count = spark.read.format("delta").load(GOLD_PATH).count()
print(f"‚úÖ Records written: {written_count:,}")

# Register table: first try saveAsTable, then fall back to CREATE TABLE ... LOCATION.
print(f"\nüìå Registering table: {GOLD_TABLE_NAME}")
try:
    # Make sure the database exists and any previous table definition is gone.
    for statement in (
        f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}",
        f"DROP TABLE IF EXISTS {GOLD_TABLE_NAME}",
    ):
        spark.sql(statement)

    # Read back what was written to GOLD_PATH and save it under the table name.
    df_for_table = spark.read.format("delta").load(GOLD_PATH)
    df_for_table.write.format("delta").mode("overwrite").saveAsTable(GOLD_TABLE_NAME)

    print(f"‚úÖ Table registered: {GOLD_TABLE_NAME}")
except Exception as err:
    print(f"‚ö†Ô∏è  Error with saveAsTable: {err}")
    try:
        # Fallback: point a table definition directly at the Delta files.
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {GOLD_TABLE_NAME}
            USING DELTA
            LOCATION '{GOLD_PATH}'
        """)
        print(f"‚úÖ Table registered with LOCATION clause")
    except Exception as fallback_err:
        # Registration is best-effort: the data is still readable by path.
        print(f"‚ö†Ô∏è  Registration failed: {fallback_err}")
        print(f"üí° Access data using: spark.read.format('delta').load('{GOLD_PATH}')")

### Summary

In [0]:
print("GOLD TABLE CREATION COMPLETE")
print("=" * 80)

print("\n‚úÖ What was created:")
print(f"   ‚Ä¢ Delta table at: {GOLD_PATH}")
print(f"   ‚Ä¢ Registered as: {GOLD_TABLE_NAME}")
print(f"   ‚Ä¢ Records: {record_count:,}")
print(f"   ‚Ä¢ Feature dimensions: {feature_size}")
print(f"   ‚Ä¢ Binary target: 0 (on-time) / 1 (delayed ‚â•15 min)")

print("\nüìã Feature breakdown:")
print(f"   ‚Ä¢ Numerical: {len(numerical_features)} (includes dep_delay)")
print(f"   ‚Ä¢ Boolean: {len(boolean_features)}")
print(f"   ‚Ä¢ Categorical (one-hot): {len(categorical_features)} ‚Üí ~{feature_size - len(numerical_features) - len(boolean_features)} dims")

# Position of dep_delay in the final feature vector. The assembler receives the
# numerical features first, so the list index is also the vector index.
# It is looked up by name because a hard-coded index goes stale whenever the
# list is reordered: the time-feature conversion moves dep_delay away from its
# initial position (11), and removing the wrong index downstream would leave
# dep_delay in the pre-departure model.
dep_delay_index = numerical_features.index("dep_delay")

print("\n‚ö†Ô∏è  IMPORTANT for experiments:")
print(f"   ‚Ä¢ dep_delay is at index {dep_delay_index} in numerical features")
print(f"   ‚Ä¢ Remove this index for pre-departure model")
print(f"   ‚Ä¢ Keep all features for in-flight model")

print("\nüéØ Next steps:")
print("   1. Run feature importance analysis (in experiments)")
print("   2. Select top-K most important features")
print("   3. Train dual models (pre-departure + in-flight)")
print("   4. Register best models in MLflow")

print("\n‚úÖ Gold table is ready for ML experiments!")