### Prerequisites:

You will need to create a secret scope and key with the Databricks CLI, which can be done from a Bash or PowerShell terminal.

In [0]:
!pip install tabulate

In [0]:
import requests
import pandas as pd
import json
from datetime import datetime
from pyspark.sql import SparkSession 
import numpy as np
import re

# --- 1. CONFIGURATION ---
# The Aviationstack access key is read from a Databricks secret scope so it never appears in the notebook source.
API_KEY = dbutils.secrets.get(scope="my-secrets", key="aviation_stack_api") #change this to your scope and key
BASE_URL = "https://api.aviationstack.com/v1"

# --- 2. GET USER INPUT FROM WIDGETS ---
# dbutils.widgets.removeAll() is deliberately NOT called here: Databricks does not support creating
# widgets in the same cell that removes them. If stale widgets ever need clearing, run removeAll()
# from its own cell before this one.
dbutils.widgets.text(name="p_flight_icao", defaultValue="DAL1234", label="Flight ICAO Code")
dbutils.widgets.text(name="p_departure_date", defaultValue="2025-11-30", label="Departure Date (YYYY-MM-DD)")
dbutils.widgets.dropdown(name="p_current_stage", choices=["Pre-Departure", "In-Flight"], defaultValue="Pre-Departure", label="Current Prediction Stage")

p_flight_icao = dbutils.widgets.get("p_flight_icao")
# Text widgets return exactly what was typed; strip stray whitespace so it is not sent to the API.
p_departure_date = dbutils.widgets.get("p_departure_date").strip()
p_current_stage = dbutils.widgets.get("p_current_stage")
# Normalised flight designator: no spaces, upper case (e.g. "dal 1234" -> "DAL1234").
CLEAN_FLIGHT_CODE = p_flight_icao.replace(" ", "").upper()

print(f"Fetching data for flight code: {CLEAN_FLIGHT_CODE} on date: {p_departure_date}")
# ---------------------------------------------


# --- 3. DYNAMIC CONFIGURATION (NEW: Using data from Gold Table and Selection) ---

# Column that carries the observed departure delay. It is the one feature the
# pre-departure model must not see.
DEP_DELAY_FEATURE_NAME = 'dep_delay'

# 1. IN-FLIGHT FEATURES
# The 40 selected model inputs, in selection order (per the original note, taken from
# selected_indices.csv). Entries named OHE_DIMENSION_<n> are generic placeholders for
# one-hot-encoded dimensions whose category is not resolved in this notebook.
IN_FLIGHT_FEATURES = [
    DEP_DELAY_FEATURE_NAME,
    'dep_hour',
    'arr_hour',
    'OHE_DIMENSION_47',
    'OHE_DIMENSION_23',
    'OHE_DIMENSION_817',
    'OHE_DIMENSION_42',
    'OHE_DIMENSION_43',
    'OHE_DIMENSION_451',
    'week_of_year',
    'flight_month',
    'OHE_DIMENSION_814',
    'flight_year',
    'OHE_DIMENSION_30',
    'OHE_DIMENSION_444',
    'OHE_DIMENSION_19',
    'OHE_DIMENSION_36',
    'OHE_DIMENSION_17',
    'OHE_DIMENSION_27',
    'distance',
    'OHE_DIMENSION_28',
    'OHE_DIMENSION_37',
    'OHE_DIMENSION_484',
    'OHE_DIMENSION_16',
    'crs_elapsed_time',
    'OHE_DIMENSION_434',
    'quarter',
    'fl_number',
    'OHE_DIMENSION_54',
    'OHE_DIMENSION_35',
    'OHE_DIMENSION_24',
    'OHE_DIMENSION_52',
    'OHE_DIMENSION_57',
    'day_of_week',
    'OHE_DIMENSION_816',
    'OHE_DIMENSION_38',
    'is_holiday_period',
    'OHE_DIMENSION_50',
    'OHE_DIMENSION_46',
    'OHE_DIMENSION_73',
]

# 2. PRE-DEPARTURE FEATURES
# Same columns in the same order, minus the departure delay (the first entry above).
PRE_DEP_FEATURES = list(
    filter(lambda feature_name: feature_name != DEP_DELAY_FEATURE_NAME, IN_FLIGHT_FEATURES)
)

# 3. MODEL REGISTRY MAPPING
# Keyed by the value of the "p_current_stage" widget. Each stage pairs its feature list with
# the registered model name; this assumes two separate models were registered.
MODEL_MAP = {
    stage_label: {"features": stage_features, "name": registered_name}
    for stage_label, stage_features, registered_name in (
        ("Pre-Departure", PRE_DEP_FEATURES, "flight_delay_predictor_pre_dep"),
        ("In-Flight", IN_FLIGHT_FEATURES, "flight_delay_predictor_in_flight"),
    )
}
# ---------------------------------------------

# --- 4. DATA FETCHING (Unchanged from original code) ---

def fetch_live_flight_data(base_url, api_key, flight_code, departure_date):
    """Fetch one flight from the Aviationstack ``/flights`` endpoint and flatten it.

    The flight code is split into a letter prefix and a numeric part, and up to three
    query variants are tried in order (``airline_iata`` + ``flight_number``,
    ``flight_iata``, ``flight_icao``). The first variant that returns at least one
    flight wins; failed variants are reported on stdout and skipped.

    Args:
        base_url: API root, e.g. ``https://api.aviationstack.com/v1``.
        api_key: Aviationstack access key, sent as the ``access_key`` parameter.
        flight_code: Flight designator such as ``"DL1585"``.
        departure_date: Value sent as the ``flight_date`` parameter.

    Returns:
        pandas.DataFrame with one row per returned flight and the flattened columns
        built below. An empty DataFrame is returned when the flight code cannot be
        parsed or no query variant yields data.
    """
    secret = str(api_key) if api_key else ""

    def _redact(text):
        # requests includes the full URL (query string and access key) in its error text,
        # so mask the key before anything is printed into the notebook output.
        return text.replace(secret, "***") if secret else text

    # Parse flight code (e.g., "DL1585" -> "DL" + "1585")
    match = re.match(r"([A-Za-z]+)(\d+)", flight_code)
    if not match:
        print("‚ùå Could not parse flight code into airline code and flight number.")
        return pd.DataFrame()

    airline, number = match.group(1), match.group(2)
    url = f"{base_url}/flights"

    # Valid free-tier attempts (ordered by likelihood of success on restricted tiers)
    attempts = [
        # 1. Separate components: airline_iata + flight_number
        {"airline_iata": airline, "flight_number": number},
        # 2. Combined IATA code
        {"flight_iata": flight_code},
        # 3. Combined ICAO code
        {"flight_icao": flight_code},
    ]

    flights_data = []

    for attempt in attempts:
        filter_name = next(iter(attempt))  # first key of the attempt, used only for log messages
        params = {"access_key": api_key, "flight_date": departure_date, "limit": 1, **attempt}

        print(f"Trying query: {filter_name}...")

        try:
            # The timeout keeps a stalled connection from hanging the notebook indefinitely.
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # Connectivity problems, HTTP error statuses, or a body that is not valid JSON.
            print(f"‚ùå API Request Failed with {_redact(str(e))}")
            continue

        if not isinstance(data, dict):
            print(f"‚ùå API Error for attempt {filter_name}: unexpected response payload")
            continue

        # Application-level errors arrive as an "error" entry in an otherwise valid JSON body.
        if "error" in data:
            error = data["error"]
            message = error.get("message", error) if isinstance(error, dict) else error
            print(f"‚ùå API Error for attempt {filter_name}: {message}")
            continue

        flights = data.get("data") or []  # tolerate a missing or null "data" field
        if flights:
            print(f"‚úÖ SUCCESS: Data retrieved using {filter_name} filter.")
            flights_data = flights
            break  # Exit loop on first successful query

    if not flights_data:
        print("‚ùå No valid data found for any API query method.")
        return pd.DataFrame()

    # --- DATA EXTRACTION & FLATTENING (column names are relied on downstream) ---
    extracted_flights = []
    for flight in flights_data:
        # Each nested section may be absent or null; fall back to an empty dict so .get() is safe.
        flight_info = flight.get('flight', {}) or {}
        airline_info = flight.get('airline', {}) or {}
        departure_info = flight.get('departure', {}) or {}
        arrival_info = flight.get('arrival', {}) or {}
        live_info = flight.get('live', {}) or {}

        flight_record = {
            'flight_date': flight.get('flight_date'),
            'flight_status': flight.get('flight_status'),

            # Flight Info
            'flight_iata': flight_info.get('iata'),
            'flight_icao': flight_info.get('icao'),
            'flight_number': flight_info.get('number'),

            # Airline Info
            'airline_name': airline_info.get('name'),
            'airline_iata': airline_info.get('iata'),
            'airline_icao': airline_info.get('icao'),

            # Departure Info
            'departure_airport': departure_info.get('airport'),
            'departure_iata': departure_info.get('iata'),
            'departure_icao': departure_info.get('icao'),
            'departure_delay': departure_info.get('delay'),
            'departure_scheduled': departure_info.get('scheduled'),
            'departure_estimated': departure_info.get('estimated'),
            'departure_actual': departure_info.get('actual'),
            'departure_timezone': departure_info.get('timezone'),

            # Arrival Info
            'arrival_airport': arrival_info.get('airport'),
            'arrival_iata': arrival_info.get('iata'),
            'arrival_icao': arrival_info.get('icao'),
            'arrival_scheduled': arrival_info.get('scheduled'),

            # Live Data
            'live_latitude': live_info.get('latitude'),
            'live_longitude': live_info.get('longitude'),
            'live_altitude': live_info.get('altitude'),
            'live_speed_horizontal': live_info.get('speed_horizontal'),
            'live_is_ground': live_info.get('is_ground')
        }
        extracted_flights.append(flight_record)

    df = pd.DataFrame(extracted_flights)
    print(f"Successfully extracted {len(df)} flight record(s).")
    return df

# --- 5. CORE EXECUTION ---
# Pull the single flight selected through the widgets; the result is empty when every query fails.
live_flight_df = fetch_live_flight_data(
    base_url=BASE_URL,
    api_key=API_KEY,
    flight_code=CLEAN_FLIGHT_CODE,
    departure_date=p_departure_date,
)

# Preview the flattened record as a Markdown table, but only when something came back.
if not live_flight_df.empty:
    print("\n--- Fetched Data Head ---\n")
    preview_rows = live_flight_df.head()
    print(preview_rows.to_markdown(index=False))

# --- 6. FEATURE ENGINEERING (Corrected for Data Flow) ---

# WARNING: This simplified function is sufficient for the NON-OHE features, 
# but it CANNOT correctly generate the 819-dimension OHE features 
# without the learned vocabulary (StringIndexer) from your training pipeline.

# *** CRITICAL ACTION ***
# To make this production-ready, you must:
# 1. Save the fitted Spark ML Pipeline (or at least the StringIndexer & OneHotEncoder stages) 
#    from the Gold table creation notebook to a known location (e.g., an MLflow artifact).
# 2. Load those stages here and use them to transform the live flight data.

def _parse_iso_timestamp(value):
    """Parse an ISO-8601 string (a trailing 'Z' is accepted); return None if missing or invalid."""
    try:
        return datetime.fromisoformat(value.replace('Z', '+00:00'))
    except (ValueError, TypeError, AttributeError):
        # AttributeError covers None/NaN inputs, which have no .replace().
        return None


def _float_or_default(value, default=0.0):
    """Convert ``value`` to float, substituting ``default`` for None or NaN."""
    if value is None or pd.isna(value):
        return default
    return float(value)


def engineer_live_features_simplified(live_df, required_features, dep_delay_name):
    """
    Build a one-row model input from the first row of the flattened API DataFrame.

    Only the non-OHE features are derived from the API data. Every ``OHE_DIMENSION_*``
    entry in ``required_features`` is created as a 0.0 placeholder, so the result is
    shape-compatible with the model but does not encode the flight's real categories.

    Args:
        live_df: DataFrame produced by ``fetch_live_flight_data``; only row 0 is used.
        required_features: Ordered column names the model expects.
        dep_delay_name: Name of the departure-delay feature column.

    Returns:
        pandas.DataFrame with exactly one row and columns equal to ``required_features``
        (in that order), or an empty DataFrame when ``live_df`` is empty. Required columns
        that are not computed here are filled with 0.
    """
    if live_df.empty:
        return pd.DataFrame()

    flight = live_df.iloc[0]

    # 1. TEMPORAL & NUMERICAL FEATURES MAPPING
    scheduled_dt = _parse_iso_timestamp(flight['departure_scheduled'])
    if scheduled_dt is None:
        print("Warning: Could not parse scheduled departure time. Defaulting to now.")
        scheduled_dt = datetime.now()

    # arr_hour is one of the selected model inputs; derive it from the scheduled arrival.
    # Without this it would be silently zero-filled by the reindex at the end.
    arrival_dt = _parse_iso_timestamp(flight.get('arrival_scheduled'))

    features = {
        # Calendar features derived from the scheduled departure
        'flight_month': scheduled_dt.month,
        'flight_year': scheduled_dt.year,
        # NOTE(review): weekday() is Monday=0..Sunday=6 — confirm the training pipeline uses
        # the same convention.
        'day_of_week': scheduled_dt.weekday(),
        'week_of_year': scheduled_dt.isocalendar()[1],
        'day_of_month': scheduled_dt.day,
        'quarter': (scheduled_dt.month - 1) // 3 + 1,
        'dep_hour': scheduled_dt.hour,
        'arr_hour': arrival_dt.hour if arrival_dt is not None else 0,

        # The API's flight number is used as a proxy for the Gold table's fl_number.
        'fl_number': _float_or_default(flight.get('flight_number')),

        # Not provided by the API; must be looked up in a static table.
        # 0.0 keeps the vector well-formed until that lookup exists.
        'crs_elapsed_time': 0.0,
        'distance': 0.0,

        # The key delay feature (0.0 when the API reports none)
        dep_delay_name: _float_or_default(flight.get('departure_delay')),

        # Boolean features; holiday flags are not available from the API and default to 0.
        'is_weekend': 1 if scheduled_dt.weekday() >= 5 else 0,
        'is_holiday': 0,
        'is_near_holiday': 0,
        'is_holiday_period': 0,
    }

    # 2. OHE FEATURES (Placeholder)
    # Must be replaced with the fitted StringIndexer/OneHotEncoder output; until then every
    # selected OHE dimension is emitted as 0.0.
    for ohe_col in (name for name in required_features if name.startswith('OHE_DIMENSION')):
        features[ohe_col] = 0.0

    # 3. FINAL SELECTION & ORDERING
    # reindex drops extras, adds any missing required column as 0, and fixes the column order.
    return pd.DataFrame([features]).reindex(columns=required_features, fill_value=0)


# --- 7. DYNAMIC EXECUTION ---
# Pick the feature list and registered model name for the stage chosen in the
# "p_current_stage" widget, build the feature row, then (section 8) score it.
# Get dynamic config based on widget input
current_config = MODEL_MAP[p_current_stage]
REQUIRED_FEATURES = current_config["features"]
MODEL_NAME = current_config["name"]
MODEL_STAGE = "Staging" # Or 'Production' if you promoted it

print(f"\n--- Dynamic Configuration ---")
print(f"Prediction Stage: {p_current_stage}")
print(f"Model to Load: {MODEL_NAME}")
print(f"Feature Vector Size: {len(REQUIRED_FEATURES)}")
print(f"---------------------------\n")

try:
    # Use the corrected feature engineering function
    live_features_df = engineer_live_features_simplified(
        live_flight_df,
        REQUIRED_FEATURES,
        DEP_DELAY_FEATURE_NAME
    )
    print(f"\n--- Engineered Feature Vector Shape: {live_features_df.shape} ---")
    print(f"Features used: {len(live_features_df.columns)}")
    print(f"Features (First 5): {live_features_df.columns.tolist()[:5]}")

except Exception as e:
    # Any failure is reduced to an empty frame so section 8 takes its "no data" branch.
    print(f"Feature Engineering Failed: {e}")
    live_features_df = pd.DataFrame()

# --- 8. MODEL LOADING AND PREDICTION (Now Dynamic) ---
# Three outcomes, each building an HTML card and printing a one-line status:
#   * features available and scoring succeeds -> blue prediction card
#   * features available but load/predict raises -> red "Prediction Failed" card
#   * no features (empty API result or failed feature engineering) -> orange card
# The displayHTML calls are commented out, so the cards are built but not rendered.
# NOTE(review): widget values and exception text are interpolated into the HTML unescaped;
# escape them before enabling displayHTML.

if not live_features_df.empty:
    print(f"\nLoading Model '{MODEL_NAME}' from stage '{MODEL_STAGE}'...")
    try:
        import mlflow.pyfunc

        # Load the model dynamically based on the current stage's config
        # (registry URI of the form models:/<name>/<stage>).
        logged_model_uri = f"models:/{MODEL_NAME}/{MODEL_STAGE}"
        predictor = mlflow.pyfunc.load_model(logged_model_uri)

        # Run prediction on the engineered feature vector
        prediction_result = predictor.predict(live_features_df)

        # NOTE(review): assumes the result is positionally indexable and that element 0 is a
        # number of minutes — confirm against the registered model's output.
        predicted_delay_minutes = prediction_result[0]

        print(f"\n‚úÖ Prediction Successful!")
        print(f"‚úàÔ∏è Predicted Delay: {predicted_delay_minutes:.1f} minutes")

        # Display Final Prediction
        html_output = f"""
        <div style='padding: 20px; border: 2px solid #3b82f6; border-radius: 12px; background-color: #e0f2fe; text-align: center;'>
            <h1 style='color: #1d4ed8; font-size: 1.5rem; margin-bottom: 5px;'>Flight Delay Prediction ({p_current_stage})</h1>
            <p style='color: #0c4a6e; font-size: 2.5rem; font-weight: bold;'>{predicted_delay_minutes:.1f} minutes</p>
            <p style='color: #0c4a6e; font-size: 1.0rem;'>Predicted delay for flight {p_flight_icao} on {p_departure_date}.</p>
        </div>
        """
        # NOTE: displayHTML is a Databricks utility, here it is used conceptually.
        # displayHTML(html_output)
        print("\n--- HTML Dashboard Output Generated ---")

    except Exception as e:
        # Covers the mlflow import, the registry lookup, predict(), and result formatting.
        print(f"MLflow Model Loading or Prediction Failed: {e}")
        error_html = f"""
        <div style='padding: 20px; border: 2px solid #ef4444; border-radius: 12px; background-color: #fee2e2; text-align: center;'>
            <h1 style='color: #b91c1c; font-size: 1.5rem; margin-bottom: 5px;'>Prediction Failed</h1>
            <p style='color: #7f1d1d; font-size: 1.0rem;'>Failed to load or run model **{MODEL_NAME}**. Check model registry and feature alignment. Error: {str(e)}</p>
        </div>
        """
        # displayHTML(error_html)
        print("\n--- HTML ERROR: Prediction Failed ---")
else:
    # Reached when the API returned nothing or feature engineering raised above.
    error_html = f"""
    <div style='padding: 20px; border: 2px solid #fb923c; border-radius: 12px; background-color: #fff7ed; text-align: center;'>
        <h1 style='color: #c2410c; font-size: 1.5rem; margin-bottom: 5px;'>Data Retrieval Failed</h1>
        <p style='color: #7c2d12; font-size: 1.0rem;'>Could not fetch flight data from the API or feature engineering failed. Check API inputs and connection.</p>
    </div>
    """
    # displayHTML(error_html)
    print("\n--- HTML ERROR: Data Retrieval Failed ---")

In [0]:
# Cell 1: Imports and Configuration
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, udf
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, VectorUDT
import mlflow
import mlflow.spark
import json
from typing import Dict, List, Tuple, Optional

# Initialize Spark
# Bind the notebook-wide `spark` handle; getOrCreate() returns the existing session when one
# is already running instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

print("‚úÖ Imports complete")

In [0]:
# Cell 2: Configuration Class
class APIConfig:
    """Configuration for Aviation Stack API and MLflow

    All values are class attributes, so they are evaluated once, when this class body runs.
    That includes the secret lookup for API_KEY.
    """

    # API Configuration
    # Access key pulled from the Databricks secret scope (change scope/key to match your workspace).
    API_KEY = dbutils.secrets.get(scope="my-secrets", key="aviation_stack_api")
    BASE_URL = "https://api.aviationstack.com/v1"

    # MLflow Configuration
    # Passed to mlflow.set_tracking_uri / mlflow.set_registry_uri by ModelLoader.
    MLFLOW_TRACKING_URI = "databricks"
    MLFLOW_REGISTRY_URI = "databricks"
    EXPERIMENT_NAME = "/Shared/Flightmasters_Optimized_Experiments"

    # Model Names (from your experiments)
    # These match the artifact paths ModelLoader builds as "model_<key>" in lower case.
    MODEL_RF_PRE = "model_rf_pre"
    MODEL_GBT_PRE = "model_gbt_pre"
    MODEL_RF_IN = "model_rf_in"
    MODEL_GBT_IN = "model_gbt_in"

    # Feature Engineering
    # Positions (into the full feature vector) of the selected features, in selection order.
    # The list has 40 entries, matching TOP_K_FEATURES.
    SELECTED_INDICES = [9, 10, 11, 47, 23, 817, 42, 43, 451, 3, 0, 814, 1, 30, 444,
                        19, 36, 17, 27, 8, 28, 37, 484, 16, 7, 434, 5, 6, 54, 35,
                        24, 52, 57, 2, 816, 38, 15, 50, 46, 73]

    # Position of dep_delay in the full vector; agrees with the numerical ordering used by
    # FlightFeatureEngineer.create_feature_vector (dep_delay is its 12th entry).
    # NOTE(review): IN_FLIGHT_FEATURES in the first cell maps the leading selected index (9) to
    # 'dep_delay' and 11 to 'arr_hour', which contradicts this value — confirm the true
    # column order against the Gold table pipeline.
    DEP_DELAY_ORIGINAL_INDEX = 11  # Position in numerical features
    TOP_K_FEATURES = 40

    # Delta Table Paths
    PREDICTIONS_TABLE = "default.flight_predictions"
    PREDICTIONS_PATH = "/Volumes/workspace/default/ds-capstone/predictions/api_predictions"

    # Gold Table (for feature engineering reference)
    # Read by FlightFeatureEngineer.__init__ to discover the feature-vector length.
    GOLD_TABLE = "default.gold_ml_features_experimental"

print("‚úÖ Configuration loaded")

In [0]:
# Cell 3: Aviation Stack API Client
class AviationStackClient:
    """Small client for the Aviationstack ``/flights`` endpoint.

    Failures never raise out of ``search_flight``. They are printed and reported as
    ``{'data': [], 'error': <message>}`` with the access key masked.
    """

    def __init__(self, api_key: str, base_url: str):
        """Store the access key and the API root (e.g. ``https://api.aviationstack.com/v1``)."""
        self.api_key = api_key
        self.base_url = base_url

    def _redact(self, text: str) -> str:
        """Mask the access key in ``text``.

        requests embeds the full request URL, query string included, in its error messages,
        so anything derived from an exception has to be scrubbed before it is shown or returned.
        """
        secret = str(self.api_key) if self.api_key else ""
        return text.replace(secret, "***") if secret else text

    def search_flight(self, flight_iata: Optional[str] = None, flight_date: Optional[str] = None,
                      dep_iata: Optional[str] = None, arr_iata: Optional[str] = None) -> Dict:
        """
        Search for flight information

        Args:
            flight_iata: Flight number (e.g., 'AA100')
            flight_date: Date in YYYY-MM-DD format
            dep_iata: Departure airport code (e.g., 'JFK')
            arr_iata: Arrival airport code (e.g., 'LAX')

        Returns:
            The decoded JSON payload, with ``'data'`` guaranteed to be a list, on success.
            On a transport/HTTP/JSON failure or an API-reported error:
            ``{'data': [], 'error': <message string>}``.
        """
        endpoint = f"{self.base_url}/flights"

        params = {
            'access_key': self.api_key,
            'limit': 100
        }

        # Add only the search parameters that were actually supplied
        optional_filters = {
            'flight_iata': flight_iata,
            'flight_date': flight_date,
            'dep_iata': dep_iata,
            'arr_iata': arr_iata,
        }
        params.update({name: value for name, value in optional_filters.items() if value})

        try:
            response = requests.get(endpoint, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            message = self._redact(str(e))
            print(f"‚ùå API Error: {message}")
            return {'data': [], 'error': message}

        if not isinstance(data, dict):
            message = "unexpected response payload"
            print(f"‚ùå API Error: {message}")
            return {'data': [], 'error': message}

        # An "error" entry in a decoded body is an application-level failure, not a success.
        if 'error' in data:
            error = data['error']
            detail = error.get('message', error) if isinstance(error, dict) else error
            message = self._redact(str(detail))
            print(f"‚ùå API Error: {message}")
            return {'data': [], 'error': message}

        # Normalise a missing or null "data" field so callers can always iterate it.
        flights = data.get('data') or []
        data['data'] = flights

        print(f"‚úÖ API call successful: {len(flights)} flights found")
        return data

    def get_flight_by_number(self, flight_number: str, date: Optional[str] = None) -> Optional[Dict]:
        """Return the first flight matching ``flight_number`` on ``date`` (default: today), or None."""
        if date is None:
            date = datetime.now().strftime('%Y-%m-%d')

        result = self.search_flight(flight_iata=flight_number, flight_date=date)
        flights = result.get('data') or []

        return flights[0] if flights else None

    def get_route_flights(self, dep_iata: str, arr_iata: str, date: Optional[str] = None) -> List[Dict]:
        """Return all flights from ``dep_iata`` to ``arr_iata`` on ``date`` (default: today)."""
        if date is None:
            date = datetime.now().strftime('%Y-%m-%d')

        result = self.search_flight(dep_iata=dep_iata, arr_iata=arr_iata, flight_date=date)
        return result.get('data') or []

print("‚úÖ Aviation Stack Client defined")

In [0]:
# Cell 4: Feature Engineering for API Data
class FlightFeatureEngineer:
    """
    Feature engineering to match Gold table structure (819 features)
    Must align with your Gold table feature engineering pipeline

    NOTE(review): the numerical ordering used in create_feature_vector puts dep_hour,
    arr_hour, crs_elapsed_time, distance, dep_delay at positions 7-11, while the first
    cell's IN_FLIGHT_FEATURES implies crs_elapsed_time=7, distance=8, dep_delay=9,
    dep_hour=10, arr_hour=11. Confirm the real order against the Gold pipeline.
    """

    def __init__(self):
        """Read the feature-vector length from the Gold table, falling back to 819."""
        # Load Gold table schema for reference
        try:
            self.gold_sample = spark.table(APIConfig.GOLD_TABLE).limit(1)
            self.total_features = self.gold_sample.select("features").first().features.size
            print(f"‚úÖ Gold table features: {self.total_features}")
        except Exception as e:
            print(f"‚ö†Ô∏è Could not load Gold table: {e}")
            self.total_features = 819

    @staticmethod
    def _parse_timestamp(value) -> Optional[pd.Timestamp]:
        """Parse one API timestamp; return None when it is empty, invalid, or NaT."""
        if not value:
            return None
        try:
            parsed = pd.to_datetime(value)
        except (ValueError, TypeError, OverflowError):
            return None
        return None if pd.isna(parsed) else parsed

    def parse_api_flight(self, flight_data: Dict) -> Dict:
        """Extract relevant fields from API response

        Args:
            flight_data: One element of the API's ``data`` list. The nested ``flight``,
                ``airline``, ``departure`` and ``arrival`` sections may be missing or null.

        Returns:
            Flat dict of identifiers, temporal/boolean/numerical features and metadata.
            Temporal features are 0 when the scheduled departure cannot be parsed.
        """

        flight_info = flight_data.get('flight', {}) or {}
        airline_info = flight_data.get('airline', {}) or {}
        departure_info = flight_data.get('departure', {}) or {}
        arrival_info = flight_data.get('arrival', {}) or {}

        # Parse scheduled times independently, so one bad value does not discard the other.
        dep_scheduled = departure_info.get('scheduled', '')
        arr_scheduled = arrival_info.get('scheduled', '')
        dep_dt = self._parse_timestamp(dep_scheduled)
        arr_dt = self._parse_timestamp(arr_scheduled)

        # Extract time features
        if dep_dt is not None:
            flight_month = dep_dt.month
            flight_year = dep_dt.year
            day_of_week = dep_dt.dayofweek
            week_of_year = dep_dt.isocalendar()[1]
            day_of_month = dep_dt.day
            quarter = (dep_dt.month - 1) // 3 + 1
            dep_hour = dep_dt.hour
            is_weekend = 1 if dep_dt.dayofweek >= 5 else 0
        else:
            flight_month = flight_year = day_of_week = week_of_year = 0
            day_of_month = quarter = dep_hour = is_weekend = 0

        arr_hour = arr_dt.hour if arr_dt is not None else 0

        # Calculate season
        season = self._get_season(flight_month)

        # Get delay information (a null/missing departure delay counts as 0)
        dep_delay = departure_info.get('delay', 0) or 0
        arr_delay = arrival_info.get('delay')

        # Calculate elapsed time between the two scheduled timestamps
        if dep_dt is not None and arr_dt is not None:
            crs_elapsed_time = (arr_dt - dep_dt).total_seconds() / 60  # minutes
        else:
            crs_elapsed_time = 0

        # Holiday detection (simplified - you may want to enhance this)
        is_holiday, is_near_holiday, is_holiday_period = self._detect_holidays(dep_dt)

        return {
            # Identifiers
            'flight_number': flight_info.get('number', ''),
            'flight_iata': flight_info.get('iata', ''),
            'airline_name': airline_info.get('name', ''),
            'airline_code': airline_info.get('iata', ''),
            'origin_airport_code': departure_info.get('iata', ''),
            'destination_airport_code': arrival_info.get('iata', ''),

            # Temporal features
            'flight_month': flight_month,
            'flight_year': flight_year,
            'day_of_week': day_of_week,
            'week_of_year': week_of_year,
            'day_of_month': day_of_month,
            'quarter': quarter,
            'season': season,
            'dep_hour': dep_hour,
            'arr_hour': arr_hour,

            # Boolean features
            'is_weekend': is_weekend,
            'is_holiday': is_holiday,
            'is_near_holiday': is_near_holiday,
            'is_holiday_period': is_holiday_period,

            # Numerical features
            'fl_number': int(flight_info.get('number', 0) or 0),
            'crs_elapsed_time': crs_elapsed_time,
            'distance': 0,  # Would need additional API or database lookup
            'dep_delay': float(dep_delay),
            # Only a missing delay maps to None; an on-time arrival (0) stays 0.0.
            'arr_delay': float(arr_delay) if arr_delay is not None else None,

            # Metadata
            'flight_date': flight_data.get('flight_date', ''),
            'flight_status': flight_data.get('flight_status', ''),
            'scheduled_departure': dep_scheduled,
            'scheduled_arrival': arr_scheduled
        }

    def _get_season(self, month: int) -> str:
        """Map month to season (any value outside Dec-Aug, including 0, yields 'Fall')."""
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Fall'

    def _detect_holidays(self, dt: Optional[pd.Timestamp]) -> Tuple[int, int, int]:
        """
        Simplified holiday detection
        Returns: (is_holiday, is_near_holiday, is_holiday_period)

        The comparison is done on calendar dates: any timezone is dropped (keeping the
        wall-clock date) and the time of day is ignored.
        """
        if dt is None:
            return (0, 0, 0)

        # API timestamps carry a UTC offset; a tz-aware value cannot be subtracted from the
        # tz-naive holiday dates built below, so strip the zone and time first.
        day = pd.Timestamp(dt)
        if day.tzinfo is not None:
            day = day.tz_localize(None)
        day = day.normalize()

        # Major US holidays (simplified)
        holidays = [
            (1, 1),   # New Year's Day
            (7, 4),   # Independence Day
            (11, 11), # Veterans Day
            (12, 25), # Christmas
        ]

        is_holiday = 1 if (day.month, day.day) in holidays else 0

        # Near holiday (within 3 days, excluding the holiday itself). Adjacent years are
        # checked too so that late December counts as near the following New Year's Day.
        is_near_holiday = 0
        for year in (day.year - 1, day.year, day.year + 1):
            for h_month, h_day in holidays:
                holiday_dt = pd.Timestamp(year=year, month=h_month, day=h_day)
                days_diff = abs((day - holiday_dt).days)
                if 0 < days_diff <= 3:
                    is_near_holiday = 1
                    break
            if is_near_holiday:
                break

        # Holiday period (Nov-Dec)
        is_holiday_period = 1 if day.month in [11, 12] else 0

        return (is_holiday, is_near_holiday, is_holiday_period)

    def create_feature_vector(self, parsed_flight: Dict) -> np.ndarray:
        """
        Create feature vector matching Gold table structure (819 features)
        This is a simplified version - ideally you'd use the same pipeline

        Returns:
            1-D float array of length ``self.total_features``: the 16 base features
            followed by zeros standing in for the one-hot-encoded categoricals.
        """

        # Numerical features (in order from Gold table)
        numerical = [
            parsed_flight['flight_month'],
            parsed_flight['flight_year'],
            parsed_flight['day_of_week'],
            parsed_flight['week_of_year'],
            parsed_flight['day_of_month'],
            parsed_flight['quarter'],
            parsed_flight['fl_number'],
            parsed_flight['dep_hour'],
            parsed_flight['arr_hour'],
            parsed_flight['crs_elapsed_time'],
            parsed_flight['distance'],
            parsed_flight['dep_delay']
        ]

        # Boolean features
        boolean = [
            parsed_flight['is_weekend'],
            parsed_flight['is_holiday'],
            parsed_flight['is_near_holiday'],
            parsed_flight['is_holiday_period']
        ]

        # Combine numerical and boolean
        base_features = numerical + boolean  # 16 features

        # Categorical features would be one-hot encoded. Without the StringIndexer and
        # OneHotEncoder fitted on the Gold table, the remaining positions are zero placeholders.
        categorical_placeholder = [0.0] * (self.total_features - len(base_features))

        full_vector = base_features + categorical_placeholder

        return np.array(full_vector, dtype=float)

print("‚úÖ Feature Engineer defined")

In [0]:
# Cell 5: Model Loader
class ModelLoader:
    """Fetch trained Spark models from MLflow runs and keep them in memory by key."""

    def __init__(self):
        """Point MLflow at the configured tracking/registry URIs and start with no models."""
        mlflow.set_tracking_uri(APIConfig.MLFLOW_TRACKING_URI)
        mlflow.set_registry_uri(APIConfig.MLFLOW_REGISTRY_URI)

        self.models = {}
        self.model_info = {}

    def load_best_models(self, experiment_name: str):
        """Load, for each model type, the top run by ``metrics.auc_roc`` in the experiment.

        Runs are matched to a model type by substring of their run name. A model that
        fails to load is reported and skipped; it does not abort the others.

        Returns:
            True when at least one model is held after the call, otherwise False
            (experiment missing, no runs, nothing loadable, or an unexpected error).
        """

        print(f"\nüîç Searching for models in: {experiment_name}")

        # (key used in self.models, run-name fragment identifying that model type)
        wanted = (
            ('RF_Pre', 'RF_Pre_Departure'),
            ('GBT_Pre', 'GBT_Pre_Departure'),
            ('RF_In', 'RF_In_Flight'),
            ('GBT_In', 'GBT_In_Flight'),
        )

        try:
            experiment = mlflow.get_experiment_by_name(experiment_name)
            if not experiment:
                print(f"‚ùå Experiment not found: {experiment_name}")
                return False

            # Every run of the experiment, best AUC-ROC first
            runs = mlflow.search_runs(
                experiment_ids=[experiment.experiment_id],
                filter_string="",
                order_by=["metrics.auc_roc DESC"]
            )

            if runs.empty:
                print("‚ùå No runs found in experiment")
                return False

            print(f"‚úÖ Found {len(runs)} runs")

            for model_key, run_name_pattern in wanted:
                name_matches = runs['tags.mlflow.runName'].str.contains(run_name_pattern, na=False)
                candidates = runs[name_matches]
                if candidates.empty:
                    continue

                # Rows are already sorted, so the first candidate is the best run.
                top_run = candidates.iloc[0]
                run_id = top_run['run_id']
                auc_roc = top_run['metrics.auc_roc']

                try:
                    artifact_uri = f"runs:/{run_id}/model_{model_key.lower()}"
                    loaded = mlflow.spark.load_model(artifact_uri)

                    self.models[model_key] = loaded
                    self.model_info[model_key] = {
                        'run_id': run_id,
                        'auc_roc': auc_roc,
                        'run_name': top_run['tags.mlflow.runName']
                    }

                    print(f"‚úÖ Loaded {model_key}: AUC-ROC = {auc_roc:.4f}")

                except Exception as e:
                    print(f"‚ùå Error loading {model_key}: {e}")

            return bool(self.models)

        except Exception as e:
            print(f"‚ùå Error loading models: {e}")
            return False

    def get_model(self, model_type: str):
        """Return the loaded model stored under ``model_type``, or None if absent."""
        return self.models.get(model_type)

    def get_model_info(self, model_type: str) -> Dict:
        """Return the run metadata stored under ``model_type``, or an empty dict if absent."""
        return self.model_info.get(model_type, {})

print("‚úÖ Model Loader defined")

In [0]:
# Cell 6: Prediction Engine
class FlightDelayPredictor:
    """Score flights with the loaded Random Forest / GBT models and ensemble them.

    The predictor asks the feature engineer for a full feature vector, narrows it
    to the configured feature indices, and runs both models of the requested stage
    (pre-departure or in-flight). Failures are reported as dicts carrying an
    ``'error'`` key rather than raised, so callers can process batches uniformly.
    """

    # Annotations are quoted because the collaborator classes live in other cells.
    def __init__(self, model_loader: "ModelLoader", feature_engineer: "FlightFeatureEngineer"):
        """Store collaborators and read the feature-selection settings from APIConfig."""
        self.model_loader = model_loader
        self.feature_engineer = feature_engineer
        self.selected_indices = APIConfig.SELECTED_INDICES
        self.dep_delay_index = APIConfig.DEP_DELAY_ORIGINAL_INDEX

    def _select_features(self, feature_vector: np.ndarray, remove_dep_delay: bool = False) -> np.ndarray:
        """Pick the configured feature positions out of the full vector.

        Args:
            feature_vector: Full feature vector, indexed by original feature position.
            remove_dep_delay: When True, drop the dep_delay entry from the selection
                (used for pre-departure predictions).

        Returns:
            The selected features, in the order given by ``self.selected_indices``.
        """
        # Normalise to a list so tuples and NumPy arrays work too; ``.index()``
        # below only exists on lists/tuples.
        indices = list(self.selected_indices)
        selected = feature_vector[indices]

        if remove_dep_delay and self.dep_delay_index in indices:
            # Position of dep_delay *within the selection*, not in the full vector.
            selected = np.delete(selected, indices.index(self.dep_delay_index))

        return selected

    def predict_single_flight(self, parsed_flight: Dict, use_dep_delay: bool = False) -> Dict:
        """Make a prediction for a single flight.

        Args:
            parsed_flight: Parsed flight data from the API.
            use_dep_delay: If True, use the in-flight models (with dep_delay);
                if False, use the pre-departure models (without dep_delay).

        Returns:
            On success, a dict with flight details, the per-model and ensemble
            predictions/probabilities, and model metadata. On failure, a dict with
            an ``'error'`` message and ``'prediction_type'`` (plus ``'flight_iata'``
            when scoring itself failed).
        """
        if use_dep_delay:
            rf_key, gbt_key, prediction_type = 'RF_In', 'GBT_In', 'in_flight'
        else:
            rf_key, gbt_key, prediction_type = 'RF_Pre', 'GBT_Pre', 'pre_departure'

        # Check model availability first so no feature work is done when scoring is impossible.
        model_rf = self.model_loader.get_model(rf_key)
        model_gbt = self.model_loader.get_model(gbt_key)
        if model_rf is None or model_gbt is None:
            return {
                'error': f'Models not loaded for {prediction_type}',
                'prediction_type': prediction_type
            }

        # Pre-departure models were trained without dep_delay, so it is dropped for them.
        full_vector = self.feature_engineer.create_feature_vector(parsed_flight)
        features = self._select_features(full_vector, remove_dep_delay=not use_dep_delay)

        try:
            # Building the single-row DataFrame is inside the try so that Spark-side
            # failures are reported the same way as scoring failures.
            features_vector = Vectors.dense(features.tolist())
            df = spark.createDataFrame(
                [(features_vector,)],
                schema=StructType([
                    StructField("features", VectorUDT(), nullable=False)
                ])
            )

            pred_rf = model_rf.transform(df).select("prediction", "probability").first()
            pred_gbt = model_gbt.transform(df).select("prediction", "probability").first()

            # Index 1 of the probability vector is the "delayed" class.
            prob_rf = float(pred_rf['probability'][1])
            prob_gbt = float(pred_gbt['probability'][1])

            # Ensemble prediction: unweighted average, thresholded at 0.5.
            ensemble_prob = (prob_rf + prob_gbt) / 2
            ensemble_prediction = 1 if ensemble_prob >= 0.5 else 0

            return {
                'flight_iata': parsed_flight['flight_iata'],
                'airline_name': parsed_flight['airline_name'],
                'origin': parsed_flight['origin_airport_code'],
                'destination': parsed_flight['destination_airport_code'],
                'scheduled_departure': parsed_flight['scheduled_departure'],
                'prediction_type': prediction_type,
                'rf_prediction': int(pred_rf['prediction']),
                'rf_delay_probability': prob_rf,
                'gbt_prediction': int(pred_gbt['prediction']),
                'gbt_delay_probability': prob_gbt,
                'ensemble_prediction': ensemble_prediction,
                'ensemble_delay_probability': ensemble_prob,
                'actual_dep_delay': parsed_flight.get('dep_delay'),
                'actual_arr_delay': parsed_flight.get('arr_delay'),
                'flight_status': parsed_flight['flight_status'],
                'prediction_timestamp': datetime.now().isoformat(),
                'model_rf_info': self.model_loader.get_model_info(rf_key),
                'model_gbt_info': self.model_loader.get_model_info(gbt_key)
            }

        except Exception as e:
            # Use .get() here: if the failure was a missing 'flight_iata' key, indexing
            # again would raise from inside the handler and hide the real error.
            return {
                'error': f'Prediction failed: {str(e)}',
                'flight_iata': parsed_flight.get('flight_iata'),
                'prediction_type': prediction_type
            }

    def predict_multiple_flights(self, api_flights: List[Dict], use_dep_delay: bool = False) -> List[Dict]:
        """Parse and score each raw API flight record, preserving input order.

        Args:
            api_flights: Raw flight records as returned by the API client.
            use_dep_delay: Forwarded to ``predict_single_flight``.

        Returns:
            One result dict per input flight (successful predictions and error
            dicts alike).
        """
        return [
            self.predict_single_flight(self.feature_engineer.parse_api_flight(flight_data), use_dep_delay)
            for flight_data in api_flights
        ]

# Progress marker: only reached if the FlightDelayPredictor definition in this cell executed without raising.
print("‚úÖ Prediction Engine defined")

In [0]:
# Cell 7: Results Storage
class PredictionStorage:
    """Store predictions to Delta Lake for dashboard consumption."""

    def __init__(self):
        """Read the target table name and storage path from APIConfig."""
        self.table_name = APIConfig.PREDICTIONS_TABLE
        self.table_path = APIConfig.PREDICTIONS_PATH

    def save_predictions(self, predictions: List[Dict]) -> bool:
        """Append successful predictions to the Delta table.

        Result dicts carrying an ``'error'`` key are skipped: they have a different
        set of keys than successful predictions (no ``prediction_timestamp``, no
        probabilities), so writing them would give the table an inconsistent schema.

        Args:
            predictions: Result dicts as produced by the prediction engine.

        Returns:
            True when at least one prediction was written; False when there was
            nothing to write or the write failed.
        """
        if not predictions:
            print("‚ö†Ô∏è No predictions to save")
            return False

        storable = [p for p in predictions if isinstance(p, dict) and 'error' not in p]
        skipped = len(predictions) - len(storable)
        if skipped:
            print(f"‚ö†Ô∏è Skipping {skipped} failed prediction(s); only successful results are stored")
        if not storable:
            print("‚ö†Ô∏è No predictions to save")
            return False

        try:
            # pandas gives a uniform column set before handing the rows to Spark.
            df = pd.DataFrame(storable)
            spark_df = spark.createDataFrame(df)

            # Add metadata
            spark_df = spark_df.withColumn("ingestion_timestamp", lit(datetime.now()))

            print(f"\nüíæ Saving {len(storable)} predictions...")

            # Best effort: pre-create the parent directory. A failure here is reported
            # but not fatal; the write below is still attempted.
            try:
                dbutils.fs.mkdirs(self.table_path.rsplit('/', 1)[0])
            except Exception as e:
                print(f"‚ö†Ô∏è Could not pre-create parent directory: {e}")

            # Write to Delta (append mode for dashboard)
            spark_df.write.format("delta").mode("append").save(self.table_path)

            # Register the table in the metastore if it is not there yet. Data is
            # already written at this point, so a failure is only a warning.
            try:
                spark.sql(f"""
                    CREATE TABLE IF NOT EXISTS {self.table_name}
                    USING DELTA
                    LOCATION '{self.table_path}'
                """)
            except Exception as e:
                print(f"‚ö†Ô∏è Table registration warning: {e}")

            print(f"‚úÖ Predictions saved to {self.table_name}")
            return True

        except Exception as e:
            print(f"‚ùå Error saving predictions: {e}")
            return False

    def get_recent_predictions(self, limit: int = 100):
        """Retrieve the most recent predictions for verification.

        Args:
            limit: Maximum number of rows to return.

        Returns:
            A Spark DataFrame ordered by ``prediction_timestamp`` descending, or
            None if the table could not be read.
        """
        try:
            df = spark.table(self.table_name).orderBy(col("prediction_timestamp").desc()).limit(limit)
            return df
        except Exception as e:
            print(f"‚ùå Error reading predictions: {e}")
            return None

# Progress marker: only reached if the PredictionStorage definition in this cell executed without raising.
print("‚úÖ Prediction Storage defined")

In [0]:
# Cell 8: Main Prediction Pipeline
class FlightDelayPipeline:
    """Main orchestration class: API lookup -> feature engineering -> prediction -> storage."""

    def __init__(self):
        """Build all components and load the trained models.

        If model loading fails, an error is printed and ``self.predictor`` stays
        ``None``; the ``predict_*`` methods then report the problem instead of
        crashing.
        """
        print("\n" + "="*80)
        print("FLIGHT DELAY PREDICTION PIPELINE")
        print("="*80)

        # Defined before anything can fail so the attribute always exists.
        self.predictor = None

        # Initialize components
        self.api_client = AviationStackClient(APIConfig.API_KEY, APIConfig.BASE_URL)
        self.feature_engineer = FlightFeatureEngineer()
        self.model_loader = ModelLoader()
        self.storage = PredictionStorage()

        # Load models
        print("\nüì¶ Loading trained models...")
        success = self.model_loader.load_best_models(APIConfig.EXPERIMENT_NAME)

        if not success:
            print("‚ùå Failed to load models")
            return

        # Initialize predictor
        self.predictor = FlightDelayPredictor(self.model_loader, self.feature_engineer)

        print("\n‚úÖ Pipeline initialized successfully")

    def _predictor_ready(self) -> bool:
        """Return True when a predictor is available; otherwise print why not."""
        if getattr(self, 'predictor', None) is None:
            print("‚ùå Pipeline not initialized: models failed to load")
            return False
        return True

    def predict_flight_by_number(self, flight_number: str, date: Optional[str] = None,
                                  use_dep_delay: bool = False) -> Optional[Dict]:
        """
        Predict delay for a specific flight number

        Args:
            flight_number: Flight number (e.g., 'AA100')
            date: Date in YYYY-MM-DD format (default: today)
            use_dep_delay: Use in-flight model (True) or pre-departure model (False)

        Returns:
            The prediction dict (which may be an error result), or None when the
            pipeline has no predictor or the flight was not found.
        """
        # Checked before the API call so no request is spent when scoring is impossible.
        if not self._predictor_ready():
            return None

        print(f"\nüîç Searching for flight: {flight_number}")

        # Get flight from API
        flight_data = self.api_client.get_flight_by_number(flight_number, date)

        if not flight_data:
            print(f"‚ùå Flight {flight_number} not found")
            return None

        # Parse and predict
        parsed = self.feature_engineer.parse_api_flight(flight_data)
        prediction = self.predictor.predict_single_flight(parsed, use_dep_delay)

        # Only successful predictions are persisted; error results have a different
        # shape and would not fit the predictions table.
        if 'error' not in prediction:
            self.storage.save_predictions([prediction])

        return prediction

    def predict_route(self, dep_iata: str, arr_iata: str, date: Optional[str] = None,
                     use_dep_delay: bool = False) -> List[Dict]:
        """
        Predict delays for all flights on a route

        Args:
            dep_iata: Departure airport code
            arr_iata: Arrival airport code
            date: Date in YYYY-MM-DD format (default: today)
            use_dep_delay: Use in-flight model (True) or pre-departure model (False)

        Returns:
            One result dict per flight found (successful and error results alike);
            an empty list when the pipeline has no predictor or no flights were found.
        """
        if not self._predictor_ready():
            return []

        print(f"\nüîç Searching for flights: {dep_iata} ‚Üí {arr_iata}")

        # Get flights from API
        flights = self.api_client.get_route_flights(dep_iata, arr_iata, date)

        if not flights:
            print(f"‚ùå No flights found for route {dep_iata} ‚Üí {arr_iata}")
            return []

        # Make predictions
        predictions = self.predictor.predict_multiple_flights(flights, use_dep_delay)

        # Persist only the successful ones; the caller still receives every result.
        successful = [p for p in predictions if 'error' not in p]
        if successful:
            self.storage.save_predictions(successful)

        return predictions

    def display_prediction(self, prediction: Dict):
        """Pretty print a prediction result (or its error message)."""

        if 'error' in prediction:
            print(f"\n‚ùå {prediction['error']}")
            return

        print("\n" + "="*80)
        print("PREDICTION RESULTS")
        print("="*80)

        print("\n‚úàÔ∏è Flight Information:")
        print(f"   Flight: {prediction['flight_iata']}")
        print(f"   Airline: {prediction['airline_name']}")
        print(f"   Route: {prediction['origin']} ‚Üí {prediction['destination']}")
        print(f"   Scheduled: {prediction['scheduled_departure']}")
        print(f"   Status: {prediction['flight_status']}")

        print(f"\nü§ñ Prediction Type: {prediction['prediction_type'].upper()}")

        print("\nüìä Model Predictions:")
        print("   Random Forest:")
        print(f"      Delay Prediction: {'DELAYED' if prediction['rf_prediction'] == 1 else 'ON-TIME'}")
        print(f"      Delay Probability: {prediction['rf_delay_probability']*100:.1f}%")

        print("\n   Gradient Boosted Trees:")
        print(f"      Delay Prediction: {'DELAYED' if prediction['gbt_prediction'] == 1 else 'ON-TIME'}")
        print(f"      Delay Probability: {prediction['gbt_delay_probability']*100:.1f}%")

        print("\n   üéØ ENSEMBLE (Recommended):")
        print(f"      Delay Prediction: {'DELAYED' if prediction['ensemble_prediction'] == 1 else 'ON-TIME'}")
        print(f"      Delay Probability: {prediction['ensemble_delay_probability']*100:.1f}%")

        if prediction.get('actual_dep_delay') is not None:
            print(f"\nüìç Actual Departure Delay: {prediction['actual_dep_delay']:.0f} minutes")

        # Confidence is the distance of the ensemble probability from the 0.5 decision boundary.
        margin = abs(prediction['ensemble_delay_probability'] - 0.5)
        if margin > 0.3:
            confidence = "HIGH"
        elif margin > 0.15:
            confidence = "MEDIUM"
        else:
            confidence = "LOW"
        print(f"\nüéöÔ∏è Confidence Level: {confidence}")

        print("\n" + "="*80)

# Progress marker: only reached if the FlightDelayPipeline definition in this cell executed without raising.
print("‚úÖ Pipeline class defined")

In [0]:
# Cell 9: Initialize Pipeline
# Create pipeline instance
# NOTE: the constructor does real work: it builds the API client, feature engineer,
# model loader and storage, then loads the trained models via
# load_best_models(APIConfig.EXPERIMENT_NAME). If loading fails it prints an error and
# returns early without creating a usable predictor, so check this cell's output
# before running the example cells.
pipeline = FlightDelayPipeline()

In [0]:
# Cell 10: Example 1 - Predict Single Flight (Pre-Departure)
# Example: Predict a specific flight before departure
# Requires `pipeline` from the "Initialize Pipeline" cell.

flight_number = "AA100"  # Replace with actual flight number
date = "2025-11-30"  # Replace with desired date (YYYY-MM-DD)

# Looks the flight up through the API client and scores it with the
# pre-departure models (RF_Pre / GBT_Pre), i.e. with dep_delay removed from the features.
prediction = pipeline.predict_flight_by_number(
    flight_number=flight_number,
    date=date,
    use_dep_delay=False  # Pre-departure prediction
)

# predict_flight_by_number returns None when the flight is not found.
if prediction:
    pipeline.display_prediction(prediction)

In [0]:
# Cell 11: Example 2 - Predict Single Flight (In-Flight)
# Example: Predict with departure delay information (in-flight)
# Requires `pipeline` from the "Initialize Pipeline" cell.
# NOTE: this cell rebinds `flight_number`, `date` and `prediction` from the previous example.

flight_number = "AA100"
date = "2025-11-30"

# Same lookup as the pre-departure example, but scored with the in-flight
# models (RF_In / GBT_In), which keep dep_delay among the features.
prediction = pipeline.predict_flight_by_number(
    flight_number=flight_number,
    date=date,
    use_dep_delay=True  # In-flight prediction
)

# predict_flight_by_number returns None when the flight is not found.
if prediction:
    pipeline.display_prediction(prediction)