In [0]:
# Load the gold-layer flight feature table.
# NOTE: the training cell further down loads this same table into `df` again.
df = spark.read.table(
    "workspace.flight_delay.gold_flight_features"
)


# Flight Delay Prediction – ML Model Training

## Objective
Train a machine learning model that predicts, before departure, whether a flight will be delayed. Only information available before the flight departs should be used as model input.

## Problem Type
Binary classification

## Target Variable
is_delayed (1 = delayed, 0 = on-time)

## Success Metric
AUC (area under the ROC curve), measured on the held-out 20% test split


In [0]:
import os

import mlflow
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Required on Serverless / Unity Catalog compute: tell MLflow which temp
# directory to use when it stages the Spark model (a UC Volume path here).
os.environ.update(
    MLFLOW_DFS_TMP="/Volumes/workspace/flight_delay/flight_delay_data/mlflow_tmp"
)

# Read the gold-layer feature table that feeds training and evaluation.
df = spark.read.table("workspace.flight_delay.gold_flight_features")

# Categorical inputs (string-valued); the pipeline string-indexes these.
categorical_cols = ["AIRLINE", "ORIGIN", "DEST"]

# Numeric inputs. Only columns that can be known before departure belong here.
#
# The DELAY_DUE_WEATHER / DELAY_DUE_NAS / DELAY_DUE_CARRIER /
# DELAY_DUE_LATE_AIRCRAFT columns were removed on purpose: they are per-cause
# delay minutes, which exist only once a flight has already been delayed.
# Feeding them to the model leaks the `is_delayed` label (inflating AUC) and
# they cannot be supplied at prediction time for a flight that has not left.
#
# NOTE(review): `dep_hour` / `is_peak_hour` are assumed to be derived from the
# *scheduled* departure time - confirm against the gold table definition.
numeric_cols = [
    "DISTANCE",
    "dep_hour",
    "month",
    "day_of_week",
    "is_peak_hour",
]

# --- Feature stages ---------------------------------------------------------
# Map each categorical string column to a numeric index.
# handleInvalid="keep" sends null / previously unseen categories to an extra
# bucket; "skip" would silently drop those rows at transform time, so the test
# AUC would be computed on a filtered subset of the test data.
indexers = [
    StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep")
    for c in categorical_cols
]

# One-hot encode the indices. StringIndexer assigns indices by label frequency,
# so the raw index has no numeric meaning; passing it directly to a linear
# model would impose an arbitrary ordering on airlines and airports.
encoder = OneHotEncoder(
    inputCols=[f"{c}_idx" for c in categorical_cols],
    outputCols=[f"{c}_ohe" for c in categorical_cols],
)

# Concatenate encoded categoricals and numeric columns into one feature vector.
assembler = VectorAssembler(
    inputCols=[f"{c}_ohe" for c in categorical_cols] + numeric_cols,
    outputCol="features"
)

# Fixed seed so the 80/20 train/test split is repeatable across runs.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

mlflow.set_experiment("/Shared/FlightDelayPrediction")

with mlflow.start_run():

    lr = LogisticRegression(
        featuresCol="features",
        labelCol="is_delayed"
    )

    # Fit every stage on the training split only, then score the held-out split.
    pipeline = Pipeline(stages=indexers + [encoder, assembler, lr])
    model = pipeline.fit(train_df)

    predictions = model.transform(test_df)

    evaluator = BinaryClassificationEvaluator(
        labelCol="is_delayed",
        metricName="areaUnderROC"
    )

    auc = evaluator.evaluate(predictions)

    # Record what was trained and on which inputs, alongside the metric.
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("features", ",".join(categorical_cols + numeric_cols))
    mlflow.log_metric("AUC", auc)

    # Persist the whole fitted pipeline (feature stages + classifier).
    mlflow.spark.log_model(model, "flight_delay_model")

    print("AUC:", auc)
