In [0]:
# Read the processed Silver-layer flights table into a DataFrame.
SILVER_TABLE = "default.silver_flights_processed"
df_silver = spark.read.table(SILVER_TABLE)

In [0]:
from pyspark.sql.functions import col, when, count, isnan

In [0]:
# Derive a categorical `arrival_status` label from `arrival_delay`.
# Branches are evaluated top to bottom and the first match wins:
#   null or NaN -> "Cancelled/Diverted"
#   >= 15       -> "Delayed"
#   < 0         -> "Early"
#   otherwise   -> "On-Time"  (0 up to, but not including, 15)
#
# NaN is tested explicitly because Spark orders NaN above every other numeric
# value; without the guard a NaN delay would satisfy `>= 15` and be labelled
# "Delayed" instead of being treated as missing.
# NOTE(review): assumes `arrival_delay` is a numeric column (for integer types
# the NaN guard is simply a no-op) — confirm against the Silver table schema.
delay_col = col("arrival_delay")
delay_is_missing = delay_col.isNull() | isnan(delay_col)

df_with_status = df_silver.withColumn(
    "arrival_status",
    when(delay_is_missing, "Cancelled/Diverted")
    .when(delay_col >= 15, "Delayed")
    .when(delay_col < 0, "Early")
    .otherwise("On-Time"),
)

# Rebind `df_silver` so the cells that follow see the new column.
df_silver = df_with_status

In [0]:
# Preview the first 20 rows of the raw delay next to its derived label.
print("Sample of 'arrival_delay' and 'arrival_status':")
status_preview = df_silver.select(col("arrival_delay"), col("arrival_status"))
status_preview.show(n=20)

# Number of rows falling into each arrival_status category.
print("\nCounts for each category:")
status_counts = df_silver.groupBy(col("arrival_status")).agg(count("*").alias("count"))
status_counts.show()

In [0]:
# Missing-value audit: build one aggregate per column and evaluate them together.

# Every column name, in DataFrame order.
all_columns = df_silver.columns

# Float/double columns — the only ones that receive the extra NaN test.
numeric_cols = [
    name for name, dtype in df_silver.dtypes
    if dtype == 'float' or dtype == 'double'
]

# Everything that is not float/double, still in DataFrame order.
other_cols = [name for name in all_columns if name not in numeric_cols]


def _missing_count(name, include_nan):
    """Build an aggregate that counts the rows where column `name` is missing.

    Args:
        name: Column name to inspect; also used as the alias of the result.
        include_nan: When True, NaN values count as missing in addition to null.

    Returns:
        A Column expression yielding the number of missing rows for `name`.
    """
    is_missing = col(name).isNull()
    if include_nan:
        is_missing = is_missing | isnan(name)
    # `when` without `otherwise` yields null for non-matching rows and `count`
    # skips nulls, so only the rows flagged as missing are counted.
    return count(when(is_missing, name)).alias(name)


# Float/double columns: null OR NaN counts as missing.
numeric_expressions = [_missing_count(name, True) for name in numeric_cols]

# All other columns: null only.
other_expressions = [_missing_count(name, False) for name in other_cols]

# Float/double columns come first in the result, followed by the rest.
all_expressions = [*numeric_expressions, *other_expressions]

# Compute every count in a single pass and print the one-row result.
print("Missing value counts per column (before ML pipeline):")
df_silver.select(*all_expressions).show()