# 🥈 Silver Layer: Cleanse and Transform Flights Data

**Purpose:** This notebook takes the raw, unprocessed data from the Bronze table (`default.bronze_flights_data`) and transforms it into a clean, conformed, and enriched Silver table (`default.silver_flights_processed`).

The goal of this Silver table is to be the "single source of truth" for analysts. It will be de-duplicated, have correct data types, and include new, engineered features to make analytics easier.

**Source Table:** `default.bronze_flights_data`
**Output Table:** `default.silver_flights_processed`

In [0]:
from pyspark.sql.functions import col, to_date, month, year, isnan, when, count, upper, trim

In [0]:
# Load the raw Bronze flights table; the rest of the notebook builds on this DataFrame.
df_bronze = spark.read.table("default.bronze_flights_data")

In [0]:
# Report how many columns the Bronze DataFrame has before any are dropped.
bronze_columns = df_bronze.columns
column_count = len(bronze_columns)
print(f"The bronze DataFrame has {column_count} columns.")

In [0]:
# Print the column names and types of the Bronze DataFrame for reference.
print("üìã Bronze Table Schema:")
df_bronze.printSchema()

In [0]:
# Bronze columns that are not carried over into the Silver table.
columns_to_drop = [
    "AIRLINE_DOT",
    "DOT_CODE",
    "FL_NUMBER",
    "ORIGIN_CITY",
    "DEST_CITY",
    "CRS_DEP_TIME",
    "DEP_TIME",
    "DEP_DELAY",
    "TAXI_OUT",
    "WHEELS_OFF",
    "WHEELS_ON",
    "TAXI_IN",
    "CRS_ARR_TIME",
    "ARR_TIME",
    "CANCELLED",
    "CANCELLATION_CODE",
    "DIVERTED",
    "CRS_ELAPSED_TIME",
    "ELAPSED_TIME",
    "AIR_TIME",
    "DISTANCE",
    "DELAY_DUE_CARRIER",
    "DELAY_DUE_WEATHER",
    "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY",
    "DELAY_DUE_LATE_AIRCRAFT",
    "bronze_ingestion_timestamp",
]

# Start the Silver DataFrame from Bronze minus the columns listed above.
df_silver = df_bronze.drop(*columns_to_drop)

# Show which columns remain after the drop.
print("üìã New Silver Table Schema (after dropping columns):")
df_silver.printSchema()

# Preview the first five rows of the trimmed DataFrame.
print("\nüîé Sample data from the new Silver Table:")
df_silver.show(n=5)


In [0]:
# Report the column count again now that the unused Bronze columns are gone.
silver_columns = df_silver.columns
column_count = len(silver_columns)
print(f"The silver DataFrame has {column_count} columns.")

In [0]:
# Convert FL_DATE with to_date(), derive the month and year from the converted
# value, then drop the original FL_DATE column.
df_silver_transformed = (
    df_silver
    .withColumn("flight_date", to_date(col("FL_DATE")))
    .withColumn("flight_month", month(col("flight_date")))
    .withColumn("flight_year", year(col("flight_date")))
    .drop("FL_DATE")
)

# Inspect the schema to confirm the new date columns are present.
print("üìã New Silver Table Schema (with date columns):")
df_silver_transformed.printSchema()

# From here on, df_silver refers to the date-enriched DataFrame.
df_silver = df_silver_transformed

In [0]:
# Audit missing values column by column.
all_columns = df_silver.columns

# float/double columns are checked for NaN as well as null.
numeric_cols = [
    name for name, dtype in df_silver.dtypes
    if dtype in ("float", "double")
]

# Every remaining column is checked for null only.
other_cols = [name for name in all_columns if name not in numeric_cols]

# when() without otherwise() yields null for non-matching rows and count()
# skips nulls, so each expression counts only the rows flagged as missing.
numeric_expressions = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in numeric_cols
]
other_expressions = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in other_cols
]

# Numeric columns come first in the report, followed by the remaining columns.
all_expressions = [*numeric_expressions, *other_expressions]

# A single select computes every count in one pass and prints one summary row.
print("Missing value counts per column:")
df_silver.select(*all_expressions).show()

In [0]:
# Replace missing ARR_DELAY values with 0.
# NOTE(review): this records a missing arrival delay as "no delay"; the
# CANCELLED/DIVERTED columns were dropped earlier, so confirm this is intended.
df_silver = df_silver.na.fill(0, subset=["ARR_DELAY"])


In [0]:
# Rename the remaining columns to descriptive snake_case names and normalise
# the three code columns (upper-cased, surrounding whitespace trimmed).
# The chain is parenthesised rather than backslash-continued, so no trailing
# "\" can accidentally splice a following line into the expression.
df_silver_final = (
    df_silver
    .withColumnRenamed("AIRLINE", "airline_name")
    .withColumnRenamed("AIRLINE_CODE", "airline_code")
    .withColumn("airline_code", trim(upper(col("airline_code"))))
    .withColumnRenamed("ORIGIN", "origin_airport_code")
    .withColumn("origin_airport_code", trim(upper(col("origin_airport_code"))))
    .withColumnRenamed("DEST", "destination_airport_code")
    .withColumn("destination_airport_code", trim(upper(col("destination_airport_code"))))
    .withColumnRenamed("ARR_DELAY", "arrival_delay")
)

# Check the final schema and a sample of the data.
print("üìã Final Silver Table Schema:")
df_silver_final.printSchema()

print("\nüîé Final Silver Table Sample Data:")
df_silver_final.show(10)

# Assign this to df_silver for the final save.
df_silver = df_silver_final

In [0]:
def path_exists(path):
    """Report whether ``path`` can be listed with ``dbutils.fs.ls``.

    Args:
        path: Path to probe.

    Returns:
        True if the listing call succeeds, False if it raises any ordinary
        exception (the failure reason is not inspected).
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        return False
    return True

def create_directory_if_not_exists(path):
    """Create ``path`` with ``dbutils.fs.mkdirs`` unless ``path_exists`` already finds it.

    Prints a one-line status message in either case.
    """
    if path_exists(path):
        print(f"‚ÑπÔ∏è  Directory already exists: {path}")
        return

    dbutils.fs.mkdirs(path)
    print(f"‚úÖ Created directory: {path}")

def table_exists(table_name):
    """Report whether ``table_name`` can be resolved with ``spark.table``.

    Args:
        table_name: Name of the table to look up.

    Returns:
        True if ``spark.table`` succeeds, False if it raises any ordinary
        exception (the failure reason is not inspected).
    """
    try:
        spark.table(table_name)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        return False
    return True

In [0]:
# Fail fast with a readable message if the transformation cells above were not run.
# An explicit check is used instead of `assert`: assertions are stripped under
# `python -O`, and `assert df_silver` would hit a plain NameError before its
# message could be shown when the variable was never defined.
if globals().get("df_silver") is None:
    raise NameError("The DataFrame 'df_silver' does not exist.")

# Target schema, table name and storage path for the Silver table.
# The table name is derived from DATABASE_NAME so the two cannot drift apart.
DATABASE_NAME = "default"
SILVER_TABLE_NAME = f"{DATABASE_NAME}.silver_flights_processed"
SILVER_PATH = "/Volumes/workspace/default/ds_capstone/silver/flights_processed"

if not DATABASE_NAME:
    raise ValueError("DATABASE_NAME is not defined.")

# Inspect the target path before writing: an existing, readable Delta table is
# left in place (the write overwrites it); anything else at the path is removed.
print(f"\nüìÅ Checking Silver path: {SILVER_PATH}")
if path_exists(SILVER_PATH):
    print(f"‚ö†Ô∏è  Path already exists. Checking if it's a valid Delta table...")
    try:
        # Try to read as Delta
        test_df = spark.read.format("delta").load(SILVER_PATH)
        print(f"‚úÖ Valid Delta table found with {test_df.count()} records")
        print(f"üí° Will overwrite existing table")
    except Exception as e:
        # Catch Exception rather than using a bare except, so an interrupted
        # cell (KeyboardInterrupt/SystemExit) never triggers the recursive
        # delete below. The error is printed so the reason is visible.
        print(f"‚ö†Ô∏è  Path exists but is not a valid Delta table")
        print(f"   Error: {str(e)}")
        print(f"üßπ Cleaning up old data...")
        dbutils.fs.rm(SILVER_PATH, recurse=True)
        print(f"‚úÖ Old data removed")
else:
    print(f"‚úÖ Path is clear, ready to create new table")

# Create parent directory if needed (the path with its last segment removed).
silver_parent = "/".join(SILVER_PATH.split("/")[:-1])
create_directory_if_not_exists(silver_parent)

# Write the Silver DataFrame as a Delta table at SILVER_PATH, retrying once
# after removing the target path if the first attempt fails.
print(f"\nüíæ Writing Silver Delta table...")
try:
    df_silver.write.format("delta").mode("overwrite").save(SILVER_PATH)
except Exception as e:
    print(f"‚ùå ERROR: Could not write Delta table")
    print(f"   Error: {str(e)}")
    print(f"\nüí° Trying to clean and retry...")
    try:
        dbutils.fs.rm(SILVER_PATH, recurse=True)
        df_silver.write.format("delta").mode("overwrite").save(SILVER_PATH)
        print(f"‚úÖ Successfully wrote Delta table after cleanup")
    except Exception as e2:
        print(f"‚ùå Still failed: {str(e2)}")
        raise
else:
    # Reporting lives in the `else` clause so that a failure while counting can
    # never trigger the delete-and-retry path after a successful write.
    print(f"‚úÖ Delta table written to: {SILVER_PATH}")
    # Count the rows that were actually written, instead of re-running the
    # whole df_silver transformation just to print a number.
    written_count = spark.read.format("delta").load(SILVER_PATH).count()
    print(f"‚úÖ Records written: {written_count:,}")

# Register the written data under SILVER_TABLE_NAME. Registration is
# best-effort: if both strategies fail, the data is still readable from SILVER_PATH.
print(f"\nüìå Registering Delta table as: {SILVER_TABLE_NAME}")
try:
    # Ensure database exists
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")
    print(f"‚úÖ Database '{DATABASE_NAME}' ready")

    # Drop table if it exists (to avoid conflicts)
    spark.sql(f"DROP TABLE IF EXISTS {SILVER_TABLE_NAME}")
    print(f"   Dropped existing table (if any)")

    # Read back the Delta files written to SILVER_PATH and save them as a table.
    df_for_table = spark.read.format("delta").load(SILVER_PATH)
    df_for_table.write.format("delta").mode("overwrite").saveAsTable(SILVER_TABLE_NAME)

    print(f"‚úÖ Table registered successfully as '{SILVER_TABLE_NAME}'!")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not create table with saveAsTable, trying alternative method...")
    # Report why the first strategy failed instead of discarding the error.
    print(f"   Error: {str(e)}")
    try:
        # Alternative: create the table with an explicit LOCATION, pointing the
        # table name at the Delta files already written to SILVER_PATH.
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {SILVER_TABLE_NAME}
            USING DELTA
            LOCATION '{SILVER_PATH}'
        """)
        print(f"‚úÖ Table registered with LOCATION clause!")
    except Exception as e2:
        # Deliberately not re-raised: tell the user how to read the files directly.
        print(f"‚ö†Ô∏è  Table registration failed: {str(e2)}")
        print(f"üí° You can still access the data directly using:")
        print(f"   spark.read.format('delta').load('{SILVER_PATH}')")