### 🧩 Prerequisites

1. In Databricks, open the **Catalog** panel on the left sidebar.  
2. Navigate to **Workspace → default**.  
3. Click **Create** (top-right corner) and select **Volume**.  
4. Name the volume **`ds-capstone`** and click **Create**.  
5. Once created, open the new volume and click **Upload to this volume**.  
6. Upload the Kaggle dataset file: **`flights_sample_3m.csv`**.  

After completing these steps, you’ll be ready to run the code below without any errors.  
If you encounter any issues, please reach out to me for assistance - Omar.


### BRONZE TABLE CREATION SCRIPT
#### Creates a Bronze Delta table from your raw data file
#### Automatically creates directories and tables if they don't exist

In [0]:
from pyspark.sql.functions import current_timestamp

# ===== CONFIGURATION =====
# Root of the Databricks volume created in the prerequisites.
# The volume name is "ds-capstone" (with a hyphen, not an underscore); an earlier
# revision of this notebook misspelled it as "ds-caspstone".
BASE_PATH = "/Volumes/workspace/default/ds-capstone"
PROJECT_NAME = "flights_analysis"
SOURCE_FILE_NAME = "flights_sample_3m.csv"

# Table configuration
DATABASE_NAME = "default"  # Change this if you want to use a different database
TABLE_NAME = "bronze_flights_data"
# Qualified "<database>.<table>" name used when registering and querying the table
BRONZE_TABLE_NAME = ".".join((DATABASE_NAME, TABLE_NAME))

# Define paths
# Raw input file inside the volume
SOURCE_FILE = "/".join((BASE_PATH, SOURCE_FILE_NAME))
# Directory inside the volume where the Bronze Delta files are written
BRONZE_PATH = "/".join((BASE_PATH, "bronze", "flights_data"))

#### HELPER FUNCTIONS

In [0]:
def path_exists(path):
    """Return True if ``dbutils.fs.ls(path)`` succeeds, False if it raises.

    Args:
        path: File or directory path to probe.

    Returns:
        bool: True when the listing call completes, False when it raises
        any ``Exception``.
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit (e.g. a cancelled cell) still
        # propagate instead of being reported as "path does not exist".
        return False
    return True


def create_directory_if_not_exists(path):
    """Make sure *path* exists, creating it with ``dbutils.fs.mkdirs`` when absent.

    Prints one status line saying whether the directory was created or was
    already there. Returns None.
    """
    # Nothing to do when the path is already present.
    if path_exists(path):
        print(f"ℹ️  Directory already exists: {path}")
        return

    dbutils.fs.mkdirs(path)
    print(f"✅ Created directory: {path}")


def table_exists(table_name):
    """Return True if ``spark.table(table_name)`` succeeds, False if it raises.

    Args:
        table_name: Table name to look up, e.g. ``"default.bronze_flights_data"``.

    Returns:
        bool: True when the lookup call completes, False when it raises
        any ``Exception``.
    """
    # NOTE(review): this relies on spark.table() raising for a missing table;
    # confirm that holds on your runtime (lazy resolution would report True).
    try:
        spark.table(table_name)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit (e.g. a cancelled cell) still
        # propagate instead of being reported as "table does not exist".
        return False
    return True

#### BRONZE LAYER CREATION

In [0]:
# Banner: a 70-character rule above and below the title
print("=" * 70, "🥉 BRONZE TABLE CREATION", "=" * 70, sep="\n")

# Step 1: Show what is in the base path so the configured filename can be checked by eye
print(f"\n📂 Available files in {BASE_PATH}:")
try:
    files = dbutils.fs.ls(BASE_PATH)
    for entry in files:
        # Folder icon for directories, page icon for plain files
        print(f"  {'📁' if entry.isDir() else '📄'} {entry.name}")
except Exception as e:
    # Listing is only a convenience; report the problem and carry on
    print(f"⚠️  Could not list files: {e}")

# Step 2: Stop early, with troubleshooting hints, when the source file is missing
print(f"\n🔍 Checking for source file: {SOURCE_FILE}")
if path_exists(SOURCE_FILE):
    print("✅ Source file found!")
else:
    print("❌ ERROR: Source file not found!")
    print(f"   Expected: {SOURCE_FILE}")
    print("\n💡 Tips:")
    print("   1. Check the filename spelling above")
    print("   2. Update SOURCE_FILE_NAME in the configuration section")
    print(f"   3. Make sure your file is uploaded to {BASE_PATH}")
    raise FileNotFoundError(f"Source file not found: {SOURCE_FILE}")

# Step 3: Read the source file, choosing the reader from the file extension
print("\n📖 Reading source file...")
try:
    # Compare the extension case-insensitively so that e.g. "DATA.CSV" is read
    # as CSV instead of falling through to the Delta reader below.
    _source_lower = SOURCE_FILE.lower()

    if _source_lower.endswith(".csv"):
        # First row is the header; column types are inferred from the data
        df_raw = spark.read.csv(SOURCE_FILE, header=True, inferSchema=True)
        print("   Format: CSV")
    elif _source_lower.endswith(".json"):
        df_raw = spark.read.json(SOURCE_FILE)
        print("   Format: JSON")
    elif _source_lower.endswith(".parquet"):
        df_raw = spark.read.parquet(SOURCE_FILE)
        print("   Format: Parquet")
    elif _source_lower.endswith(".txt"):
        df_raw = spark.read.text(SOURCE_FILE)
        print("   Format: Text")
    else:
        # No recognised extension: try to read the path as a Delta table
        df_raw = spark.read.format("delta").load(SOURCE_FILE)
        print("   Format: Delta")

    print(f"✅ Successfully read file: {df_raw.count():,} records")

except Exception as e:
    # Report the failure in the notebook output, then re-raise so the cell stops
    print("❌ ERROR: Could not read file")
    print(f"   Error: {str(e)}")
    raise

# Step 4: Preview the first rows and print the schema of the raw data
print("\n📊 Data Preview (first 5 rows):")
display(df_raw.limit(5))

print("\n📋 Schema:")
df_raw.printSchema()

# Step 5: Add Bronze metadata (optional - tracks when data was ingested)
print("\n🏷️  Adding Bronze metadata...")
ingestion_ts = current_timestamp()
df_bronze = df_raw.withColumn("bronze_ingestion_timestamp", ingestion_ts)
print("✅ Added ingestion timestamp")

# Step 6: Check if Bronze path exists and clean if needed
print(f"\n📁 Checking Bronze path: {BRONZE_PATH}")
if path_exists(BRONZE_PATH):
    print("⚠️  Path already exists. Checking if it's a valid Delta table...")
    try:
        # Try to read as Delta
        test_df = spark.read.format("delta").load(BRONZE_PATH)
        print(f"✅ Valid Delta table found with {test_df.count()} records")
        print("💡 Will overwrite existing table")
    except Exception as e:
        # Catch Exception rather than using a bare ``except``: this branch
        # recursively deletes BRONZE_PATH, so a KeyboardInterrupt / cancelled
        # cell must not be able to trigger it. The error is printed so the
        # reason for the cleanup is visible.
        print("⚠️  Path exists but is not a valid Delta table")
        print(f"   Error: {e}")
        print("🧹 Cleaning up old data...")
        dbutils.fs.rm(BRONZE_PATH, recurse=True)
        print("✅ Old data removed")
else:
    print("✅ Path is clear, ready to create new table")

# Create parent directory if needed (BRONZE_PATH with its last segment removed)
bronze_parent = "/".join(BRONZE_PATH.split("/")[:-1])
create_directory_if_not_exists(bronze_parent)

# Step 7: Write Bronze Delta table
print("\n💾 Writing Bronze Delta table...")
try:
    # Only the write itself is guarded: a failure in the reporting below must
    # not trigger the delete-and-retry path on a table that was written fine.
    df_bronze.write.format("delta").mode("overwrite").save(BRONZE_PATH)
except Exception as e:
    print("❌ ERROR: Could not write Delta table")
    print(f"   Error: {str(e)}")
    print("\n💡 Trying to clean and retry...")
    try:
        # Best-effort recovery: remove whatever is at the target path and write again
        dbutils.fs.rm(BRONZE_PATH, recurse=True)
        df_bronze.write.format("delta").mode("overwrite").save(BRONZE_PATH)
        print("✅ Successfully wrote Delta table after cleanup")
    except Exception as e2:
        print(f"❌ Still failed: {str(e2)}")
        raise
else:
    # Runs only when the first write attempt succeeded
    print(f"✅ Delta table written to: {BRONZE_PATH}")
    print(f"✅ Records written: {df_bronze.count():,}")

# Step 8: Create table reference (register the Delta table)
print(f"\n📌 Registering Delta table as: {BRONZE_TABLE_NAME}")
try:
    # Ensure database exists
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")
    print(f"✅ Database '{DATABASE_NAME}' ready")

    # Drop table if it exists (to avoid conflicts)
    spark.sql(f"DROP TABLE IF EXISTS {BRONZE_TABLE_NAME}")
    print("   Dropped existing table (if any)")

    # Primary method: read the Delta files back from BRONZE_PATH and save them
    # under the table name. saveAsTable is called without a path, so this writes
    # a table copy rather than pointing at BRONZE_PATH; the LOCATION-based
    # fallback below registers the existing files in place instead.
    # Using a simple CREATE TABLE without location might work better in Community Edition
    df_for_table = spark.read.format("delta").load(BRONZE_PATH)
    df_for_table.write.format("delta").mode("overwrite").saveAsTable(BRONZE_TABLE_NAME)

    print(f"✅ Table registered successfully as '{BRONZE_TABLE_NAME}'!")
except Exception as e:
    print("⚠️  Could not create table with saveAsTable, trying alternative method...")
    # Show why the primary method failed instead of silently discarding the error
    print(f"   Error: {e}")
    try:
        # Alternative: Create table with explicit LOCATION
        spark.sql(
            f"""
            CREATE TABLE IF NOT EXISTS {BRONZE_TABLE_NAME}
            USING DELTA
            LOCATION '{BRONZE_PATH}'
        """
        )
        print("✅ Table registered with LOCATION clause!")
    except Exception as e2:
        # Registration is optional: the Delta files remain readable by path
        print(f"⚠️  Table registration failed: {str(e2)}")
        print("💡 You can still access the data directly using:")
        print(f"   spark.read.format('delta').load('{BRONZE_PATH}')")

🥉 BRONZE TABLE CREATION

📂 Available files in /Volumes/workspace/default/ds-capstone:
  📄 flights_sample_3m.csv

🔍 Checking for source file: /Volumes/workspace/default/ds-capstone/flights_sample_3m.csv
✅ Source file found!

📖 Reading source file...
   Format: CSV
✅ Successfully read file: 3,000,000 records

📊 Data Preview (first 5 rows):


FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",1155,1151.0,-4.0,19.0,1210.0,1443.0,4.0,1501,1447.0,-14.0,0.0,,0.0,186.0,176.0,153.0,1065.0,,,,,
2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",2120,2114.0,-6.0,9.0,2123.0,2232.0,38.0,2315,2310.0,-5.0,0.0,,0.0,235.0,236.0,189.0,1399.0,,,,,
2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",954,1000.0,6.0,20.0,1020.0,1247.0,5.0,1252,1252.0,0.0,0.0,,0.0,118.0,112.0,87.0,680.0,,,,,
2023-03-06,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,2295,MSP,"Minneapolis, MN",SFO,"San Francisco, CA",1609,1608.0,-1.0,27.0,1635.0,1844.0,9.0,1829,1853.0,24.0,0.0,,0.0,260.0,285.0,249.0,1589.0,0.0,0.0,24.0,0.0,0.0
2020-02-23,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,407,MCO,"Orlando, FL",DFW,"Dallas/Fort Worth, TX",1840,1838.0,-2.0,15.0,1853.0,2026.0,14.0,2041,2040.0,-1.0,0.0,,0.0,181.0,182.0,153.0,985.0,,,,,



📋 Schema:
root
 |-- FL_DATE: date (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- AIRLINE_DOT: string (nullable = true)
 |-- AIRLINE_CODE: string (nullable = true)
 |-- DOT_CODE: integer (nullable = true)
 |-- FL_NUMBER: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: double (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- TAXI_OUT: double (nullable = true)
 |-- WHEELS_OFF: double (nullable = true)
 |-- WHEELS_ON: double (nullable = true)
 |-- TAXI_IN: double (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: double (nullable = true)
 |-- ARR_DELAY: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nulla

#### VERIFICATION

In [0]:
# Closing banner: blank line, rule, title, rule
print("\n" + "=" * 70, "✅ BRONZE DELTA TABLE CREATED!", "=" * 70, sep="\n")

# Verify by reading the Delta table back from its path
print("\n🔍 Verification:")
try:
    verify_df = spark.read.format("delta").load(BRONZE_PATH)
    record_count = verify_df.count()
    print(f"✅ Delta table exists at: {BRONZE_PATH}")
    print(f"✅ Record count: {record_count:,}")

    # Show the column count and the first five names, with "..." when there are more
    columns = verify_df.columns
    shown_names = ", ".join(columns[:5])
    more_marker = "..." if len(columns) > 5 else ""
    print(f"✅ Columns ({len(columns)}): {shown_names}{more_marker}")

    # Report whether the table name is registered
    if not table_exists(BRONZE_TABLE_NAME):
        print(f"⚠️  Table name '{BRONZE_TABLE_NAME}' not registered, but data is accessible via path")
    else:
        print(f"✅ Table '{BRONZE_TABLE_NAME}' is registered and queryable")

except Exception as e:
    # Verification only reports; a failure is printed rather than raised
    print(f"❌ Verification failed: {e}")


✅ BRONZE DELTA TABLE CREATED!

🔍 Verification:
✅ Delta table exists at: /Volumes/workspace/default/ds-capstone/bronze/flights_data
✅ Record count: 3,000,000
✅ Columns (33): FL_DATE, AIRLINE, AIRLINE_DOT, AIRLINE_CODE, DOT_CODE...
✅ Table 'default.bronze_flights_data' is registered and queryable


### Options to call data from tables

In [0]:
## Uncomment to view data
# df = spark.read.format('delta').load('/Volumes/workspace/default/ds-capstone/bronze/flights_data')
# display(df)

In [0]:
# # Uncomment to view data
# display(
#     spark.table(
#         BRONZE_TABLE_NAME
#     )
# )

FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT,bronze_ingestion_timestamp
2019-05-03,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,1861,STL,"St. Louis, MO",TUL,"Tulsa, OK",2200,2359.0,119.0,10.0,9.0,105.0,4.0,2315,109.0,114.0,0.0,,0.0,75.0,70.0,56.0,351.0,114.0,0.0,0.0,0.0,0.0,2025-11-28T19:15:28.621Z
2022-04-04,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,805,MSP,"Minneapolis, MN",BWI,"Baltimore, MD",1815,1850.0,35.0,10.0,1900.0,2152.0,17.0,2135,2209.0,34.0,0.0,,0.0,140.0,139.0,112.0,936.0,0.0,0.0,0.0,0.0,34.0,2025-11-28T19:15:28.621Z
2022-04-25,PSA Airlines Inc.,PSA Airlines Inc.: OH,OH,20397,5260,FAY,"Fayetteville, NC",CLT,"Charlotte, NC",1303,1255.0,-8.0,16.0,1311.0,1350.0,6.0,1410,1356.0,-14.0,0.0,,0.0,67.0,61.0,39.0,118.0,,,,,,2025-11-28T19:15:28.621Z
2020-09-26,Envoy Air,Envoy Air: MQ,MQ,20398,3755,ORD,"Chicago, IL",FSD,"Sioux Falls, SD",1430,1422.0,-8.0,23.0,1445.0,1557.0,5.0,1612,1602.0,-10.0,0.0,,0.0,102.0,100.0,72.0,463.0,,,,,,2025-11-28T19:15:28.621Z
2022-07-13,Republic Airline,Republic Airline: YX,YX,20452,5857,BNA,"Nashville, TN",BOS,"Boston, MA",620,616.0,-4.0,14.0,630.0,935.0,9.0,958,944.0,-14.0,0.0,,0.0,158.0,148.0,125.0,942.0,,,,,,2025-11-28T19:15:28.621Z
2021-05-15,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,2580,LAS,"Las Vegas, NV",COS,"Colorado Springs, CO",1910,1916.0,6.0,11.0,1927.0,2145.0,4.0,2200,2149.0,-11.0,0.0,,0.0,110.0,93.0,78.0,604.0,,,,,,2025-11-28T19:15:28.621Z
2023-06-01,American Airlines Inc.,American Airlines Inc.: AA,AA,19805,2275,STL,"St. Louis, MO",PHL,"Philadelphia, PA",1435,1437.0,2.0,9.0,1446.0,1738.0,10.0,1744,1748.0,4.0,0.0,,0.0,129.0,131.0,112.0,814.0,,,,,,2025-11-28T19:15:28.621Z
2021-01-20,Horizon Air,Horizon Air: QX,QX,19687,2426,GEG,"Spokane, WA",BOI,"Boise, ID",1825,1813.0,-12.0,7.0,1820.0,2009.0,4.0,2035,2013.0,-22.0,0.0,,0.0,70.0,60.0,49.0,287.0,,,,,,2025-11-28T19:15:28.621Z
2020-04-03,Mesa Airlines Inc.,Mesa Airlines Inc.: YV,YV,20378,5837,PHX,"Phoenix, AZ",MRY,"Monterey, CA",2010,2000.0,-10.0,14.0,2014.0,2141.0,2.0,2204,2143.0,-21.0,0.0,,0.0,114.0,103.0,87.0,598.0,,,,,,2025-11-28T19:15:28.621Z
2019-01-22,American Airlines Inc.,American Airlines Inc.: AA,AA,19805,2817,DFW,"Dallas/Fort Worth, TX",ORD,"Chicago, IL",1215,,,,,,,1433,,,1.0,B,0.0,138.0,,,802.0,,,,,,2025-11-28T19:15:28.621Z


In [0]:
### Uncomment to view data
# display(
#     spark.sql(
#         f"SELECT * FROM {BRONZE_TABLE_NAME} LIMIT 10"
#     )
# )

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:440)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:470)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:768)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:510)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:616)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:643)
	at com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:80)
	at com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:348)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:59)
	at com.databricks.logging.AttributionContext$.withValue(Attr