### 🧩 Prerequisites

1. In Databricks, open the **Catalog** panel on the left sidebar.  
2. Navigate to **Workspace → default**.  
3. Click **Create** (top-right corner) and select **Volume**.  
4. Name the volume **`ds_capstone`** and click **Create**.  
5. Once created, open the new volume and click **Upload to this volume**.  
6. Upload the Kaggle dataset file: **`flights_sample_3m.csv`**.  

After completing these steps, you’ll be ready to run the code below without any errors.  
If you encounter any issues, please reach out to me for assistance - Omar.


### BRONZE TABLE CREATION SCRIPT
#### Creates a Bronze Delta table from your raw data file
#### Automatically creates directories and tables if they don't exist

In [0]:
from pyspark.sql.functions import current_timestamp

# ===== CONFIGURATION =====
# Volume root where the source dataset is uploaded.
# NOTE: an earlier revision used the misspelled path
# "/Volumes/workspace/default/ds-caspstone"; this is the corrected value.
BASE_PATH = "/Volumes/workspace/default/ds_capstone"
PROJECT_NAME = "flights_analysis"
SOURCE_FILE_NAME = "flights_sample_3m.csv"

# Table configuration
DATABASE_NAME = "default"  # Swap in another database name if desired
TABLE_NAME = "bronze_flights_data"
# Fully qualified "<database>.<table>" identifier
BRONZE_TABLE_NAME = ".".join((DATABASE_NAME, TABLE_NAME))

# Derived paths
SOURCE_FILE = "/".join((BASE_PATH, SOURCE_FILE_NAME))
BRONZE_PATH = BASE_PATH + "/bronze/flights_data"

#### HELPER FUNCTIONS

In [0]:
def path_exists(path):
    """Return True if `path` can be listed with ``dbutils.fs.ls``, else False.

    This is a best-effort check: any ordinary error raised by the listing
    call (not only "not found") is reported as False.
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit (e.g. cancelling the cell) still propagate.
        return False
    return True

def create_directory_if_not_exists(path):
    """Create `path` with ``dbutils.fs.mkdirs`` unless it already exists.

    Prints one status line describing which of the two cases applied.
    """
    if path_exists(path):
        print(f"ℹ️  Directory already exists: {path}")
        return
    dbutils.fs.mkdirs(path)
    print(f"✅ Created directory: {path}")

def table_exists(table_name):
    """Return True if ``spark.table(table_name)`` resolves, else False.

    This is a best-effort check: any ordinary error raised while resolving
    the table is reported as False.
    """
    try:
        spark.table(table_name)
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit (e.g. cancelling the cell) still propagate.
        return False
    return True

#### BRONZE LAYER CREATION

In [0]:
print("=" * 70)
print("🥉 BRONZE TABLE CREATION")
print("=" * 70)

# Step 1: List available files to help you verify the filename.
# The listing is informational only, so a failure is reported and the
# script carries on.
print(f"\n📂 Available files in {BASE_PATH}:")
try:
    files = dbutils.fs.ls(BASE_PATH)
    for f in files:
        icon = "📁" if f.isDir() else "📄"
        print(f"  {icon} {f.name}")
except Exception as e:
    print(f"⚠️  Could not list files: {str(e)}")

# Step 2: Check if source file exists.
# Abort early with troubleshooting hints when the file is missing.
print(f"\n🔍 Checking for source file: {SOURCE_FILE}")
if not path_exists(SOURCE_FILE):
    print("❌ ERROR: Source file not found!")
    print(f"   Expected: {SOURCE_FILE}")
    print("\n💡 Tips:")
    print("   1. Check the filename spelling above")
    print("   2. Update SOURCE_FILE_NAME in the configuration section")
    print(f"   3. Make sure your file is uploaded to {BASE_PATH}")
    raise FileNotFoundError(f"Source file not found: {SOURCE_FILE}")
else:
    print("✅ Source file found!")

# Step 3: Read the source file (reader chosen from the file extension)
print("\n📖 Reading source file...")
try:
    # Match the extension case-insensitively so that e.g. "DATA.CSV" is
    # read as CSV instead of falling through to the Delta reader.
    source_file_lower = SOURCE_FILE.lower()
    if source_file_lower.endswith('.csv'):
        df_raw = spark.read.csv(SOURCE_FILE, header=True, inferSchema=True)
        print("   Format: CSV")
    elif source_file_lower.endswith('.json'):
        df_raw = spark.read.json(SOURCE_FILE)
        print("   Format: JSON")
    elif source_file_lower.endswith('.parquet'):
        df_raw = spark.read.parquet(SOURCE_FILE)
        print("   Format: Parquet")
    elif source_file_lower.endswith('.txt'):
        df_raw = spark.read.text(SOURCE_FILE)
        print("   Format: Text")
    else:
        # No recognised extension: try to read the path as a Delta table
        df_raw = spark.read.format("delta").load(SOURCE_FILE)
        print("   Format: Delta")

    print(f"✅ Successfully read file: {df_raw.count():,} records")

except Exception as e:
    # Report the failure, then re-raise so later steps do not run
    # without df_raw.
    print("❌ ERROR: Could not read file")
    print(f"   Error: {str(e)}")
    raise

# Step 4: Show data preview
print("\n📊 Data Preview (first 5 rows):")
display(df_raw.limit(5))

print("\n📋 Schema:")
df_raw.printSchema()

# Step 5: Add Bronze metadata (optional - tracks when data was ingested)
print("\n🏷️  Adding Bronze metadata...")
df_bronze = df_raw.withColumn("bronze_ingestion_timestamp", current_timestamp())
print("✅ Added ingestion timestamp")

# Step 6: Check if Bronze path exists and clean if needed
print(f"\n📁 Checking Bronze path: {BRONZE_PATH}")
if path_exists(BRONZE_PATH):
    print("⚠️  Path already exists. Checking if it's a valid Delta table...")
    try:
        # Try to read as Delta
        test_df = spark.read.format("delta").load(BRONZE_PATH)
        print(f"✅ Valid Delta table found with {test_df.count()} records")
        print("💡 Will overwrite existing table")
    except Exception:
        # Only ordinary errors mean "not a Delta table". A bare ``except:``
        # here would also catch KeyboardInterrupt, so cancelling the cell
        # during the count would recursively delete the existing data.
        print("⚠️  Path exists but is not a valid Delta table")
        print("🧹 Cleaning up old data...")
        dbutils.fs.rm(BRONZE_PATH, recurse=True)
        print("✅ Old data removed")
else:
    print("✅ Path is clear, ready to create new table")

# Create parent directory if needed (BRONZE_PATH minus its last component)
bronze_parent = "/".join(BRONZE_PATH.split("/")[:-1])
create_directory_if_not_exists(bronze_parent)

# Step 7: Write Bronze Delta table
print("\n💾 Writing Bronze Delta table...")
try:
    df_bronze.write.format("delta").mode("overwrite").save(BRONZE_PATH)
except Exception as e:
    print("❌ ERROR: Could not write Delta table")
    print(f"   Error: {str(e)}")
    print("\n💡 Trying to clean and retry...")
    try:
        # Remove whatever is at the target path, then attempt the write again
        dbutils.fs.rm(BRONZE_PATH, recurse=True)
        df_bronze.write.format("delta").mode("overwrite").save(BRONZE_PATH)
        print("✅ Successfully wrote Delta table after cleanup")
    except Exception as e2:
        print(f"❌ Still failed: {str(e2)}")
        raise
else:
    # Report success outside the ``try`` so that a failure while counting
    # cannot trigger the delete-and-retry path after a successful write.
    print(f"✅ Delta table written to: {BRONZE_PATH}")
    # Count the written table rather than df_bronze, which would re-read
    # the source file just to produce this number.
    written_count = spark.read.format("delta").load(BRONZE_PATH).count()
    print(f"✅ Records written: {written_count:,}")

# Step 8: Create table reference (register the Delta table)
print(f"\n📌 Registering Delta table as: {BRONZE_TABLE_NAME}")
try:
    # Ensure database exists
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")
    print(f"✅ Database '{DATABASE_NAME}' ready")

    # Drop table if it exists (to avoid conflicts)
    spark.sql(f"DROP TABLE IF EXISTS {BRONZE_TABLE_NAME}")
    print("   Dropped existing table (if any)")

    # Re-read the data written at BRONZE_PATH and save it under the table name.
    # NOTE(review): saveAsTable is called without a path, so this presumably
    # creates a separate managed copy rather than an external table pointing
    # at BRONZE_PATH - confirm that the duplicate is intended.
    # Using a simple CREATE TABLE without location might work better in Community Edition
    df_for_table = spark.read.format("delta").load(BRONZE_PATH)
    df_for_table.write.format("delta").mode("overwrite").saveAsTable(BRONZE_TABLE_NAME)

    print(f"✅ Table registered successfully as '{BRONZE_TABLE_NAME}'!")
except Exception as e:
    print("⚠️  Could not create table with saveAsTable, trying alternative method...")
    # Report the first failure instead of silently discarding it
    print(f"   Error: {str(e)}")
    try:
        # Alternative: Create table with explicit LOCATION
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {BRONZE_TABLE_NAME}
            USING DELTA
            LOCATION '{BRONZE_PATH}'
        """)
        print("✅ Table registered with LOCATION clause!")
    except Exception as e2:
        # Registration is best-effort: the data stays readable by path
        print(f"⚠️  Table registration failed: {str(e2)}")
        print("💡 You can still access the data directly using:")
        print(f"   spark.read.format('delta').load('{BRONZE_PATH}')")

#### VERIFICATION

In [0]:
print("\n" + "=" * 70)
print("✅ BRONZE DELTA TABLE CREATED!")
print("=" * 70)

# Verify by reading the Delta table directly
print("\n🔍 Verification:")
try:
    verify_df = spark.read.format("delta").load(BRONZE_PATH)
    record_count = verify_df.count()
    print(f"✅ Delta table exists at: {BRONZE_PATH}")
    print(f"✅ Record count: {record_count:,}")

    # Show columns (only the first five names, with "..." when there are more)
    columns = verify_df.columns
    print(f"✅ Columns ({len(columns)}): {', '.join(columns[:5])}{'...' if len(columns) > 5 else ''}")

    # Check if table name is registered
    if table_exists(BRONZE_TABLE_NAME):
        print(f"✅ Table '{BRONZE_TABLE_NAME}' is registered and queryable")
    else:
        print(f"⚠️  Table name '{BRONZE_TABLE_NAME}' not registered, but data is accessible via path")

except Exception as e:
    # Verification is reported but does not raise
    print(f"❌ Verification failed: {str(e)}")

### Options to call data from tables

In [0]:
## Uncomment to view data
# df = spark.read.format('delta').load('/Volumes/workspace/default/ds_capstone/bronze/flights_data')
# display(df)

In [0]:
## Uncomment to view data
# display(
#     spark.table(
#         BRONZE_TABLE_NAME
#     )
# )

In [0]:
### Uncomment to view data
# display(
#     spark.sql(
#         f"SELECT * FROM {BRONZE_TABLE_NAME} LIMIT 10"
#     )
# )