# Gold Layer - Star Schema Creation

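This notebook builds the gold-layer star schema: it reads Delta tables from the silver layer and writes dimension tables, fact tables, and optional analytical views to the gold layer.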

## Features:
- **Star Schema**: Dimensions and facts with proper joins
- **SCD2 Support**: Dimensions are built only from the current SCD2 records in silver (`is_current = true`)
- **Config-driven**: Uses `config/ingestion_config.json`
- **Control tables**: Audit log, watermark, error records
- **Multiple fact types**: Transactional, accumulating, snapshot

## Fact Types:
- **Transactional Facts**: Order transactions
- **Accumulating Facts**: Customer order summaries
- **Snapshot Facts**: Inventory snapshots

In [None]:
# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os, json, uuid, datetime

# Build (or reuse) the Spark session with the Delta Lake SQL extension and
# Delta catalog configured, since every table in this notebook is read and
# written in Delta format.
spark = (
    SparkSession.builder
    .appName("Gold Layer - Star Schema")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

def now():
    """Return the current local time as an ISO 8601 string (naive, no UTC offset)."""
    current = datetime.datetime.now()
    return current.isoformat()

In [None]:
# Load config
# Parse the shared ingestion config; every step below is driven by it.
with open('../config/ingestion_config.json', 'r') as f:
    config = json.loads(f.read())

# Sections used by this notebook: gold-layer definitions and the paths of the
# control files (audit log, error records, watermark).
gold_cfg = config['gold_layer']
control_cfg = config['control_tables']

# Silver is the read side, gold is the write side.
SILVER_DATA_PATH = gold_cfg['silver_data_path']
GOLD_DATA_PATH = gold_cfg['gold_data_path']

# Make sure the gold output root exists before any table is written.
os.makedirs(GOLD_DATA_PATH, exist_ok=True)

def log_audit(table, layer, status, row_count, error=None):
    """Append one run record to the audit-log control file.

    A fresh UUID is generated as the run id on every call. The record is
    written as a single CSV row with the columns
    ``run_id, table, layer, timestamp, <empty>, status, row_count, error``.

    Logging is best-effort: any failure is reported with a printed warning
    and never propagates to the caller.

    Args:
        table: Name of the table the record is about.
        layer: Pipeline layer label (this notebook passes ``'gold'``).
        status: Outcome label (this notebook passes ``'SUCCESS'`` or ``'FAIL'``).
        row_count: Number of rows processed.
        error: Optional error message; ``None`` or an empty string is written
            as an empty field.

    Returns:
        The generated run id, or ``None`` if the record could not be written.
    """
    import csv  # local import so this function does not depend on the import cell

    run_id = str(uuid.uuid4())
    try:
        # newline='' lets the csv module control line endings itself.
        with open(control_cfg['audit_log'], 'a', newline='') as f:
            # csv.writer quotes fields that contain commas, quotes or line
            # breaks (exception text usually does), so every record stays one
            # parseable row instead of spilling into extra columns or lines.
            writer = csv.writer(f, lineterminator='\n')
            # The fifth column is written empty to keep the existing layout
            # (presumably reserved for an end time - TODO confirm).
            fields = [run_id, table, layer, now(), '', status, row_count, error or '']
            # str() keeps plain values rendered exactly as the previous
            # f-string did (csv would otherwise write None as an empty field).
            writer.writerow([str(value) for value in fields])
        return run_id
    except Exception as e:
        print(f"Warning: Could not log audit: {e}")
        return None

def log_error(run_id, table, layer, error_type, error_message, record):
    """Append one failed record to the error-records control file.

    The record is written as a single CSV row with the columns
    ``run_id, table, layer, error_type, error_message, record_json, timestamp``,
    where ``record_json`` is ``record`` serialized with ``json.dumps``.

    Logging is best-effort: any failure (including a record that is not JSON
    serializable) is reported with a printed warning and never propagates.

    Args:
        run_id: Identifier of the run the error belongs to.
        table: Name of the table being processed.
        layer: Pipeline layer label.
        error_type: Short classification of the error.
        error_message: Human-readable error description.
        record: The offending record; must be JSON serializable.
    """
    import csv  # local import so this function does not depend on the import cell

    try:
        # newline='' lets the csv module control line endings itself.
        with open(control_cfg['error_records'], 'a', newline='') as f:
            # The JSON payload of a dict or list contains commas and double
            # quotes, and error messages often do too; csv.writer quotes and
            # escapes them so the row keeps exactly seven columns.
            writer = csv.writer(f, lineterminator='\n')
            fields = [run_id, table, layer, error_type, error_message,
                      json.dumps(record), now()]
            # str() keeps plain values rendered exactly as the previous
            # f-string did (csv would otherwise write None as an empty field).
            writer.writerow([str(value) for value in fields])
    except Exception as e:
        print(f"Warning: Could not log error: {e}")

def update_watermark(table, layer, key, date):
    """Append a ``table,layer,key,date`` line to the watermark control file.

    Best-effort: any failure is reported with a printed warning and is not
    raised to the caller.
    """
    try:
        watermark_path = control_cfg['watermark']
        with open(watermark_path, 'a') as sink:
            entry = f'{table},{layer},{key},{date}\n'
            sink.write(entry)
    except Exception as exc:
        print(f"Warning: Could not update watermark: {exc}")

In [None]:
# Create dimension tables (only current records)
print("🔄 Creating dimension tables...")

# Dimensions that failed, so the closing message reflects what really happened.
failed_dims = []

for dim_name, dim_cfg in gold_cfg['dimensions'].items():
    try:
        print(f"📊 Processing dimension: {dim_name}")

        # Read from silver (SCD2)
        df_silver = spark.read.format('delta').load(f'{SILVER_DATA_PATH}/{dim_name}')

        # Filter only current records
        df_current = df_silver.filter(col('is_current') == True)

        # Select required columns
        columns = dim_cfg['columns']
        df_dim = df_current.select(columns)

        # Write to gold (full overwrite on every run)
        gold_dim_path = f'{GOLD_DATA_PATH}/{dim_name}'
        df_dim.write.mode('overwrite').format('delta').save(gold_dim_path)

        # Count once, from the table just written: calling df_dim.count()
        # would re-run the silver read and filter each time, and the old code
        # did that twice.
        row_count = spark.read.format('delta').load(gold_dim_path).count()
        print(f"✅ {dim_name}: {row_count} rows")
        log_audit(dim_name, 'gold', 'SUCCESS', row_count)

    except Exception as e:
        # Keep going with the remaining dimensions; the failure is audited
        # and reported in the summary below.
        error_msg = str(e)
        print(f"❌ Error processing {dim_name}: {error_msg}")
        log_audit(dim_name, 'gold', 'FAIL', 0, error_msg)
        failed_dims.append(dim_name)

# Only claim success when every dimension was actually written.
if failed_dims:
    print(f"⚠️ Dimensions finished with {len(failed_dims)} failure(s): {', '.join(failed_dims)}")
else:
    print("✅ Dimensions created successfully")

In [None]:
# Create fact tables
print("\n🔄 Creating fact tables...")

# Facts that failed, so the closing message reflects what really happened.
failed_facts = []

for fact_name, fact_cfg in gold_cfg['facts'].items():
    try:
        print(f"📊 Processing fact: {fact_name}")

        # Read source table from silver
        source_table = fact_cfg['source_table']
        df_source = spark.read.format('delta').load(f'{SILVER_DATA_PATH}/{source_table}')

        # Apply joins if specified; each join reads the dimension from gold,
        # i.e. the table written by the dimension step.
        for join_config in fact_cfg.get('joins') or []:
            dim_table = join_config['table']
            join_key = join_config['on']
            join_type = join_config['type']

            # Read dimension
            df_dim = spark.read.format('delta').load(f'{GOLD_DATA_PATH}/{dim_table}')

            # Perform join
            df_source = df_source.join(df_dim, join_key, join_type)
            print(f"🔗 Joined with {dim_table}")

        # Apply aggregations if specified
        if 'aggregations' in fact_cfg:
            agg_config = fact_cfg['aggregations']
            group_cols = agg_config['group_by']
            metrics = agg_config['metrics']

            # Each metric maps an output column name to a SQL expression.
            # Aliasing through the Column API avoids splicing the name into
            # SQL text, so names needing quoting still work.
            agg_columns = [
                expr(metric_expr).alias(metric_name)
                for metric_name, metric_expr in metrics.items()
            ]

            # Group by and aggregate
            df_source = df_source.groupBy(group_cols).agg(*agg_columns)
            print(f"📈 Applied aggregations")

        # Select final columns
        if 'columns' in fact_cfg:
            columns = fact_cfg['columns']
            df_fact = df_source.select(columns)
        else:
            df_fact = df_source

        # Write to gold (full overwrite on every run)
        gold_fact_path = f'{GOLD_DATA_PATH}/{fact_name}'
        df_fact.write.mode('overwrite').format('delta').save(gold_fact_path)

        # Count once, from the table just written: calling df_fact.count()
        # would re-run every join and aggregation, and the old code did that
        # twice.
        row_count = spark.read.format('delta').load(gold_fact_path).count()
        print(f"✅ {fact_name}: {row_count} rows")
        log_audit(fact_name, 'gold', 'SUCCESS', row_count)

    except Exception as e:
        # Keep going with the remaining facts; the failure is audited and
        # reported in the summary below.
        error_msg = str(e)
        print(f"❌ Error processing {fact_name}: {error_msg}")
        log_audit(fact_name, 'gold', 'FAIL', 0, error_msg)
        failed_facts.append(fact_name)

# Only claim success when every fact was actually written.
if failed_facts:
    print(f"⚠️ Facts finished with {len(failed_facts)} failure(s): {', '.join(failed_facts)}")
else:
    print("✅ Facts created successfully")

In [None]:
# Create analytical views
print("\n🔄 Creating analytical views...")

# Views that failed, so failures are visible next to the final message.
failed_views = []

# The 'analytical_views' section is optional; a missing section means no views.
for view_name, view_config in (gold_cfg.get('analytical_views') or {}).items():
    try:
        print(f"📊 Creating view: {view_name}")

        # Execute the query
        # NOTE(review): no code visible here registers the gold tables as
        # temp views, so the query presumably has to reference tables the
        # session can already resolve (e.g. delta.`<path>`) - confirm.
        query = view_config['query']
        df_view = spark.sql(query)

        # Write to gold (materialised as a Delta table, full overwrite)
        gold_view_path = f'{GOLD_DATA_PATH}/{view_name}'
        df_view.write.mode('overwrite').format('delta').save(gold_view_path)

        # Count once, from the table just written: calling df_view.count()
        # would re-execute the whole query, and the old code did that twice.
        row_count = spark.read.format('delta').load(gold_view_path).count()
        print(f"✅ {view_name}: {row_count} rows")
        log_audit(view_name, 'gold', 'SUCCESS', row_count)

    except Exception as e:
        # Keep going with the remaining views; the failure is audited and
        # reported below.
        error_msg = str(e)
        print(f"❌ Error creating {view_name}: {error_msg}")
        log_audit(view_name, 'gold', 'FAIL', 0, error_msg)
        failed_views.append(view_name)

# Surface view failures so the completion message is not read as "all good".
if failed_views:
    print(f"⚠️ Analytical views finished with {len(failed_views)} failure(s): {', '.join(failed_views)}")

print("\n🎉 Gold layer star schema creation completed!")