# Run Gold Layer Processing Only

This notebook creates fact and dimension tables from existing silver tables.

## 1. Install the Wheel Files (if not already installed)

In [None]:
# Install the wheel files (skip if already installed)
%pip install /lakehouse/default/Files/dist/unified_etl_core-1.0.0-py3-none-any.whl
%pip install /lakehouse/default/Files/dist/unified_etl_connectwise-1.0.0-py3-none-any.whl

from unified_etl_core.main import run_etl_pipeline
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

print("🥇 Starting Gold Layer Processing...")
print("This will create fact tables with surrogate keys and business metrics")

# Build table mappings for schema-enabled lakehouse
table_mappings = {
    "silver": silver_table_mapping,  # From previous cell
    "gold": {}
}

# Add gold table mappings
for entity_name in silver_table_mapping.keys():
    table_mappings["gold"][entity_name] = f"gold.gold_cw_{entity_name}"

# Create entity configurations for gold processing
# Fix the format - calculated_columns should be a dict, not a list
entity_configs = {
    "agreement": {
        "source": "connectwise",
        "surrogate_keys": [
            {"name": "AgreementSK", "business_keys": ["id"]}
        ],
        "business_keys": [
            {"name": "AgreementBusinessKey", "source_columns": ["id"]}
        ],
        "calculated_columns": {
            "estimated_monthly_revenue": "CASE WHEN applicationUnits = 'Amount' THEN COALESCE(applicationLimit, 0) ELSE 0 END"
        }
    }
}

# Add basic configs for other entities
for entity in silver_table_mapping.keys():
    if entity not in entity_configs:
        entity_configs[entity] = {
            "source": "connectwise",
            "surrogate_keys": [
                {"name": f"{entity.title()}SK", "business_keys": ["id"]}
            ],
            "business_keys": [
                {"name": f"{entity.title()}BusinessKey", "source_columns": ["id"]}
            ],
            "calculated_columns": {}  # Empty dict, not list
        }

config = {"entities": entity_configs}

print(f"Processing entities: {list(entity_configs.keys())}")

# Run Gold layer with proper table mappings
run_etl_pipeline(
    integrations=["connectwise"],
    layers=["gold"],
    config=config,
    table_mappings=table_mappings
)

In [None]:
# List silver tables in silver schema
silver_tables = spark.sql("SHOW TABLES IN silver").collect()
print(f"Found {len(silver_tables)} tables in silver schema:")

# Naming prefix that is stripped from a table name to obtain its entity name
SILVER_TABLE_PREFIX = "silver_cw_"

# Build silver table mapping (entity name -> schema-qualified table) and show counts
silver_table_mapping = {}
for row in silver_tables:
    table_name = row.tableName
    full_table_name = f"silver.{table_name}"
    # One COUNT(*) query per table; used only for the progress printout below
    count = spark.sql(f"SELECT COUNT(*) FROM {full_table_name}").collect()[0][0]
    print(f"  - {full_table_name}: {count:,} rows")

    # Extract entity name (e.g., silver_cw_agreement -> agreement).
    # Only a *leading* prefix is removed: str.replace() would also delete the
    # text from the middle of a name and silently produce a mangled entity.
    # Tables without the prefix keep their full name as the entity name.
    if table_name.startswith(SILVER_TABLE_PREFIX):
        entity_name = table_name[len(SILVER_TABLE_PREFIX):]
    else:
        entity_name = table_name
    silver_table_mapping[entity_name] = full_table_name

print(f"\nEntities to process: {list(silver_table_mapping.keys())}")

## 3. Run Gold Layer Processing

Create fact tables with business logic:

In [ ]:
import logging
from unified_etl_core.main import run_etl_pipeline

# Root logger at INFO; each record shows timestamp, logger name and level
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

print("🥇 Starting Gold Layer Processing...")
print("This will create fact tables with business logic")

# Table mappings for the schema-enabled lakehouse.
# `silver_table_mapping` (entity -> silver table) is produced by the
# silver-discovery cell; every silver entity gets one generic fact table.
table_mappings = {
    "silver": silver_table_mapping,
    "gold": {
        f"fact_{silver_entity}": f"gold.gold_cw_fact_{silver_entity}"
        for silver_entity in silver_table_mapping
    },
}

# Specialized fact tables, registered after the generic per-entity ones
table_mappings["gold"].update({
    "fact_agreement_period": "gold.gold_cw_fact_agreement_period",
    "fact_agreement_summary": "gold.gold_cw_fact_agreement_summary",
    "fact_invoice_line": "gold.gold_cw_fact_invoice_line",
    "fact_invoice_header": "gold.gold_cw_fact_invoice_header",
    "fact_invoice_period": "gold.gold_cw_fact_invoice_period",
})


def _basic_entity_config(entity):
    """Return the default gold config for an entity without specialized transforms.

    Key names are derived from ``entity.title()``; both the surrogate key and
    the business key are built from the ``id`` column.
    """
    key_prefix = entity.title()
    return {
        "source": "connectwise",
        "surrogate_keys": [
            {"name": f"{key_prefix}SK", "business_keys": ["id"]},
        ],
        "business_keys": [
            {"name": f"{key_prefix}BusinessKey", "source_columns": ["id"]},
        ],
        "calculated_columns": {},
    }


# Agreement: calculated revenue column plus ConnectWise-specific period and
# summary fact definitions.
agreement_config = {
    "source": "connectwise",
    "surrogate_keys": [
        {"name": "AgreementSK", "business_keys": ["id"]},
    ],
    "business_keys": [
        {"name": "AgreementBusinessKey", "source_columns": ["id"]},
    ],
    "calculated_columns": {
        "estimated_monthly_revenue": "CASE WHEN applicationUnits = 'Amount' THEN COALESCE(applicationLimit, 0) ELSE 0 END",
    },
    "gold_transforms": {
        "fact_agreement_period": {
            "date_spine": {"start": "2020-01-01", "frequency": "month"},
            "metrics": [
                "is_active_period",
                "is_new_agreement",
                "is_churned_agreement",
                "monthly_revenue",
                "prorated_revenue",
                "months_since_start",
                "revenue_change",
                "cumulative_revenue",
            ],
            "keys": [
                {"name": "AgreementPeriodSK", "type": "hash", "source_columns": ["id", "period_start"]},
            ],
        },
        "fact_agreement_summary": {
            "metrics": [
                "lifetime_days",
                "lifetime_months",
                "estimated_lifetime_value",
                "actual_total_revenue",
                "actual_avg_monthly_revenue",
                "active_periods",
            ],
            "keys": [
                {"name": "AgreementSummarySK", "type": "hash", "source_columns": ["id"]},
            ],
        },
    },
}

# Invoice: no calculated columns, but monthly period facts are switched on
invoice_config = {
    "source": "connectwise",
    "surrogate_keys": [
        {"name": "InvoiceSK", "business_keys": ["id"]},
    ],
    "business_keys": [
        {"name": "InvoiceBusinessKey", "source_columns": ["id"]},
    ],
    "calculated_columns": {},
    "enable_period_facts": True,
    "period_type": "month",
}

entity_configs = {
    "agreement": agreement_config,
    "invoice": invoice_config,
}

# Every remaining silver entity falls back to the basic config; entities
# configured above are left untouched.
for silver_entity in silver_table_mapping:
    entity_configs.setdefault(silver_entity, _basic_entity_config(silver_entity))

config = {"entities": entity_configs}

print(f"Processing entities: {list(entity_configs.keys())}")
print("Will use ConnectWise-specific transforms for agreement and invoice entities")

# Run only the gold layer, with the schema-qualified table mappings
run_etl_pipeline(
    integrations=["connectwise"],
    layers=["gold"],
    config=config,
    table_mappings=table_mappings,
)

## 4. Verify Gold Tables

In [None]:
# Inspect every table in the gold schema: row count, column count and a preview
gold_tables = spark.sql("SHOW TABLES IN gold").collect()
print(f"Found {len(gold_tables)} tables in gold schema:")

for table_row in gold_tables:
    full_table_name = f"gold.{table_row.tableName}"
    df = spark.sql(f"SELECT * FROM {full_table_name}")
    all_columns = df.columns
    row_count = df.count()

    print(f"\n{full_table_name}:")
    print(f"  Rows: {row_count:,}")
    print(f"  Columns: {len(all_columns)}")

    # Preview the first five columns, then make sure the business id and the
    # first column whose name contains "SK" are shown too.
    preview_cols = list(all_columns[:5])
    if "id" in all_columns and "id" not in preview_cols:
        preview_cols.append("id")

    first_sk_col = next((name for name in all_columns if "SK" in name), None)
    if first_sk_col is not None and first_sk_col not in preview_cols:
        preview_cols.append(first_sk_col)

    print(f"  Sample data ({', '.join(preview_cols)}):")
    df.select(*preview_cols).show(5, truncate=False)

## 5. Performance Summary

In [None]:
print("\n📊 Gold Layer Processing Summary")
print("=" * 50)

# Per schema: list its tables, count the rows of each, then print the totals
for schema in ("bronze", "silver", "gold"):
    schema_tables = spark.sql(f"SHOW TABLES IN {schema}").collect()

    # (table name, row count) pairs, one COUNT(*) query per table
    row_counts = []
    for table_row in schema_tables:
        name = table_row.tableName
        n_rows = spark.sql(f"SELECT COUNT(*) FROM {schema}.{name}").collect()[0][0]
        row_counts.append((name, n_rows))

    schema_total = sum(n_rows for _, n_rows in row_counts)

    print(f"\n{schema.upper()} Schema:")
    print(f"  Tables: {len(schema_tables)}")
    print(f"  Total rows: {schema_total:,}")
    # Sorted so tables are listed alphabetically
    for name, n_rows in sorted(row_counts):
        print(f"    - {name}: {n_rows:,} rows")

print("\n✅ Gold layer complete - ready for reporting and analytics!")