# Run Silver Layer Processing Only

This notebook runs only the silver layer: it reads the existing bronze tables and writes them to silver with schema transformations applied.

## 1. Install the Wheel Files

First, upload the wheel files to the `Files/dist` folder of your default Lakehouse, then install them:

In [None]:
# Install the two project wheels (unified_etl_core, then unified_etl_connectwise)
# from the default Lakehouse's Files/dist folder into this notebook session.
%pip install /lakehouse/default/Files/dist/unified_etl_core-1.0.0-py3-none-any.whl
%pip install /lakehouse/default/Files/dist/unified_etl_connectwise-1.0.0-py3-none-any.whl

## 2. Check Existing Bronze Tables

Let's see what bronze tables we have with data:

In [ ]:
# Find every table matching the bronze_cw_ naming pattern and display the listing.
bronze_tables = spark.sql("SHOW TABLES LIKE 'bronze_cw_*'")
bronze_tables.show()

# Report the number of rows held by each matching table.
print("\nBronze Table Row Counts:")
print("=" * 50)
for bronze_row in bronze_tables.collect():
    bronze_name = bronze_row.tableName
    print(f"{bronze_name}: {spark.table(bronze_name).count():,} rows")

In [ ]:
# Alternative: Automatically discover table mappings
def discover_table_mappings(spark, prefix_pattern="bronze_cw_"):
    """Discover existing bronze tables and derive per-layer table mappings.

    Args:
        spark: Object exposing ``sql(query)`` whose result has ``collect()``
            returning rows with a ``tableName`` attribute (a Spark session).
        prefix_pattern: Table-name prefix identifying bronze tables. It is
            interpolated directly into a ``SHOW TABLES LIKE`` pattern, so it
            should be a trusted value.

    Returns:
        A dict with ``"bronze"``, ``"silver"`` and ``"gold"`` keys. Each maps
        an entity name (the table name with the leading prefix removed) to a
        table name. Silver and gold names are assumed to follow the
        ``silver_cw_<entity>`` / ``gold_cw_<entity>`` pattern; those tables
        are not checked for existence.
    """
    tables = spark.sql(f"SHOW TABLES LIKE '{prefix_pattern}*'").collect()

    mappings = {"bronze": {}, "silver": {}, "gold": {}}
    prefix_len = len(prefix_pattern)

    for row in tables:
        table_name = row.tableName
        if not table_name.startswith(prefix_pattern):
            continue
        # Strip only the leading prefix. str.replace would also delete any
        # later occurrence of the prefix text inside the entity name.
        entity_name = table_name[prefix_len:]
        mappings["bronze"][entity_name] = table_name
        # Assume silver and gold follow the same naming pattern.
        mappings["silver"][entity_name] = f"silver_cw_{entity_name}"
        mappings["gold"][entity_name] = f"gold_cw_{entity_name}"

    return mappings

# Build the mappings from the tables Spark can see, then print one section
# per layer that has at least one entry.
discovered_mappings = discover_table_mappings(spark)
print("Discovered table mappings:")
for layer_name, layer_tables in discovered_mappings.items():
    if not layer_tables:
        continue  # nothing was discovered for this layer
    print(f"\n{layer_name.upper()}:")
    for entity_name, mapped_table in layer_tables.items():
        print(f"  {entity_name} -> {mapped_table}")

# Alternative: Automatically discover table mappings with full paths
def discover_table_mappings(spark, prefix_pattern="bronze_cw_"):
    """Discover existing bronze tables and map them to fully qualified names.

    Args:
        spark: Object exposing ``sql(query)`` whose result has ``collect()``
            returning rows with a ``tableName`` attribute (a Spark session).
        prefix_pattern: Table-name prefix identifying bronze tables. It is
            interpolated directly into a ``SHOW TABLES LIKE`` pattern, so it
            should be a trusted value.

    Returns:
        A dict with ``"bronze"``, ``"silver"`` and ``"gold"`` keys. Each maps
        an entity name (the table name with the leading prefix removed) to a
        name of the form ``Lakehouse.<layer>.<table>``. Silver and gold names
        are derived by convention; those tables are not checked for existence.
    """
    tables = spark.sql(f"SHOW TABLES LIKE '{prefix_pattern}*'").collect()

    mappings = {"bronze": {}, "silver": {}, "gold": {}}
    prefix_len = len(prefix_pattern)

    for row in tables:
        table_name = row.tableName
        if not table_name.startswith(prefix_pattern):
            continue
        # Strip only the leading prefix. str.replace would also delete any
        # later occurrence of the prefix text inside the entity name.
        entity_name = table_name[prefix_len:]
        # Use fully qualified names.
        mappings["bronze"][entity_name] = f"Lakehouse.bronze.{table_name}"
        mappings["silver"][entity_name] = f"Lakehouse.silver.silver_cw_{entity_name}"
        mappings["gold"][entity_name] = f"Lakehouse.gold.gold_cw_{entity_name}"

    return mappings

# Run discovery against the current Spark session and list the result,
# skipping layers for which nothing was mapped.
discovered_mappings = discover_table_mappings(spark)
print("Discovered table mappings:")
for tier, tier_entries in discovered_mappings.items():
    if not tier_entries:
        continue
    heading = tier.upper()
    print(f"\n{heading}:")
    for entity_key, qualified_name in tier_entries.items():
        print(f"  {entity_key} -> {qualified_name}")

In [ ]:
from unified_etl_core.main import run_etl_pipeline
import logging

# Send INFO-level (and above) log records to the root logger with a
# timestamp / logger name / level / message layout.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Discover existing bronze tables in schema-enabled lakehouse
print("🔍 Discovering tables in schema-enabled lakehouse...")

# Schema-enabled lakehouses require namespaced table references
# (per the original note: workspace.lakehouse.schema.table).

# Step 1: list whatever lives in the bronze schema; a failure here is
# reported (truncated to 200 characters) rather than aborting the cell.
print("\n1. Listing tables in bronze schema:")
try:
    bronze_tables = spark.sql("SHOW TABLES IN bronze").collect()
    bronze_total = len(bronze_tables)
    print(f"   Found {bronze_total} tables in bronze schema")
    for bronze_row in bronze_tables:
        print(f"   - {bronze_row.tableName}")
except Exception as listing_error:
    error_text = str(listing_error)
    print(f"   Error: {error_text[:200]}...")

# Step 2: list everything in the current database as an alternative view.
print("\n2. Listing all tables in current database:")
all_tables = spark.sql("SHOW TABLES").collect()
total_tables = len(all_tables)
print(f"   Found {total_tables} total tables")

# Build table mappings dynamically: layer -> {entity -> table reference}
table_mappings = {"bronze": {}, "silver": {}, "gold": {}}

# For schema-enabled lakehouse, tables are referenced as schema.table
print("\n3. Checking bronze schema tables:")
# Lowercase entity names; each is the suffix of a bronze_cw_<entity> table.
bronze_entity_names = ["agreement", "invoice", "timeentry", "expenseentry", "productitem"]

for entity in bronze_entity_names:
    # Candidate references, tried in order; the first one that can be
    # queried is recorded for this entity.
    table_candidates = [
        f"bronze.bronze_cw_{entity}",  # schema.table format
        f"bronze_cw_{entity}",          # just table name
        f"`bronze`.`bronze_cw_{entity}`"  # quoted format
    ]

    for table_name in table_candidates:
        try:
            count = spark.sql(f"SELECT COUNT(*) FROM {table_name}").collect()[0][0]
        except Exception:
            # This reference could not be queried; fall through to the next
            # candidate. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit able to stop the cell.
            continue
        print(f"   ✓ Found {entity}: {table_name} ({count:,} rows)")
        table_mappings["bronze"][entity] = table_name
        table_mappings["silver"][entity] = f"silver.silver_cw_{entity}"
        table_mappings["gold"][entity] = f"gold.gold_cw_{entity}"
        break
    else:
        # No candidate worked: say so instead of silently omitting the entity.
        print(f"   ✗ {entity}: no queryable bronze table found")

print(f"\n✅ Discovered {len(table_mappings['bronze'])} bronze tables")
print(f"Table mappings: {table_mappings}")

# Launch the silver layer only when at least one bronze table was mapped;
# otherwise tell the user to check the schema configuration.
if not table_mappings["bronze"]:
    print("\n❌ No bronze tables found. Check schema configuration.")
else:
    print("\n🚀 Starting Silver Layer Processing...")
    run_etl_pipeline(
        table_mappings=table_mappings,
        config={},
        layers=["silver"],
        integrations=["connectwise"],
    )

## 4. Verify Silver Tables

In [ ]:
# Find every table matching the silver_cw_ naming pattern and display the listing.
silver_tables = spark.sql("SHOW TABLES LIKE 'silver_cw_*'")
silver_tables.show()

# For each silver table: row count, column count, first five column names,
# and a five-row preview.
print("\nSilver Table Summary:")
print("=" * 50)
for silver_row in silver_tables.collect():
    silver_name = silver_row.tableName
    silver_df = spark.table(silver_name)
    row_total = silver_df.count()
    print(f"\n{silver_name}: {row_total:,} rows")

    column_names = silver_df.columns
    print(f"Columns: {len(column_names)}")
    leading_columns = ", ".join(column_names[:5])
    print(f"Sample columns: {leading_columns}...")

    # Preview the first rows of the table.
    print("\nSample data:")
    silver_df.show(5, truncate=True)

## 5. Performance Summary

In [ ]:
print("\n📊 Silver Layer Processing Summary")
print("=" * 50)

# Collect the row count of every bronze_cw_/silver_cw_ table in the
# current database, keyed by table name.
all_tables = spark.sql("SHOW TABLES")
table_sizes = {}

for row in all_tables.collect():
    table = row.tableName
    if not table.startswith(('bronze_cw_', 'silver_cw_')):
        continue
    try:
        table_sizes[table] = spark.table(table).count()
    except Exception as exc:
        # Best effort: an unreadable table is left out of the totals, but the
        # omission is reported. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit able to stop the cell.
        print(f"⚠️ Skipping {table}: {str(exc)[:200]}")

# Group by layer
bronze = {k: v for k, v in table_sizes.items() if k.startswith('bronze_cw_')}
silver = {k: v for k, v in table_sizes.items() if k.startswith('silver_cw_')}

print(f"Bronze Tables: {len(bronze)} tables, {sum(bronze.values()):,} total rows")
print(f"Silver Tables: {len(silver)} tables, {sum(silver.values()):,} total rows")

# Closing banner: completion notice, notes about this version, and the next step.
closing_lines = [
    "\n✅ Silver processing completed successfully!",
    "\nKey improvements in this version:",
    "- No collect() or row-by-row processing",
    "- Distributed Spark operations throughout",
    "- Scalable to millions of rows",
    "\nNext step: Run Gold layer processing in a separate notebook",
]
for closing_line in closing_lines:
    print(closing_line)