# Unified ETL Pipeline - Simple Version

This notebook runs the unified ETL framework for the ConnectWise integration by calling the framework's own entry points, without reimplementing any functionality the framework already provides.

## 1. Install Dependencies

In [None]:
# Install the wheel files into the notebook kernel's environment.
# Both wheels are read from the default lakehouse Files area; the version
# (1.0.0) is fixed by the wheel filename. The core package is installed first,
# then the ConnectWise integration package.
%pip install /lakehouse/default/Files/unified_etl_core-1.0.0-py3-none-any.whl
%pip install /lakehouse/default/Files/unified_etl_connectwise-1.0.0-py3-none-any.whl

## 2. Set Credentials

In [None]:
import os
from getpass import getpass

# ConnectWise credentials are secrets and must never be stored in the notebook
# source. They are taken from the kernel's environment when already present
# (e.g. injected by the platform or a secrets manager); anything missing is
# requested interactively with getpass, which does not echo the typed value
# into the cell output.
#
# SECURITY NOTE: an earlier revision of this cell contained literal credential
# values. Treat those values as compromised and rotate them in ConnectWise.
REQUIRED_CW_SETTINGS = ("CW_AUTH_USERNAME", "CW_AUTH_PASSWORD", "CW_CLIENTID")

for setting_name in REQUIRED_CW_SETTINGS:
    if not os.environ.get(setting_name):
        os.environ[setting_name] = getpass(f"{setting_name}: ").strip()

# Fail fast with a clear message rather than letting the pipeline start with
# an empty credential.
missing_settings = [name for name in REQUIRED_CW_SETTINGS if not os.environ.get(name)]
if missing_settings:
    raise RuntimeError(
        "Missing ConnectWise credentials: " + ", ".join(missing_settings)
    )

print("✅ Credentials configured")

## 3. Run Full Pipeline

In [None]:
import logging

from unified_etl_core.main import run_etl_pipeline

# Send INFO-level (and above) log records to the cell output, each prefixed
# with timestamp, logger name and level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Run all three layers for the ConnectWise integration in a single call.
print("🚀 Starting Full ETL Pipeline...")

run_etl_pipeline(
    integrations=["connectwise"],
    layers=["bronze", "silver", "gold"],
    config={},
)

print("\n✅ Pipeline complete!")

## 4. Alternative: Run Specific Layers

In [None]:
# Bronze layer only, for the ConnectWise integration.
# Requires `run_etl_pipeline` to have been imported by the full-pipeline cell.
run_etl_pipeline(
    integrations=["connectwise"],
    layers=["bronze"],
    config={},
)

In [None]:
# Silver layer only, for the ConnectWise integration.
# Assumes the Bronze layer has already been produced.
run_etl_pipeline(
    integrations=["connectwise"],
    layers=["silver"],
    config={},
)

In [None]:
# Gold layer only, for the ConnectWise integration.
# Assumes the Silver layer has already been produced.
run_etl_pipeline(
    integrations=["connectwise"],
    layers=["gold"],
    config={},
)

## 5. Check Results

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

# Reuse the session attached to this notebook. getActiveSession() returns None
# when no session is active on the current thread, so fall back to
# getOrCreate() to guarantee `spark` is usable by this and later cells.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Print the row count of every table in each medallion schema.
for schema in ["bronze", "silver", "gold"]:
    print(f"\n{schema.upper()} Tables:")

    # Only a failure to list the schema is reported as "Schema not found";
    # any other kind of error is allowed to surface instead of being hidden.
    try:
        tables = spark.sql(f"SHOW TABLES IN {schema}").collect()
    except AnalysisException as exc:
        print(f"  Schema not found ({exc})")
        continue

    for row in tables:
        # SHOW TABLES also lists temporary views; they do not belong to the
        # schema and cannot be queried as `schema.view`, so skip them.
        if row.isTemporary:
            continue

        table_name = row.tableName
        # Backtick-quote the identifier so unusual table names stay valid SQL.
        quoted_name = "`" + table_name.replace("`", "``") + "`"
        try:
            count = spark.sql(
                f"SELECT COUNT(*) FROM {schema}.{quoted_name}"
            ).collect()[0][0]
        except Exception as exc:
            # One unreadable table must not stop the rest of the report.
            print(f"  {table_name}: ❌ count failed ({exc})")
            continue

        print(f"  {table_name}: {count:,} rows")

## 6. Generate Dimensions

In [None]:
from unified_etl_core.date_utils import generate_date_dimension
from unified_etl_core.dimensions import create_dimension_from_column

# Date dimension: built only when gold.gold_dim_date is not already present.
# Uses the `spark` session defined in the "Check Results" cell.
date_dim_missing = not spark.catalog.tableExists("gold.gold_dim_date")
if date_dim_missing:
    date_dim = generate_date_dimension(
        spark=spark,
        start_date="2020-01-01",
        end_date="2030-12-31",
        fiscal_year_start_month=7,
    )
    date_dim_writer = date_dim.write.mode("overwrite")
    date_dim_writer.saveAsTable("gold.gold_dim_date")
    print("✅ Created date dimension")

# Remaining dimensions, one per (source table, source column, dimension name).
dimension_configs = [
    ("silver_cw_timeentry", "billableOption", "dim_billable_option"),
    ("silver_cw_timeentry", "status", "dim_time_status"),
    ("silver_cw_agreement", "agreementStatus", "dim_agreement_status"),
    ("silver_cw_invoice", "statusName", "dim_invoice_status"),
]

# Best-effort: a failure on one dimension is reported and the loop moves on
# to the next. Each dimension table is overwritten on every run.
for source_table, column, dim_name in dimension_configs:
    target_table = f"gold.{dim_name}"
    try:
        dim_df = create_dimension_from_column(
            spark=spark,
            source_table=source_table,
            column_name=column,
            dimension_name=dim_name,
        )
        dim_df.write.mode("overwrite").saveAsTable(target_table)
    except Exception as err:
        print(f"❌ Failed to create {dim_name}: {str(err)}")
    else:
        print(f"✅ Created {dim_name}")

## 7. Quick Data Quality Check

In [None]:
# Quick sanity checks against the gold fact tables. Each check is skipped when
# its table does not exist. Uses the `spark` session defined in an earlier cell.

# Check uninvoiced revenue: invoice lines that carry no invoiceId.
if spark.catalog.tableExists("gold.gold_fact_invoice_line"):
    uninvoiced = spark.sql("""
        SELECT
            COUNT(*) AS uninvoiced_lines,
            SUM(lineAmount) AS uninvoiced_amount
        FROM gold.gold_fact_invoice_line
        WHERE invoiceId IS NULL
    """).collect()[0]

    print("\n💰 Uninvoiced Work:")
    print(f"  Lines: {uninvoiced['uninvoiced_lines']:,}")
    # SUM() yields NULL (None) when no row matches or every lineAmount is NULL.
    # Test for None explicitly so a genuine total of 0 is still reported.
    uninvoiced_amount = uninvoiced["uninvoiced_amount"]
    if uninvoiced_amount is not None:
        print(f"  Amount: ${uninvoiced_amount:,.2f}")

# Check time entry breakdown: entry count and total hours per utilizationType.
if spark.catalog.tableExists("gold.gold_fact_time_entry"):
    breakdown = spark.sql("""
        SELECT
            utilizationType,
            COUNT(*) AS entries,
            SUM(actualHours) AS total_hours
        FROM gold.gold_fact_time_entry
        GROUP BY utilizationType
        ORDER BY total_hours DESC
    """).collect()

    print("\n⏰ Time Entry Breakdown:")
    for row in breakdown:
        # SUM(actualHours) is NULL for a group whose hours are all NULL;
        # formatting None with ",.1f" would raise TypeError, so report 0.
        total_hours = row["total_hours"] if row["total_hours"] is not None else 0
        print(
            f"  {row['utilizationType']}: {total_hours:,.1f} hours ({row['entries']:,} entries)"
        )