# Test Unified ETL Pipeline - ConnectWise PSA
This notebook tests the unified ETL framework with all ConnectWise entities

## 1. Install the Unified ETL Packages

In [1]:

# Or install with pip
%pip install /lakehouse/default/Files/unified_etl_core-1.0.0-py3-none-any.whl
%pip install /lakehouse/default/Files/unified_etl_connectwise-1.0.0-py3-none-any.whl


StatementMeta(, c4fd79d6-91cd-45aa-820c-b388960c303b, 9, Finished, Available, Finished)

Processing /lakehouse/default/Files/unified_etl_core-1.0.0-py3-none-any.whl
Collecting delta-spark>=2.2.0 (from unified-etl-core==1.0.0)
  Downloading delta_spark-3.3.1-py3-none-any.whl.metadata (1.9 kB)
Collecting pydantic>=2.11.4 (from unified-etl-core==1.0.0)
  Downloading pydantic-2.11.5-py3-none-any.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.2/67.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting sparkdantic (from unified-etl-core==1.0.0)
  Downloading sparkdantic-2.4.0-py3-none-any.whl.metadata (7.6 kB)
Collecting pyspark>=3.3.0 (from unified-etl-core==1.0.0)
  Downloading pyspark-3.5.5.tar.gz (317.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.2/317.2 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting annotated-types>=0.6.0 (from pydantic>=2.11.4->unified-etl-core==1.0.0)
  Downloading annotated_types-0.7.0-py3-

## 2. Test Model Generation for All Entities

In [2]:
# Import all generated models
from unified_etl_connectwise.models import (
    Agreement,
    TimeEntry,
    ExpenseEntry,
    ProductItem,
    PostedInvoice,
    Invoice as UnpostedInvoice,  # UnpostedInvoice uses Invoice model
)
from unified_etl_connectwise.utils.api_utils import get_fields_for_api_call

# Test that all models work
model_mapping = {
    "Agreement": Agreement,
    "TimeEntry": TimeEntry,
    "ExpenseEntry": ExpenseEntry,
    "ProductItem": ProductItem,
    "PostedInvoice": PostedInvoice,
    "UnpostedInvoice": UnpostedInvoice,
}

print("Testing all ConnectWise models:")
print("=" * 50)

for entity_name, model_class in model_mapping.items():
    # Test field generation
    fields = get_fields_for_api_call(model_class, max_depth=2)
    field_count = len(fields.split(','))
    
    # Test Spark schema generation
    spark_schema = model_class.model_spark_schema()
    
    print(f"\n{entity_name}:")
    print(f"  - API fields: {field_count}")
    print(f"  - Spark schema fields: {len(spark_schema.fields)}")
    print(f"  - Sample fields: {', '.join(fields.split(',')[:5])}...")

StatementMeta(, c4fd79d6-91cd-45aa-820c-b388960c303b, 11, Finished, Available, Finished)

Testing all ConnectWise models:

Agreement:
  - API fields: 88
  - Spark schema fields: 82
  - Sample fields: agreementStatus, allowOverruns, applicationCycle, applicationLimit, applicationUnits...

TimeEntry:
  - API fields: 61
  - Spark schema fields: 62
  - Sample fields: activity, actualHours, addToDetailDescriptionFlag, addToInternalAnalysisFlag, addToResolutionFlag...

ExpenseEntry:
  - API fields: 32
  - Spark schema fields: 31
  - Sample fields: agreement, agreementAmount, amount, billAmount, billableOption...

ProductItem:
  - API fields: 80
  - Spark schema fields: 78
  - Sample fields: addComponentsFlag, agreement, agreementAmount, asioSubscriptionsID, billableOption...

PostedInvoice:
  - API fields: 80
  - Spark schema fields: 78
  - Sample fields: accountNumber, addToBatchEmailList, adjustedBy, adjustmentReason, agreement...

UnpostedInvoice:
  - API fields: 80
  - Spark schema fields: 78
  - Sample fields: accountNumber, addToBatchEmailList, adjustedBy, adjustmentReason,

## 3. Configure ConnectWise Connection

In [3]:
# Set credentials directly
import os

os.environ["CW_AUTH_USERNAME"] = "thekking+yemGyHDPdJ1hpuqx"
os.environ["CW_AUTH_PASSWORD"] = "yMqpe26Jcu55FbQk"
os.environ["CW_CLIENTID"] = "c7ea92d2-eaf5-4bfb-a09c-58d7f9dd7b81"
os.environ["CW_BASE_URL"] = "https://verk.thekking.is/v4_6_release/apis/3.0"

# Configure extractor (even though it doesn't use these values, it expects a config)
config = {
    "base_url": os.environ["CW_BASE_URL"],
    "auth": {
        "type": "api_key",
        "credentials": {
            "company": "thekking",
            "public_key": "yemGyHDPdJ1hpuqx",
            "private_key": "yMqpe26Jcu55FbQk",
            "client_id": "c7ea92d2-eaf5-4bfb-a09c-58d7f9dd7b81",
        }
    }
}

print(f"Configured for company: thekking")
print(f"Base URL: {os.environ['CW_BASE_URL']}")

StatementMeta(, c4fd79d6-91cd-45aa-820c-b388960c303b, 12, Finished, Available, Finished)

Configured for company: thekking
Base URL: https://verk.thekking.is/v4_6_release/apis/3.0


## 4. Test Bronze Layer Extraction for All Entities

In [4]:
from unified_etl_connectwise.extract import ConnectWiseExtractor
from datetime import datetime

# Create extractor
extractor = ConnectWiseExtractor(config)

# Define endpoints for each entity
entity_endpoints = {
    "Agreement": "/finance/agreements",
    "TimeEntry": "/time/entries",
    "ExpenseEntry": "/expense/entries",
    "ProductItem": "/procurement/products",
    "PostedInvoice": "/finance/invoices/posted",
    "UnpostedInvoice": "/finance/invoices",
}

# Extract a small sample from each entity
bronze_base_path = "/lakehouse/default/Tables/bronze"
extraction_results = {}

for entity_name, endpoint in entity_endpoints.items():
    print(f"\nExtracting {entity_name} from {endpoint}...")
    
    try:
        # Extract with small page size for testing
        df = extractor.extract(
            endpoint=endpoint,
            page_size=1000,  # Small sample
        )
        
        record_count = df.count()
        extraction_results[entity_name] = {
            "success": True,
            "count": record_count,
            "df": df
        }
        
        # Save to bronze
        bronze_path = f"{bronze_base_path}/bronze_cw_{entity_name.lower()}"
        df.write.mode("overwrite").format("delta").save(bronze_path)
        
        print(f"✅ Extracted {record_count} records")
        print(f"   Saved to: {bronze_path}")
        
    except Exception as e:
        extraction_results[entity_name] = {
            "success": False,
            "error": str(e)
        }
        print(f"❌ Failed: {str(e)}")

StatementMeta(, c4fd79d6-91cd-45aa-820c-b388960c303b, 13, Submitted, Running, Running)


Extracting Agreement from /finance/agreements...
❌ Failed: An error occurred while calling o6477.save.
: Operation failed: "Bad Request", 400, HEAD, http://onelake.dfs.fabric.microsoft.com/a3a23dd7-9f52-4b88-b056-46da3617c0b2/lakehouse/default/Tables/bronze/bronze_cw_agreement/_delta_log?upn=false&action=getStatus&timeout=90
	at org.apache.hadoop.fs.azurebfs.services.AbfsRestOperation.completeExecute(AbfsRestOperation.java:231)
	at org.apache.hadoop.fs.azurebfs.services.AbfsRestOperation.lambda$execute$0(AbfsRestOperation.java:191)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfInvocation(IOStatisticsBinding.java:464)
	at org.apache.hadoop.fs.azurebfs.services.AbfsRestOperation.execute(AbfsRestOperation.java:189)
	at org.apache.hadoop.fs.azurebfs.services.AbfsClient.getPathStatus(AbfsClient.java:779)
	at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.getFileStatus(AzureBlobFileSystemStore.java:1067)
	at org.apache.hadoop.fs.azurebfs.AzureBlobFileSy

## 5. Display Sample Data

In [None]:
# Show sample data from successful extractions
for entity_name, result in extraction_results.items():
    if result["success"] and result["count"] > 0:
        print(f"\n{entity_name} Sample (first 3 records):")
        print("=" * 80)
        result["df"].show(3, truncate=False)
        print("\nSchema:")
        result["df"].printSchema()

StatementMeta(, , -1, Waiting, , Waiting)

## 6. Test Silver Layer Validation

In [None]:
from unified_etl_core.extract.base import validate_batch

# Test validation for each entity type
validation_results = {}

for entity_name, result in extraction_results.items():
    if result["success"] and result["count"] > 0:
        print(f"\nValidating {entity_name}...")
        
        # Get sample data as list of dicts
        sample_data = result["df"].limit(5).toPandas().to_dict('records')
        
        # Get model class
        model_class = model_mapping[entity_name]
        
        # Validate
        valid_models, errors = validate_batch(sample_data, model_class)
        
        validation_results[entity_name] = {
            "total": len(sample_data),
            "valid": len(valid_models),
            "errors": len(errors)
        }
        
        print(f"✅ Valid: {len(valid_models)}/{len(sample_data)}")
        if errors:
            print(f"⚠️  Errors: {len(errors)}")
            print(f"   First error: {errors[0]['errors'][0] if errors else 'None'}")

StatementMeta(, , -1, Waiting, , Waiting)

## 7. Summary Report

In [None]:
print("🎉 Unified ETL Pipeline Test Summary")
print("=" * 50)
print("\nModel Generation:")
for entity in model_mapping.keys():
    print(f"  - {entity}: ✅")

print("\nBronze Layer Extraction:")
for entity, result in extraction_results.items():
    if result["success"]:
        print(f"  - {entity}: ✅ ({result['count']} records)")
    else:
        print(f"  - {entity}: ❌ ({result['error']})")

print("\nSilver Layer Validation:")
for entity, result in validation_results.items():
    print(f"  - {entity}: {result['valid']}/{result['total']} valid")

print("\nNext Steps:")
print("  1. Implement full Silver transformations (flattening, standardization)")
print("  2. Add Gold layer with business logic")
print("  3. Configure incremental processing")
print("  4. Add Business Central entities")

StatementMeta(, , -1, Waiting, , Waiting)