# PyForge Databricks Integration - Volume Operations & Serverless Optimization
## Complete Walkthrough with Databricks SDK Integration

This notebook demonstrates how to use PyForge with Databricks Volumes, showcasing:
- **Databricks SDK Integration**: Direct Volume file operations
- **Serverless Environment Detection**: Automatic optimization for serverless compute
- **Volume-to-Volume Processing**: Direct conversion without local downloads
- **Format-Specific Routing**: Native Databricks processing for supported formats

### Prerequisites:
- Running in a Databricks environment (workspace or serverless)
- Unity Catalog enabled with Volume access
- PyForge packages installed: `pyforge-core` and `pyforge-databricks`

## 1. Installation and Environment Setup

In [None]:
# Install PyForge packages
%pip install pyforge-core pyforge-databricks --quiet

# Restart Python kernel to ensure clean imports
# Note: In serverless, packages are installed at the environment level
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Import required libraries
import importlib.metadata
import os
import sys
from datetime import datetime
from pathlib import Path

# Databricks SDK imports
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog

# PyForge imports
from pyforge_core import PyForgeCore
from pyforge_databricks import PyForgeDatabricks

# Spark imports (pre-installed in Databricks)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp

# Startup banner: confirm the imports succeeded and report the interpreter
# version plus the value of DATABRICKS_RUNTIME_VERSION (or a fallback label
# when that environment variable is not set).
runtime_label = os.environ.get('DATABRICKS_RUNTIME_VERSION', 'Local environment')
startup_banner = (
    "🚀 PyForge Databricks Integration loaded successfully!",
    f"📍 Python version: {sys.version}",
    f"📦 Running in: {runtime_label}",
)
for banner_line in startup_banner:
    print(banner_line)

## 2. Environment Detection and Validation

In [None]:
# Create the PyForge Databricks entry point used by every later cell.
forge = PyForgeDatabricks()

# Snapshot of the detected environment, indexed by the keys listed below.
env_info = forge.env.get_environment_info()

# (display label, env_info key) pairs, in the order they are printed.
env_report_rows = [
    ("📊 Databricks Environment", 'is_databricks'),
    ("⚡ Serverless Compute", 'is_serverless'),
    ("🔢 Environment Version", 'environment_version'),
    ("🐍 Python Version", 'python_version'),
    ("✨ Spark Version", 'spark_version'),
    ("🏢 Workspace URL", 'workspace_url'),
]

print("🔍 Environment Detection Results:")
print("=" * 50)
for row_label, info_key in env_report_rows:
    print(f"{row_label}: {env_info[info_key]}")

# Round-trip to the workspace to prove the SDK client is usable; any failure
# is reported rather than raised so the notebook keeps running.
print("\n🔌 Validating Databricks SDK connection...")
try:
    current_user = forge.w.current_user.me()
    print(f"✅ Connected as: {current_user.display_name}")
    print(f"📧 Email: {current_user.user_name}")
except Exception as e:
    print(f"❌ SDK connection failed: {e!s}")

## 3. Volume Configuration and Access

In [None]:
# Unity Catalog coordinates — update these to match your own setup.
CATALOG = "main"  # or your catalog name
SCHEMA = "default"  # or your schema name
BRONZE_VOLUME = "bronze"  # raw data volume
SILVER_VOLUME = "silver"  # processed data volume

# Both volumes share the same /Volumes/<catalog>/<schema> prefix.
volume_root = f"/Volumes/{CATALOG}/{SCHEMA}"
bronze_path = f"{volume_root}/{BRONZE_VOLUME}"
silver_path = f"{volume_root}/{SILVER_VOLUME}"

print("📁 Volume Configuration:")
for layer_label, layer_path in (
    ("Bronze (Raw)", bronze_path),
    ("Silver (Processed)", silver_path),
):
    print(f"   {layer_label}: {layer_path}")

# Enumerate the volumes visible in the configured schema; a failure here is
# reported as a warning instead of stopping the notebook.
print("\n📂 Available Volumes in Catalog:")
try:
    volumes = forge.w.volumes.list(catalog_name=CATALOG, schema_name=SCHEMA)
    for vol in volumes:
        print(f"   - {vol.name} ({vol.volume_type}): {vol.full_name}")
except Exception as e:
    print(f"   ⚠️ Could not list volumes: {e!s}")
    print("   Please ensure you have access to Unity Catalog volumes")

## 4. Volume File Operations with Databricks SDK

In [None]:
# Round-trip a small CSV through the bronze Volume: write, list, read back.
test_file_path = f"{bronze_path}/test_data.csv"

# Header plus five data rows; every line (including the last) ends in "\n".
sample_csv_rows = [
    "id,name,value,date",
    "1,Alice,100.5,2024-01-01",
    "2,Bob,200.3,2024-01-02",
    "3,Charlie,150.7,2024-01-03",
    "4,Diana,300.1,2024-01-04",
    "5,Eve,250.9,2024-01-05",
]
sample_csv_content = "".join(f"{csv_row}\n" for csv_row in sample_csv_rows)

# Step 1: write the UTF-8 encoded CSV to the Volume.
print("📝 Writing sample file to Volume...")
try:
    csv_bytes = sample_csv_content.encode('utf-8')
    forge.volume_handler.write_file(test_file_path, csv_bytes)
    print(f"✅ File written to: {test_file_path}")
except Exception as e:
    print(f"❌ Write failed: {e!s}")

# Step 2: list the bronze Volume, capped at the first 10 entries.
print("\n📋 Files in Bronze Volume:")
try:
    files = forge.volume_handler.list_files(bronze_path)
    listing_preview = files[:10]
    for entry in listing_preview:
        print(f"   - {entry.path} ({entry.file_size} bytes)")
except Exception as e:
    print(f"   ⚠️ Could not list files: {e!s}")

# Step 3: read the file back and show the first 200 decoded characters.
print("\n📖 Reading file from Volume...")
try:
    content = forge.volume_handler.read_file(test_file_path)
    print("✅ File content (first 200 chars):")
    decoded_preview = content.decode('utf-8')[:200]
    print(decoded_preview)
except Exception as e:
    print(f"❌ Read failed: {e!s}")

## 5. Format-Specific Processing with Serverless Optimization

In [None]:
# Show which processing strategy PyForge selects for each file extension.
test_formats = [
    ("data.csv", "CSV - Native Spark processing"),
    ("report.xlsx", "Excel - Hybrid processing"),
    ("config.json", "JSON - Native Spark processing"),
    ("catalog.xml", "XML - Native Spark processing"),
    ("document.pdf", "PDF - PyForge converter"),
    ("legacy.mdb", "Access DB - PyForge converter"),
    ("archive.dbf", "dBase - PyForge converter")
]

# Strategy names that get the serverless notes (only when running serverless).
_NATIVE_STRATEGIES = ('serverless_native', 'databricks_native')


def _strategy_notes(strategy, serverless):
    """Return ``(heading, bullets)`` describing a strategy, or ``None``.

    The serverless notes apply only when ``serverless`` is truthy and the
    strategy is one of the native ones; otherwise the hybrid-Excel and
    PyForge-converter notes are selected by exact strategy name. Any other
    combination yields ``None`` (nothing extra is printed).
    """
    if serverless and strategy in _NATIVE_STRATEGIES:
        return "   ⚡ Serverless Optimizations:", (
            "Automatic scaling",
            "Photon acceleration",
            "Adaptive query execution",
        )
    if strategy == 'hybrid_excel':
        return "   🔄 Hybrid Processing:", (
            "PyForge sheet analysis",
            "Spark DataFrame output",
            "Column signature matching",
        )
    if strategy == 'pyforge_converter':
        return "   🔧 Specialized Converter:", (
            "Format-specific parser",
            "String normalization",
            "Memory optimization",
        )
    return None


print("🎯 Processing Strategy Selection:")
print("=" * 60)

for filename, description in test_formats:
    # Strategy lookup is keyed by the lower-cased extension, e.g. ".csv".
    file_ext = Path(filename).suffix.lower()
    strategy = forge.processor.get_processing_strategy(file_ext)

    for summary_line in (
        f"\n📄 {filename}",
        f"   Format: {description}",
        f"   Strategy: {strategy}",
    ):
        print(summary_line)

    notes = _strategy_notes(strategy, forge.env.is_serverless)
    if notes is not None:
        heading, bullets = notes
        print(heading)
        for bullet in bullets:
            print(f"      - {bullet}")

## 6. Volume-to-Volume Data Conversion

In [ ]:
    # Perform conversion
    result = forge.convert(
        input_path=source_csv,
        output_path=target_parquet,
        format_options={
            'compression': 'snappy',
            'schema_inference': True
        }
    )

In [None]:
# Example 2: Excel -> Delta Lake, combining sheets via column signatures.
source_excel = f"{bronze_path}/financial_report.xlsx"
target_delta = f"{silver_path}/financial_report_delta"

# Per-sheet sample DataFrames keyed by sheet name.
# NOTE(review): these frames are only built in memory; nothing in this cell
# writes them to source_excel, despite the message printed below.
# NOTE(review): `spark` is used here before the later cell that assigns it —
# presumably this relies on a notebook-provided global; confirm.
print("📊 Creating sample Excel file...")
sales_columns = ['product', 'revenue', 'quarter']
q1_sales_rows = [
    ('Product A', 10000, 'Q1'),
    ('Product B', 15000, 'Q1'),
    ('Product C', 12000, 'Q1'),
]
q2_sales_rows = [
    ('Product A', 12000, 'Q2'),
    ('Product B', 18000, 'Q2'),
    ('Product C', 14000, 'Q2'),
]
sample_excel_data = {
    'Q1_Sales': spark.createDataFrame(q1_sales_rows, sales_columns),
    'Q2_Sales': spark.createDataFrame(q2_sales_rows, sales_columns),
}

# Note: In real scenario, Excel file would already exist in Volume
print("\n🔄 Converting Excel to Delta Lake...")
print(f"   Source: {source_excel}")
print(f"   Target: {target_delta}")

# Multi-sheet options passed through to the converter unchanged.
excel_format_options = {
    'combine_sheets': True,
    'sheet_matching_strategy': 'column_signature',
    'partition_by': ['quarter'],
    'overwrite_mode': 'overwrite',
}

try:
    result = forge.convert_from_volume(
        input_path=source_excel,
        output_path=target_delta,
        output_format='delta',
        format_options=excel_format_options,
    )

    print("✅ Excel to Delta conversion successful!")
    # Printed one at a time so a missing key only affects the lines after it.
    for result_label, result_key in (
        ("Sheets processed", 'sheets_processed'),
        ("Total rows", 'row_count'),
        ("Delta table location", 'output_path'),
    ):
        print(f"   {result_label}: {result[result_key]}")

except Exception as e:
    print(f"⚠️ Excel conversion simulated: {e!s}")
    print("   In production, ensure Excel file exists in Volume")

## 7. Direct Spark DataFrame Operations with Volumes

In [None]:
# Use the session's active Spark handle to work on Volume paths directly.
spark = SparkSession.getActiveSession()

print("📊 Direct Spark operations on Volume data:")
print("=" * 50)

# Source: the sample CSV in the bronze Volume.
csv_volume_path = f"{bronze_path}/test_data.csv"
print(f"\n📖 Reading CSV from Volume: {csv_volume_path}")

try:
    # Header row supplies column names; column types are inferred.
    df = spark.read.csv(csv_volume_path, header=True, inferSchema=True)

    print("✅ Data loaded successfully!")
    row_total = df.count()
    print(f"   Rows: {row_total}")
    column_total = len(df.columns)
    print(f"   Columns: {column_total}")

    print("\n📋 Schema:")
    df.printSchema()

    print("\n📊 Sample Data:")
    display(df)

    # Add an audit timestamp, a derived numeric column and an engine tag.
    print("\n🔧 Applying transformations...")
    transformed_df = (
        df.withColumn("processed_timestamp", current_timestamp())
        .withColumn("value_doubled", col("value") * 2)
        .withColumn("processing_engine", lit("Databricks Serverless"))
    )

    # Persist to the silver Volume as Parquet, replacing any earlier output.
    output_path = f"{silver_path}/transformed_data.parquet"
    print(f"\n💾 Writing transformed data to: {output_path}")

    parquet_writer = transformed_df.write.mode("overwrite")
    parquet_writer.parquet(output_path)

    print("✅ Data written successfully!")

except Exception as e:
    print(f"❌ Operation failed: {e!s}")

## 8. Batch Processing Multiple Files in Volumes

In [None]:
# Batch process multiple files from Volume
print("📦 Batch Processing Example:")
print("=" * 50)

# (Volume path, CSV text) pairs used as inputs for the batch run.
batch_files = [
    (f"{bronze_path}/sales_jan.csv", "id,product,amount\n1,A,100\n2,B,200"),
    (f"{bronze_path}/sales_feb.csv", "id,product,amount\n3,A,150\n4,B,250"),
    (f"{bronze_path}/sales_mar.csv", "id,product,amount\n5,A,200\n6,B,300")
]

# Write sample files
print("\n📝 Creating sample files for batch processing...")
for file_path, content in batch_files:
    try:
        forge.volume_handler.write_file(file_path, content.encode('utf-8'))
        print(f"   ✅ Created: {Path(file_path).name}")
    except Exception as e:
        # Best-effort setup: keep going, but report the failure so a missing
        # input is not first discovered as a confusing conversion error.
        print(f"   ⚠️ Could not create {Path(file_path).name}: {str(e)}")

# Batch convert all CSV files to Parquet
print("\n🔄 Batch converting CSV files to Parquet...")
# One dict per file: always 'file' and 'status'; 'rows'/'duration' on
# success, 'error' on failure.
batch_results = []

for file_path, _ in batch_files:
    filename = Path(file_path).stem
    output_path = f"{silver_path}/{filename}.parquet"

    try:
        print(f"\n   Processing: {filename}.csv")

        result = forge.convert_from_volume(
            input_path=file_path,
            output_path=output_path,
            format_options={'compression': 'snappy'}
        )

        batch_results.append({
            'file': filename,
            'status': 'success',
            'rows': result.get('row_count', 0),
            'duration': result.get('duration', 0)
        })

        print(f"      ✅ Converted in {result.get('duration', 0):.2f}s")

    except Exception as e:
        # Record the failure and continue with the remaining files.
        batch_results.append({
            'file': filename,
            'status': 'failed',
            'error': str(e)
        })
        print(f"      ❌ Failed: {str(e)}")

# Display batch results summary
print("\n📊 Batch Processing Summary:")
print("=" * 50)
success_count = sum(1 for r in batch_results if r['status'] == 'success')
print(f"✅ Successful: {success_count}/{len(batch_results)}")
print(f"❌ Failed: {len(batch_results) - success_count}/{len(batch_results)}")

# Totals are only meaningful when at least one conversion succeeded.
if success_count > 0:
    total_rows = sum(r.get('rows', 0) for r in batch_results if r['status'] == 'success')
    total_time = sum(r.get('duration', 0) for r in batch_results if r['status'] == 'success')
    print(f"📊 Total rows processed: {total_rows}")
    print(f"⏱️  Total processing time: {total_time:.2f}s")

## 9. Advanced Volume Operations with PyForge

In [None]:
# Advanced example: flatten a small hierarchical XML catalog into Parquet.
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<catalog>
    <product id="1">
        <name>Widget A</name>
        <category>Electronics</category>
        <price currency="USD">29.99</price>
        <features>
            <feature>Waterproof</feature>
            <feature>Bluetooth</feature>
        </features>
    </product>
    <product id="2">
        <name>Widget B</name>
        <category>Home</category>
        <price currency="USD">45.50</price>
        <features>
            <feature>Energy Efficient</feature>
        </features>
    </product>
</catalog>
"""

xml_path = f"{bronze_path}/catalog.xml"
output_path = f"{silver_path}/catalog_flattened.parquet"

# Converter options, passed through unchanged.
xml_format_options = {
    'flatten_nested': True,
    'array_detection': True,
    'preserve_attributes': True,
    'root_tag': 'product',
}

print("🌳 Processing hierarchical XML data:")
print("=" * 50)

# Write, convert and read back inside one guard so any step's failure is
# reported with the same message.
try:
    xml_bytes = xml_content.encode('utf-8')
    forge.volume_handler.write_file(xml_path, xml_bytes)
    print(f"✅ XML file written to: {xml_path}")

    print("\n🔄 Converting XML to flattened Parquet...")

    result = forge.convert_from_volume(
        input_path=xml_path,
        output_path=output_path,
        format_options=xml_format_options,
    )

    for status_line in (
        "✅ XML conversion successful!",
        "   Flattened structure created",
        "   Arrays detected and normalized",
        f"   Output: {output_path}",
    ):
        print(status_line)

    # Load the converted output to show the resulting table.
    print("\n📊 Flattened XML data:")
    flattened_df = spark.read.parquet(output_path)
    display(flattened_df)

except Exception as e:
    print(f"❌ XML processing failed: {e!s}")

## 10. Performance Monitoring and Optimization

In [None]:
# Monitor conversion performance and resource usage
print("📊 Performance Monitoring Dashboard:")
print("=" * 60)

# Get conversion statistics
stats = forge.get_conversion_statistics()

# Skip the statistics section when nothing (or an empty value) is returned.
if stats:
    print("\n📈 Conversion Statistics:")
    print(f"   Total conversions: {stats['total_conversions']}")
    print(f"   Successful: {stats['successful']}")
    print(f"   Failed: {stats['failed']}")
    print(f"   Average duration: {stats['avg_duration']:.2f}s")
    # Bytes -> MB for display.
    print(f"   Total data processed: {stats['total_bytes_processed'] / (1024*1024):.2f} MB")

# Display serverless-specific optimizations
if forge.env.is_serverless:
    print("\n⚡ Serverless Optimizations Active:")
    print("   ✅ Photon acceleration enabled")
    print("   ✅ Adaptive query execution")
    print("   ✅ Dynamic partition pruning")
    print("   ✅ Automatic scaling based on workload")
    print("   ✅ Columnar caching for repeated queries")

    # Show Spark configuration
    print("\n🔧 Spark Configuration (Serverless):")
    important_configs = [
        "spark.databricks.compute.type",
        "spark.databricks.photon.enabled",
        "spark.sql.adaptive.enabled",
        "spark.sql.adaptive.coalescePartitions.enabled"
    ]

    for config in important_configs:
        try:
            value = spark.conf.get(config)
            print(f"   {config}: {value}")
        except Exception:
            # A failed lookup is shown as "Not set". Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            print(f"   {config}: Not set")

# Memory usage estimation
print("\n💾 Memory Usage Guidelines:")
# Static rule-of-thumb text per format; nothing here is measured at runtime.
memory_guidelines = {
    'CSV': '~2x file size in memory',
    'JSON': '~3x file size (due to parsing)',
    'XML': '~4x file size (hierarchical structure)',
    'Excel': '~2.5x file size per sheet',
    'Parquet': '~0.5x file size (columnar compression)'
}

for format_type, guideline in memory_guidelines.items():
    print(f"   {format_type}: {guideline}")

## 11. Best Practices and Recommendations

In [None]:
# Print the best-practice checklist, grouped by category, then the pro tips.
print("📚 PyForge Databricks Integration Best Practices:")
print("=" * 60)

# (category heading, practices) pairs in display order.
_practice_catalog = (
    ("🗂️ Volume Organization", (
        "Use bronze/silver/gold pattern for data layers",
        "Partition large datasets by date or category",
        "Use descriptive naming conventions",
        "Clean up temporary files regularly",
    )),
    ("⚡ Performance Optimization", (
        "Use native Databricks formats when possible (CSV, JSON, XML)",
        "Enable compression for Parquet files (snappy/zstd)",
        "Batch process files when dealing with many small files",
        "Monitor memory usage for large Excel/PDF conversions",
    )),
    ("🔒 Security & Governance", (
        "Use Unity Catalog for access control",
        "Implement data retention policies",
        "Audit file access and conversions",
        "Encrypt sensitive data at rest",
    )),
    ("🛠️ Development Workflow", (
        "Test conversions with small datasets first",
        "Use try-except blocks for robust error handling",
        "Log conversion metrics for monitoring",
        "Version control your conversion pipelines",
    )),
)

# Same list-of-dicts shape as before: one dict per category.
best_practices = [
    {"category": heading, "practices": list(items)}
    for heading, items in _practice_catalog
]

for group in best_practices:
    print(f"\n{group['category']}")
    for item in group['practices']:
        print(f"   • {item}")

# Numbered tips; numbering starts at 1.
pro_tips = (
    "Use serverless compute for variable workloads",
    "Cache frequently accessed converted files",
    "Monitor Unity Catalog usage for cost optimization",
    "Leverage Delta Lake for versioned data",
)
print("\n💡 Pro Tips:")
for tip_number, tip_text in enumerate(pro_tips, start=1):
    print(f"   {tip_number}. {tip_text}")

## 12. Summary and Next Steps

In [None]:
# Closing summary: recap, suggested next steps, and installed versions.
print("🎯 PyForge Databricks Integration Summary:")
print("=" * 60)

print("\n✅ What we've demonstrated:")
print("   • Databricks SDK integration for Volume operations")
print("   • Serverless environment detection and optimization")
print("   • Direct Volume-to-Volume file conversion")
print("   • Format-specific processing strategies")
print("   • Batch processing capabilities")
print("   • Performance monitoring and optimization")

print("\n🚀 Next Steps:")
print("   1. Install packages: pip install pyforge-core pyforge-databricks")
print("   2. Configure Unity Catalog access and create volumes")
print("   3. Start with simple CSV/JSON conversions")
print("   4. Scale to complex formats (Excel, XML, PDF)")
print("   5. Build automated conversion pipelines")
print("   6. Monitor performance and optimize as needed")

# Look up the installed Databricks SDK version from package metadata. The
# previous expression printed the first component of the client's module
# path, which is a package name rather than a version.
try:
    sdk_version = importlib.metadata.version("databricks-sdk")
except importlib.metadata.PackageNotFoundError:
    # Metadata for the distribution is not available in this environment.
    sdk_version = "unknown"

print("\n📦 Package Information:")
print(f"   pyforge-core version: {PyForgeCore.__version__}")
print(f"   pyforge-databricks version: {PyForgeDatabricks.__version__}")
print(f"   Databricks SDK version: {sdk_version}")

print("\n🎉 Happy converting with PyForge on Databricks!")