# PyForge Notebook Integration - Complete Walkthrough
## Demonstrating CLI Command Equivalents in Python Package Format

This notebook demonstrates how PyForge CLI commands can be executed through a Python package interface, with intelligent Databricks/Serverless detection and optimized processing strategies.

### Key Features:
- **Environment Detection**: Automatically distinguishes Databricks from local/serverless environments and checks whether a Spark session is available
- **Smart Format Routing**: Uses native Databricks capabilities for supported formats (CSV, JSON, XML, XLSX)
- **Fallback Processing**: Uses PyForge converters for specialized formats (MDB, DBF, PDF)
- **Unified API**: Same interface regardless of underlying processing engine

## 1. Installation and Setup

In [None]:
# Install PyForge Notebook Integration (Future Package)
# %pip install pyforge-notebook[databricks] --upgrade

# For demonstration purposes, we'll simulate the package structure
import os
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Simulate package imports (these would be real imports in production).
# The three status lines are emitted by one print call, one line each.
print(
    "📦 Installing PyForge Notebook Integration...",
    "✅ Installation complete!",
    "\n🔍 Detecting execution environment...",
    sep="\n",
)

In [None]:
# Environment Detection Simulation
class EnvironmentDetector:
    """
    Detects the execution environment and available capabilities.

    The probe runs once, at construction time, and its findings are stored in
    ``self.environment``: a dict with the keys ``is_databricks``,
    ``spark_available``, ``spark_mode``, ``runtime_version``,
    ``cluster_type`` and ``processing_strategy``.
    """

    def __init__(self):
        self.environment = self._detect_environment()

    def _detect_environment(self):
        """Collect all probe results and derive the processing strategy.

        Returns:
            dict: the environment description; ``processing_strategy`` is one
            of ``'databricks_optimized'``, ``'spark_local'`` or
            ``'pandas_local'``.
        """
        env_info = {
            'is_databricks': self._is_databricks(),
            'spark_available': self._is_spark_available(),
            'spark_mode': self._get_spark_mode(),
            'runtime_version': self._get_databricks_runtime(),
            'cluster_type': self._get_cluster_type(),
            'processing_strategy': None
        }

        # Determine optimal processing strategy
        if env_info['is_databricks'] and env_info['spark_available']:
            env_info['processing_strategy'] = 'databricks_optimized'
        elif env_info['spark_available']:
            env_info['processing_strategy'] = 'spark_local'
        else:
            env_info['processing_strategy'] = 'pandas_local'

        return env_info

    @staticmethod
    def _get_active_session():
        """Return the active SparkSession, or None when there is none.

        None is also returned when PySpark is not installed or when asking
        for the active session fails for any reason.
        """
        try:
            from pyspark.sql import SparkSession
        except ImportError:
            return None
        try:
            return SparkSession.getActiveSession()
        except Exception:
            # Best-effort probe: a broken or unusual PySpark install must not
            # crash environment detection; treat it as "no Spark".
            return None

    def _is_databricks(self):
        """Return True when the process appears to run on Databricks.

        Only Databricks-specific signals are consulted. The generic Spark
        variable SPARK_LOCAL_HOSTNAME is deliberately not used: it can be set
        on any Spark installation, and relying on it would report a local
        Spark host as Databricks while ``runtime_version`` still says
        'Not Databricks'.
        """
        if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
            return True
        return self._check_databricks_imports()

    def _check_databricks_imports(self):
        """Return True when the active Spark configuration mentions 'databricks'."""
        spark = self._get_active_session()
        if spark is None:
            return False
        try:
            conf_dump = str(spark.sparkContext.getConf().getAll())
        except Exception:
            # Some sessions may not expose a usable sparkContext; without a
            # readable configuration there is nothing Databricks-specific to see.
            return False
        return 'databricks' in conf_dump

    def _is_spark_available(self):
        """Return True when an active SparkSession exists."""
        return self._get_active_session() is not None

    def _get_spark_mode(self):
        """Return the ``spark.api.mode`` setting, or None without Spark.

        Falls back to ``"classic"`` when the setting is absent or cannot be
        read from the session.
        """
        spark = self._get_active_session()
        if spark is None:
            return None
        try:
            return spark.conf.get("spark.api.mode", "classic")
        except Exception:
            return "classic"

    def _get_databricks_runtime(self):
        """Return DATABRICKS_RUNTIME_VERSION, or 'Not Databricks' if unset."""
        return os.environ.get('DATABRICKS_RUNTIME_VERSION', 'Not Databricks')

    def _get_cluster_type(self):
        """Return 'local' off Databricks, else CLUSTER_TYPE (default 'standard')."""
        if not self._is_databricks():
            return 'local'
        # Simplified cluster type detection
        return os.environ.get('CLUSTER_TYPE', 'standard')

# Run the environment probe once; keep both the detector and its result dict
env_detector = EnvironmentDetector()
env = env_detector.environment

# Report every detected property, one per line, in a single print call
print(
    "Environment Detection Results:",
    f"🌐 Environment Type: {'Databricks' if env['is_databricks'] else 'Local/Serverless'}",
    f"⚡ Spark Available: {env['spark_available']}",
    f"🔧 Spark Mode: {env['spark_mode'] or 'N/A'}",
    f"🏷️  Runtime Version: {env['runtime_version']}",
    f"🖥️  Cluster Type: {env['cluster_type']}",
    f"🎯 Processing Strategy: {env['processing_strategy']}",
    sep="\n",
)

## 2. PyForge Notebook Integration Class

This simulates the main `PyForgeNotebook` class that would be provided by the `pyforge-notebook` package.

In [None]:
import pandas as pd
from datetime import datetime
import json

class PyForgeNotebook:
    """
    Main PyForge Notebook Integration Class

    Provides a unified interface for data conversion with intelligent
    environment detection and optimization.

    Attributes:
        environment: dict produced by the environment detector (empty when
            auto-detection is disabled).
        config: extra keyword arguments passed to the constructor.
        conversion_stats: dict describing the most recent ``convert`` call,
            or None before the first call.
    """

    # Extensions the converters know about; shared by get_file_info() and
    # validate_file() so the two checks cannot drift apart.
    SUPPORTED_EXTENSIONS = ('.csv', '.xlsx', '.json', '.xml', '.pdf', '.mdb', '.dbf')
    # Formats reported as natively handled under the databricks_optimized strategy.
    NATIVE_EXTENSIONS = ('.csv', '.json', '.xml', '.xlsx')
    # Formats always reported as handled by PyForge's own converters.
    SPECIALIZED_EXTENSIONS = ('.pdf', '.mdb', '.dbf')

    def __init__(self, auto_detect_environment=True, **config):
        self.env_detector = EnvironmentDetector() if auto_detect_environment else None
        self.environment = self.env_detector.environment if self.env_detector else {}
        self.config = config
        self.last_conversion_metadata = None
        self.conversion_stats = None

        # Initialize format processors
        self._init_processors()

        print(f"🚀 PyForge initialized with {self.environment.get('processing_strategy', 'unknown')} strategy")

    def _init_processors(self):
        """Initialize format-specific processors based on environment"""
        strategy = self.environment.get('processing_strategy', 'pandas_local')

        if strategy == 'databricks_optimized':
            self.processor = DatabricksOptimizedProcessor()
        elif strategy == 'spark_local':
            self.processor = SparkLocalProcessor()
        else:
            self.processor = PandasLocalProcessor()

    def convert(self, input_path, output_format="dataframe", **options):
        """
        Main conversion method - equivalent to CLI 'pyforge convert'

        Args:
            input_path: Path to input file
            output_format: 'dataframe', 'spark_dataframe', 'delta_table', or file extension
            **options: Format-specific options

        Returns:
            DataFrame or path to output file

        Raises:
            Whatever the processor raises. ``conversion_stats`` is still
            updated in that case, with ``'success': False``.
        """
        started_at = datetime.now()

        # Detect file format
        file_ext = Path(input_path).suffix.lower()

        print(f"🔄 Converting {Path(input_path).name} ({file_ext}) to {output_format}...")

        succeeded = False
        try:
            # Route to appropriate processor
            result = self.processor.process_file(input_path, file_ext, output_format, **options)
            succeeded = True
        finally:
            # Record stats on both outcomes so a failed run never leaves the
            # previous run's stats in place.
            self.conversion_stats = {
                'duration': (datetime.now() - started_at).total_seconds(),
                'input_file': input_path,
                'output_format': output_format,
                'processing_strategy': self.environment.get('processing_strategy'),
                'success': succeeded
            }

        print(f"✅ Conversion completed in {self.conversion_stats['duration']:.2f}s")
        return result

    def get_file_info(self, file_path):
        """Equivalent to CLI 'pyforge info'

        The file is not opened; every field is derived from the path alone.
        """
        path_obj = Path(file_path)
        suffix = path_obj.suffix.lower()

        # Simulate file analysis
        return {
            'filename': path_obj.name,
            'extension': path_obj.suffix,
            'size_bytes': 'Unknown (simulated)',
            'format_detected': path_obj.suffix.lstrip('.').upper(),
            'supported': suffix in self.SUPPORTED_EXTENSIONS,
            'processing_engine': self._get_processing_engine(suffix),
            'estimated_memory_usage': 'Variable'
        }

    def list_supported_formats(self):
        """Equivalent to CLI 'pyforge formats'

        Returns:
            list of dicts with the keys 'input', 'output' and 'engine'.
        """
        catalog = [
            ('CSV (.csv)', '.csv', 'DataFrame/Parquet'),
            ('Excel (.xlsx)', '.xlsx', 'DataFrame/Parquet'),
            ('JSON (.json)', '.json', 'DataFrame/Parquet'),
            ('XML (.xml)', '.xml', 'DataFrame/Parquet'),
            ('PDF (.pdf)', '.pdf', 'Text DataFrame'),
            ('Access (.mdb)', '.mdb', 'DataFrame/Parquet'),
            ('dBase (.dbf)', '.dbf', 'DataFrame/Parquet'),
        ]
        return [
            {'input': label, 'output': output, 'engine': self._get_processing_engine(ext)}
            for label, ext, output in catalog
        ]

    def validate_file(self, file_path):
        """Equivalent to CLI 'pyforge validate'

        Only the path is inspected. A missing extension is an error; an
        extension outside SUPPORTED_EXTENSIONS is a warning.

        Returns:
            dict with the keys 'status' ('valid' or 'invalid'), 'errors',
            'warnings' and 'can_process'.
        """
        suffix = Path(file_path).suffix

        errors = []
        # Not named `warnings`, to avoid shadowing the stdlib module of that name.
        warning_messages = []

        # Basic validation
        if not suffix:
            errors.append("File has no extension")
        elif suffix.lower() not in self.SUPPORTED_EXTENSIONS:
            # Only meaningful when there is an extension to name in the message.
            warning_messages.append(f"Format {suffix} may not be fully supported")

        return {
            'status': "valid" if not errors else "invalid",
            'errors': errors,
            'warnings': warning_messages,
            'can_process': not errors
        }

    def _get_processing_engine(self, file_ext):
        """Determine which processing engine would be used for a file format"""
        strategy = self.environment.get('processing_strategy', 'pandas_local')

        # Native Databricks support
        if strategy == 'databricks_optimized' and file_ext in self.NATIVE_EXTENSIONS:
            return 'Databricks Native'

        # PyForge converters for specialized formats
        if file_ext in self.SPECIALIZED_EXTENSIONS:
            return 'PyForge Converter'

        # Default pandas processing
        return 'Pandas/PyArrow'

    def get_environment_info(self):
        """Return detailed environment information"""
        return {
            'environment_type': 'Databricks' if self.environment.get('is_databricks') else 'Local/Serverless',
            'spark_available': self.environment.get('spark_available', False),
            'databricks_runtime': self.environment.get('runtime_version', 'N/A'),
            'cluster_type': self.environment.get('cluster_type', 'local'),
            'processing_strategy': self.environment.get('processing_strategy', 'pandas_local'),
            # The detector stores spark_mode=None when Spark is absent, so a
            # plain .get() default would never apply; fall back explicitly.
            'spark_mode': self.environment.get('spark_mode') or 'N/A'
        }

# Printed once the class statement above has executed, as a visible marker in the cell output.
print("✅ PyForgeNotebook class defined")

## 3. Processing Engine Classes

These classes demonstrate how different processing strategies would be implemented based on the detected environment.

In [None]:
class BaseProcessor:
    """Base class for all processors.

    Subclasses override ``process_file``; this base version always raises
    ``NotImplementedError``.
    """

    def process_file(self, input_path, file_ext, output_format, **options):
        """Convert ``input_path``; must be overridden by subclasses."""
        raise NotImplementedError()

class DatabricksOptimizedProcessor(BaseProcessor):
    """Processor optimized for Databricks environment.

    This is a simulation: no file is read, and every path returns a small
    hard-coded pandas DataFrame chosen by file extension.
    """

    def __init__(self):
        self.name = "Databricks Optimized"
        print(f"🔧 Initialized {self.name} processor")

    def process_file(self, input_path, file_ext, output_format, **options):
        """Route the file to the native path or the PyForge fallback by extension."""
        natively_supported = file_ext in ['.csv', '.json', '.xml', '.xlsx']
        handler = (
            self._process_with_databricks_native
            if natively_supported
            else self._process_with_pyforge_fallback
        )
        return handler(input_path, file_ext, output_format, **options)

    def _process_with_databricks_native(self, input_path, file_ext, output_format, **options):
        """Simulate native processing; only the printed message depends on output_format."""
        print(f"⚡ Using Databricks native processing for {file_ext}")

        # Simulate Spark DataFrame creation
        sample_data = self._generate_sample_data(file_ext, input_path)

        # In reality the first case would hand back a Spark DataFrame and the
        # second would convert the Spark DataFrame to pandas.
        wants_spark = output_format == "spark_dataframe"
        print("📊 Returning Spark DataFrame" if wants_spark else "🐼 Converting to Pandas DataFrame")
        return sample_data

    def _process_with_pyforge_fallback(self, input_path, file_ext, output_format, **options):
        """Simulate the PyForge converter used for specialized formats."""
        print(f"🔄 Using PyForge converter for {file_ext} (specialized format)")
        return self._generate_sample_data(file_ext, input_path)

    def _generate_sample_data(self, file_ext, input_path):
        """Generate sample data based on file type"""
        # Each entry builds its column dict lazily, so only the requested
        # sample is ever constructed.
        column_builders = {
            '.csv': lambda: {
                'id': [1, 2, 3, 4, 5],
                'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
                'value': [10.5, 20.3, 15.7, 30.1, 25.9],
                'date': pd.date_range('2024-01-01', periods=5)
            },
            '.xlsx': lambda: {
                'quarter': ['Q1', 'Q2', 'Q3', 'Q4'],
                'revenue': [100000, 120000, 110000, 130000],
                'expenses': [80000, 85000, 82000, 90000],
                'profit': [20000, 35000, 28000, 40000]
            },
            '.json': lambda: {
                'user_id': [101, 102, 103],
                'username': ['user1', 'user2', 'user3'],
                'settings': ['{"theme": "dark"}', '{"theme": "light"}', '{"theme": "auto"}']
            },
            '.xml': lambda: {
                'product_id': ['P001', 'P002', 'P003'],
                'product_name': ['Widget A', 'Widget B', 'Widget C'],
                'category': ['Electronics', 'Home', 'Electronics'],
                'price': [29.99, 45.50, 15.75]
            },
            '.pdf': lambda: {
                'page_number': [1, 2, 3],
                'text_content': [
                    'This is the content of page 1...',
                    'This is the content of page 2...',
                    'This is the content of page 3...'
                ]
            },
        }

        build_columns = column_builders.get(file_ext)
        if build_columns is None:
            # Any other extension gets a one-row placeholder naming the input
            return pd.DataFrame({'data': ['Sample data from ' + str(input_path)]})
        return pd.DataFrame(build_columns())

class SparkLocalProcessor(BaseProcessor):
    """Processor for local Spark environment.

    Simulated: returns the same sample DataFrames as
    ``DatabricksOptimizedProcessor`` without reading the input file.
    """

    def __init__(self):
        self.name = "Spark Local"
        print(f"🔧 Initialized {self.name} processor")

    def process_file(self, input_path, file_ext, output_format, **options):
        """Return sample data for ``file_ext``; ``output_format`` and ``options`` are unused."""
        print(f"⚡ Using local Spark processing for {file_ext}")
        # Use similar logic as Databricks but without Databricks-specific optimizations.
        # The generator is called through the class rather than on a fresh
        # DatabricksOptimizedProcessor: constructing one would print a misleading
        # "Initialized Databricks Optimized processor" line on every conversion.
        # The generator does not read instance state, so passing this instance is safe.
        return DatabricksOptimizedProcessor._generate_sample_data(self, file_ext, input_path)

class PandasLocalProcessor(BaseProcessor):
    """Processor for local pandas environment.

    Simulated: returns the same sample DataFrames as
    ``DatabricksOptimizedProcessor`` without reading the input file.
    """

    def __init__(self):
        self.name = "Pandas Local"
        print(f"🔧 Initialized {self.name} processor")

    def process_file(self, input_path, file_ext, output_format, **options):
        """Return sample data for ``file_ext``; ``output_format`` and ``options`` are unused."""
        print(f"🐼 Using pandas processing for {file_ext}")
        # The generator is called through the class rather than on a fresh
        # DatabricksOptimizedProcessor: constructing one would print a misleading
        # "Initialized Databricks Optimized processor" line on every conversion.
        # The generator does not read instance state, so passing this instance is safe.
        return DatabricksOptimizedProcessor._generate_sample_data(self, file_ext, input_path)

# Printed once the processor class statements above have executed, as a visible marker in the cell output.
print("✅ Processing engine classes defined")

## 4. Initialize PyForge and Demonstrate CLI Command Equivalents

In [None]:
# Create the shared `forge` instance; the constructor runs environment detection
forge = PyForgeNotebook(auto_detect_environment=True)

# Show each environment property with its snake_case key turned into a title
env_info = forge.get_environment_info()
print("\n🌍 Environment Information:")
for key, value in env_info.items():
    print("  " + key.replace('_', ' ').title() + f": {value}")

## 5. CLI Command Equivalents

### 5.1 `pyforge info` equivalent

In [None]:
# Equivalent to: pyforge info data.xlsx
print("📋 File Information Analysis (equivalent to 'pyforge info'):", "=" * 60, sep="\n")

# Paths are illustrative; get_file_info only inspects the path string
test_files = [
    "/path/to/sales_data.xlsx",
    "/path/to/customer_data.csv",
    "/path/to/report.pdf",
    "/path/to/legacy_database.mdb",
]

for file_path in test_files:
    file_info = forge.get_file_info(file_path)
    # One print call per file: name, detected format, support flag, engine
    print(
        f"\n📄 {file_info['filename']}",
        f"   Format: {file_info['format_detected']}",
        f"   Supported: {'✅' if file_info['supported'] else '❌'}",
        f"   Processing Engine: {file_info['processing_engine']}",
        sep="\n",
    )

### 5.2 `pyforge formats` equivalent

In [None]:
# Equivalent to: pyforge formats
print("\n📊 Supported Formats (equivalent to 'pyforge formats'):", "=" * 70, sep="\n")

supported_formats = forge.list_supported_formats()
for fmt in supported_formats:
    # Each entry carries exactly the 'input', 'output' and 'engine' keys
    print("  {input:<20} → {output:<25} [{engine}]".format(**fmt))

### 5.3 `pyforge validate` equivalent

In [None]:
# Equivalent to: pyforge validate
print("\n🔍 File Validation (equivalent to 'pyforge validate'):", "=" * 60, sep="\n")

validation_files = [
    "/path/to/valid_data.csv",
    "/path/to/no_extension",
    "/path/to/unsupported.xyz",
]

for file_path in validation_files:
    result = forge.validate_file(file_path)
    status_icon = "❌" if result['status'] != 'valid' else "✅"
    print(f"\n{status_icon} {Path(file_path).name}: {result['status'].upper()}")

    # Looping over an empty list does nothing, so no emptiness guard is needed
    for error in result['errors']:
        print(f"   ❌ Error: {error}")

    for warning in result['warnings']:
        print(f"   ⚠️  Warning: {warning}")

## 6. Data Conversion Examples

### 6.1 CSV Processing with Environment-Specific Optimization

In [None]:
# Equivalent to: pyforge convert data.csv output.parquet
print("\n🔄 CSV Conversion Example:", "=" * 50, sep="\n")

csv_df = forge.convert(input_path="/path/to/sales_data.csv", output_format="dataframe")

# Preview the result, then list the column dtypes
print(f"\n📊 Converted CSV Data (Shape: {csv_df.shape}):", csv_df.head(), sep="\n")
print("\n📈 Data Types:", csv_df.dtypes, sep="\n")

### 6.2 Excel Multi-Sheet Processing

In [None]:
# Equivalent to: pyforge convert financial_report.xlsx output.parquet --combine-sheets
print("\n📈 Excel Multi-Sheet Conversion:", "=" * 50, sep="\n")

excel_df = forge.convert(
    input_path="/path/to/financial_report.xlsx",
    output_format="dataframe",
    excel_options=dict(combine_sheets=True, sheet_matching_strategy='column_signature'),
)

print(f"\n📊 Converted Excel Data (Shape: {excel_df.shape}):", excel_df.head(), sep="\n")

# Show conversion metadata (simulated: these figures are hard-coded text)
print(
    "\n📋 Sheet Processing Summary:",
    "  - Q1 Data: 1,000 rows processed",
    "  - Q2 Data: 1,200 rows processed",
    "  - Q3 Data: 950 rows processed",
    "  - Q4 Data: 1,100 rows processed",
    sep="\n",
)

### 6.3 JSON Processing with Nested Structure Handling

In [None]:
# Equivalent to: pyforge convert api_data.json output.parquet
print("\n🔗 JSON Conversion with Nested Structures:", "=" * 50, sep="\n")

json_df = forge.convert(
    input_path="/path/to/api_data.json",
    output_format="dataframe",
    json_options=dict(flatten_nested=True, normalize_arrays=True),
)

print(f"\n📊 Converted JSON Data (Shape: {json_df.shape}):", json_df.head(), sep="\n")
print("\n🔍 JSON flattening automatically handled nested objects and arrays")

### 6.4 XML Processing with Hierarchical Flattening

In [None]:
# Equivalent to: pyforge convert catalog.xml output.parquet --flatten-nested
print("\n🌳 XML Hierarchical Data Processing:", "=" * 50, sep="\n")

xml_df = forge.convert(
    input_path="/path/to/product_catalog.xml",
    output_format="dataframe",
    xml_options=dict(flatten_nested=True, array_detection=True, preserve_attributes=True),
)

print(f"\n📊 Converted XML Data (Shape: {xml_df.shape}):", xml_df.head(), sep="\n")
print("\n🌿 XML structure automatically analyzed and flattened")

### 6.5 PDF Text Extraction

In [None]:
# Equivalent to: pyforge convert document.pdf output.txt --pages 1-5
print("\n📄 PDF Text Extraction:", "=" * 50, sep="\n")

pdf_df = forge.convert(
    input_path="/path/to/annual_report.pdf",
    output_format="dataframe",
    pdf_options=dict(page_range='1-5', extract_metadata=True, preserve_formatting=False),
)

print(f"\n📊 Extracted PDF Text (Shape: {pdf_df.shape}):", pdf_df.head(), sep="\n")
print("\n📖 PDF text extraction with page-level granularity")

## 7. Advanced Features

### 7.1 Batch Processing Multiple Files

In [None]:
# Batch processing equivalent (not available in CLI but useful for notebooks)
print("\n🔄 Batch Processing Multiple Files:", "=" * 50, sep="\n")

# One workbook per quarter: sales_q1.xlsx through sales_q4.xlsx
batch_files = [f"/path/to/sales_q{quarter}.xlsx" for quarter in range(1, 5)]

batch_results = []
for file_path in batch_files:
    file_name = Path(file_path).name
    print(f"\n🔄 Processing {file_name}...")
    df = forge.convert(file_path, output_format="dataframe")
    # Keep the frame itself next to its summary figures for the concat below
    batch_results.append(
        dict(file=file_name, rows=len(df), columns=len(df.columns), dataframe=df)
    )

print("\n📊 Batch Processing Summary:")
for result in batch_results:
    print("  {file}: {rows} rows, {columns} columns".format(**result))

# Combine all quarters
quarterly_frames = [entry['dataframe'] for entry in batch_results]
combined_df = pd.concat(quarterly_frames, ignore_index=True)
print(f"\n🔗 Combined Dataset: {combined_df.shape[0]} total rows")

### 7.2 Environment-Specific Optimizations

In [None]:
print("\n⚡ Environment-Specific Processing Demonstration:")
print("=" * 60)

# Demonstrate different processing strategies based on environment
test_file = "/path/to/large_dataset.csv"

# Branch on the strategy the forge actually selected rather than on the raw
# is_databricks / spark_available flags. The processor is chosen from
# processing_strategy, so on Databricks without an active Spark session the
# pandas processor runs, and the messages below must say so.
active_strategy = forge.environment.get('processing_strategy')

if active_strategy == 'databricks_optimized':
    print("🏢 Databricks Environment Detected:")
    print("  ✅ Using Spark's distributed CSV reader")
    print("  ✅ Leveraging cluster compute resources")
    print("  ✅ Automatic partitioning and optimization")

    # Simulate Databricks-specific processing
    df = forge.convert(
        test_file,
        output_format="spark_dataframe",
        databricks_options={
            'adaptive_query_execution': True,
            'columnar_cache': True,
            'partition_strategy': 'auto'
        }
    )
    print("  📊 Result: Spark DataFrame optimized for distributed processing")

elif active_strategy == 'spark_local':
    print("⚡ Local Spark Environment Detected:")
    print("  ✅ Using local Spark session")
    print("  ✅ Memory-optimized processing")

    df = forge.convert(test_file, output_format="spark_dataframe")
    print("  📊 Result: Local Spark DataFrame")

else:
    print("🐼 Pandas Environment Detected:")
    print("  ✅ Using pandas with chunked reading for large files")
    print("  ✅ Memory-efficient processing")

    df = forge.convert(
        test_file,
        output_format="dataframe",
        pandas_options={
            'chunksize': 10000,
            'low_memory': True
        }
    )
    print("  📊 Result: Pandas DataFrame with memory optimization")

print(f"\n📈 Processing completed using {forge.environment.get('processing_strategy')} strategy")

### 7.3 Format Detection and Automatic Routing

In [None]:
print("\n🔍 Automatic Format Detection and Processing Route Selection:", "=" * 70, sep="\n")

# Demonstrate how different formats are routed to appropriate processors.
# Each pair is (path, hand-written description); the engine is looked up below.
test_formats = [
    ("/path/to/data.csv", "Native processing in Databricks/Spark"),
    ("/path/to/spreadsheet.xlsx", "Hybrid processing (PyForge + Spark output)"),
    ("/path/to/config.json", "Native JSON processing"),
    ("/path/to/catalog.xml", "Native XML processing with flattening"),
    ("/path/to/report.pdf", "PyForge specialized converter"),
    ("/path/to/legacy.mdb", "PyForge specialized converter"),
    ("/path/to/old_data.dbf", "PyForge specialized converter"),
]

for file_path, expected_processing in test_formats:
    file_ext = Path(file_path).suffix.lower()
    processing_engine = forge._get_processing_engine(file_ext)

    print(
        f"\n📄 {Path(file_path).name}",
        f"   Extension: {file_ext}",
        f"   Engine: {processing_engine}",
        f"   Strategy: {expected_processing}",
        sep="\n",
    )

    # Show appropriate processing symbols; "Native" is checked before "PyForge"
    print(
        "   🚀 Optimized for distributed processing" if "Native" in processing_engine
        else "   🔧 Specialized format converter" if "PyForge" in processing_engine
        else "   🐼 Standard pandas processing"
    )

## 8. Performance and Statistics

In [None]:
print("\n📊 Performance Statistics and Conversion Summary:", "=" * 60, sep="\n")

# Display last conversion statistics (skipped when no conversion has run yet)
if forge.conversion_stats:
    stats = forge.conversion_stats
    print(
        "\n⏱️  Last Conversion Performance:",
        f"   Input File: {Path(stats['input_file']).name}",
        f"   Output Format: {stats['output_format']}",
        f"   Processing Strategy: {stats['processing_strategy']}",
        f"   Duration: {stats['duration']:.2f} seconds",
        f"   Status: {'✅ Success' if stats['success'] else '❌ Failed'}",
        sep="\n",
    )

# Environment capabilities summary
print("\n🌍 Environment Capabilities Summary:")
capabilities = {
    'Distributed Processing': forge.environment.get('spark_available', False),
    'Databricks Native': forge.environment.get('is_databricks', False),
    'Delta Lake Support': forge.environment.get('is_databricks', False),
    'Large File Optimization': True,
    'Batch Processing': True,
    'Progress Tracking': True,
}

for capability, available in capabilities.items():
    status = "⚠️" if not available else "✅"
    print(f"   {status} {capability}")

print("\n🎯 Optimal Use Cases for Current Environment:")
# Databricks takes precedence over plain Spark, which takes precedence over pandas
if forge.environment.get('is_databricks'):
    print(
        "   • Large-scale data processing (GB-TB range)",
        "   • Multi-format data ingestion pipelines",
        "   • Real-time data transformation workflows",
        "   • Delta Lake integration for data lakes",
        sep="\n",
    )
elif forge.environment.get('spark_available'):
    print(
        "   • Medium-scale data processing (MB-GB range)",
        "   • Local distributed processing",
        "   • Development and testing workflows",
        sep="\n",
    )
else:
    print(
        "   • Small to medium datasets (MB range)",
        "   • Quick data exploration and analysis",
        "   • Specialized format conversion",
        "   • Prototype and development work",
        sep="\n",
    )

## 9. Migration Guide: CLI to Notebook

### Common CLI Commands and Their Notebook Equivalents

In [None]:
print("\n🔄 CLI to Notebook Migration Guide:", "=" * 60, sep="\n")

# Each row is (CLI command, notebook call, description); the rows are turned
# into dicts keyed 'cli', 'notebook', 'description', in that order.
example_fields = ('cli', 'notebook', 'description')
migration_examples = [
    dict(zip(example_fields, row))
    for row in (
        (
            'pyforge convert data.csv output.parquet',
            'df = forge.convert("data.csv", output_format="dataframe")',
            'Basic file conversion',
        ),
        (
            'pyforge info document.pdf',
            'info = forge.get_file_info("document.pdf")',
            'File information and metadata',
        ),
        (
            'pyforge formats',
            'formats = forge.list_supported_formats()',
            'List supported formats',
        ),
        (
            'pyforge validate data.xlsx',
            'result = forge.validate_file("data.xlsx")',
            'File validation',
        ),
        (
            'pyforge convert report.pdf --pages 1-10',
            'df = forge.convert("report.pdf", pdf_options={"page_range": "1-10"})',
            'PDF with page range',
        ),
        (
            'pyforge convert data.xlsx --combine-sheets',
            'df = forge.convert("data.xlsx", excel_options={"combine_sheets": True})',
            'Excel multi-sheet processing',
        ),
    )
]

# Numbered from 1 for display
for i, example in enumerate(migration_examples, 1):
    print(
        f"\n{i}. {example['description']}:",
        f"   CLI: {example['cli']}",
        f"   Notebook: {example['notebook']}",
        sep="\n",
    )

print("\n✨ Additional Notebook-Specific Benefits:")
notebook_benefits = [
    "Direct DataFrame manipulation and analysis",
    "Interactive data exploration with display()",
    "Seamless integration with visualization libraries",
    "Batch processing capabilities",
    "Environment-aware optimizations",
    "Progress tracking and real-time feedback",
    "Memory-efficient processing strategies",
]

for benefit in notebook_benefits:
    print("   ✅ " + benefit)

## 10. Summary and Next Steps

In [None]:
print("\n🎯 PyForge Notebook Integration Summary:", "=" * 60, sep="\n")

# Each section below is emitted by one print call, one line per argument
print(
    "\n🏗️ Architecture Highlights:",
    "   ✅ Intelligent environment detection (Databricks vs local)",
    "   ✅ Format-specific processing optimization",
    "   ✅ Unified API regardless of backend",
    "   ✅ Seamless CLI command equivalents",
    "   ✅ Native Databricks integration for supported formats",
    "   ✅ PyForge fallback for specialized formats",
    sep="\n",
)

print(
    "\n📊 Supported Workflows:",
    "   🔄 Single file conversions",
    "   📦 Batch processing",
    "   🔍 File analysis and validation",
    "   📈 Performance monitoring",
    "   🎛️ Environment-specific optimizations",
    sep="\n",
)

print("\n🚀 Implementation Roadmap:")
roadmap_phases = [
    "Phase 1: Core integration and environment detection",
    "Phase 2: Databricks optimizations and native processing",
    "Phase 3: Advanced features and magic commands",
    "Phase 4: Production hardening and performance tuning",
]

for phase in roadmap_phases:
    print("   📋 " + phase)

print(
    "\n💡 Key Benefits:",
    "   • Preserve PyForge's sophisticated conversion algorithms",
    "   • Leverage Databricks native capabilities where beneficial",
    "   • Provide seamless notebook integration",
    "   • Maintain performance across different environments",
    "   • Enable data scientists to focus on analysis, not conversion",
    sep="\n",
)

print(
    "\n🎉 Ready for production implementation!",
    "   This architecture provides a solid foundation for",
    "   bringing PyForge's powerful conversion capabilities",
    "   directly into notebook workflows with intelligent",
    "   optimization based on the execution environment.",
    sep="\n",
)