In [ ]:
# DBTITLE 1,Generate TestPyPI Summary Report
# =============================================================================
# TESTPYPI SUMMARY REPORT GENERATION
# =============================================================================

def generate_testpypi_summary_report(test_results, total_duration):
    """Build the summary dictionary and detail DataFrame for a TestPyPI run.

    Args:
        test_results: List of per-file result dicts. Non-empty lists must
            provide the 'status', 'duration_seconds', 'size_mb' and
            'converter_used' keys, which are read below.
        total_duration: Duration of the whole test run in seconds; stored
            unchanged under 'total_test_duration_seconds'.

    Returns:
        A ``(summary, df_results)`` tuple: ``summary`` is a flat dict of
        counts, rates and configuration values, ``df_results`` is
        ``pd.DataFrame(test_results)``.
    """
    df_results = pd.DataFrame(test_results)
    has_results = len(df_results) > 0

    # Tally every status in a single pass instead of re-filtering the frame
    # once per status value.
    status_counts = df_results['status'].value_counts() if has_results else {}

    def count_of(status):
        return int(status_counts.get(status, 0))

    # Overall statistics
    total_files = len(test_results)
    successful = count_of('SUCCESS')
    failed = count_of('FAILED')
    skipped = count_of('SKIPPED')
    timeout = count_of('TIMEOUT')
    errors = count_of('ERROR')

    # Success rate is computed over attempted files only (skipped excluded).
    files_attempted = total_files - skipped
    success_rate = round((successful / files_attempted) * 100, 1) if files_attempted > 0 else 0

    # Performance statistics: averages and data volume cover successful
    # conversions only, total conversion time covers every row.
    successful_tests = df_results[df_results['status'] == 'SUCCESS'] if has_results else pd.DataFrame()
    has_successes = len(successful_tests) > 0
    avg_duration = round(successful_tests['duration_seconds'].mean(), 2) if has_successes else 0
    total_conversion_time = round(df_results['duration_seconds'].sum(), 2) if has_results else 0
    total_size_processed = round(successful_tests['size_mb'].sum(), 2) if has_successes else 0

    # PySpark usage statistics
    pyspark_used = int((df_results['converter_used'] == 'PySpark').sum()) if has_results else 0

    summary = {
        'test_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'environment': 'Databricks Serverless',
        'pyforge_version': PYFORGE_VERSION,
        'installation_source': 'TestPyPI',
        'testpypi_url': TESTPYPI_URL,
        'databricks_username': DATABRICKS_USERNAME,
        'total_files_tested': total_files,
        'files_attempted': files_attempted,
        'successful_conversions': successful,
        'failed_conversions': failed,
        'skipped_files': skipped,
        'timeout_files': timeout,
        'error_files': errors,
        'success_rate_percent': success_rate,
        'total_test_duration_seconds': total_duration,
        'total_conversion_time_seconds': total_conversion_time,
        'average_conversion_time_seconds': avg_duration,
        'total_data_processed_mb': total_size_processed,
        'pyspark_conversions': pyspark_used,
        'pyspark_available': pyspark_available,
        'sample_datasets_path': SAMPLE_DATASETS_PATH,
        'output_directory': CONVERTED_OUTPUT_PATH
    }

    return summary, df_results

# Generate summary report
summary_report, df_detailed_results = generate_testpypi_summary_report(test_results, total_test_duration)

# Display summary report
print("=" * 80)
print("üéØ PYFORGE CLI TESTPYPI TESTING SUMMARY")
print("=" * 80)

print(f"üìÖ Test Timestamp: {summary_report['test_timestamp']}")
print(f"üè¢ Environment: {summary_report['environment']}")
print(f"üì¶ PyForge Version: {summary_report['pyforge_version']} (TestPyPI)")
print(f"üîó Installation Source: {summary_report['installation_source']}")
print(f"üåê TestPyPI URL: {summary_report['testpypi_url']}")
print(f"üë§ Databricks Username: {summary_report['databricks_username']}")

print("\nüìä OVERALL RESULTS:")
print(f"   Total Files: {summary_report['total_files_tested']}")
print(f"   Files Attempted: {summary_report['files_attempted']}")
print(f"   ‚úÖ Successful: {summary_report['successful_conversions']}")
print(f"   ‚ùå Failed: {summary_report['failed_conversions']}")
print(f"   ‚è≠Ô∏è  Skipped: {summary_report['skipped_files']}")
print(f"   ‚è∞ Timeout: {summary_report['timeout_files']}")
print(f"   üö´ Errors: {summary_report['error_files']}")
print(f"   üéØ Success Rate: {summary_report['success_rate_percent']}% (of attempted files)")

print("\n‚è±Ô∏è  PERFORMANCE METRICS:")
print(f"   Total Test Duration: {summary_report['total_test_duration_seconds']}s")
print(f"   Total Conversion Time: {summary_report['total_conversion_time_seconds']}s")
print(f"   Average Conversion Time: {summary_report['average_conversion_time_seconds']}s")
print(f"   Total Data Processed: {summary_report['total_data_processed_mb']} MB")

print("\nüöÄ TESTPYPI INTEGRATION:")
print("   Installation Source: TestPyPI (development repository)")
print(f"   Development Version: {summary_report['pyforge_version']}")
print(f"   PySpark Available: {'‚úÖ Yes' if summary_report['pyspark_available'] else '‚ùå No'}")
print(f"   PySpark Conversions: {summary_report['pyspark_conversions']}")
# Only claim PySpark was *used* when at least one conversion actually ran
# through it; availability alone does not prove that.
if summary_report['pyspark_available'] and summary_report['pyspark_conversions'] > 0:
    print("   ‚úÖ PyForge CLI successfully detected and used PySpark in Databricks Serverless!")

print("\nüìã RESULTS BY FILE TYPE:")
if len(df_detailed_results) > 0:
    type_summary = df_detailed_results.groupby('file_type')['status'].value_counts().unstack(fill_value=0)
    display(type_summary)

    print("\nüìä DETAILED TESTPYPI RESULTS:")
    display(df_detailed_results[['file_name', 'file_type', 'status', 'duration_seconds', 'size_mb', 'converter_used', 'testpypi_version', 'error_message']])

    # Show failed conversions details
    failed_tests = df_detailed_results[df_detailed_results['status'].isin(['FAILED', 'ERROR', 'TIMEOUT'])]
    if len(failed_tests) > 0:
        print(f"\n‚ùå FAILED CONVERSIONS DETAILS ({len(failed_tests)} failures):")
        display(failed_tests[['file_name', 'file_type', 'status', 'testpypi_version', 'error_message']])

    # Show skipped files
    skipped_tests = df_detailed_results[df_detailed_results['status'] == 'SKIPPED']
    if len(skipped_tests) > 0:
        print(f"\n‚è≠Ô∏è  SKIPPED FILES ({len(skipped_tests)} files):")
        display(skipped_tests[['file_name', 'file_type', 'error_message']])
else:
    print("   No test results to display")

# The closing banner must not announce success when conversions failed,
# errored or timed out.
unsuccessful_conversions = (
    summary_report['failed_conversions']
    + summary_report['timeout_files']
    + summary_report['error_files']
)

print("\nüéâ TESTPYPI TESTING COMPLETED!")
if unsuccessful_conversions == 0:
    print(f"üì¶ PyForge CLI {summary_report['pyforge_version']} from TestPyPI tested successfully!")
    print("üöÄ Ready for production release after validation!")
else:
    print(
        f"PyForge CLI {summary_report['pyforge_version']} from TestPyPI had "
        f"{unsuccessful_conversions} unsuccessful conversion(s) (failed/error/timeout)."
    )
    print("Review the failure details above before promoting this version.")

print("=" * 80)

In [ ]:
# DBTITLE 1,Comprehensive Conversion Testing (TestPyPI)
# =============================================================================
# BULK CONVERSION TESTING WITH TESTPYPI VERSION
# =============================================================================

def _build_test_result(file_info, status, duration, error_message, output_path,
                       cmd, converter_used, observation):
    """Assemble the per-file result dict shared by every test outcome."""
    return {
        'file_name': file_info['file_name'],
        'file_type': file_info['file_type'],
        'status': status,
        'duration_seconds': duration,
        'error_message': error_message,
        'output_path': output_path,
        'size_mb': file_info.get('size_mb', 0),
        'command': ' '.join(cmd),
        'converter_used': converter_used,
        'testpypi_version': PYFORGE_VERSION,
        'observation': observation
    }


def run_testpypi_conversion_test(file_info):
    """Run the conversion test for a single file using the TestPyPI version.

    Args:
        file_info: Catalog entry dict. Reads 'file_path', 'file_type',
            'file_name', 'extension' and 'category', plus the optional
            'size_mb' and 'size_readable' keys.

    Returns:
        A result dict with status 'SUCCESS', 'FAILED', 'SKIPPED', 'TIMEOUT'
        or 'ERROR', the duration, the command line, the converter used and
        an 'observation' dict. Exceptions raised inside the ``try`` block
        are converted into an 'ERROR' result instead of propagating.
    """
    # Function-scope import so this cell does not depend on an earlier cell
    # having imported os.
    import os

    file_path = file_info['file_path']
    file_type = file_info['file_type']
    file_name = file_info['file_name']
    file_ext = file_info['extension']
    # Bound before the try block so every handler below can rely on it.
    file_size_mb = file_info.get('size_mb', 0)

    # Output lands in <CONVERTED_OUTPUT_PATH>/<category>/<stem>.parquet.
    # Strip only the final extension: splitting on the first dot would turn
    # 'sales.2023.csv' and 'sales.2024.csv' into the same 'sales.parquet'.
    output_name = os.path.splitext(file_name)[0]
    output_dir = f"{CONVERTED_OUTPUT_PATH}/{file_info['category']}"
    output_path = f"{output_dir}/{output_name}.parquet"

    # Create output directory if it doesn't exist (best effort)
    try:
        dbutils.fs.mkdirs(output_dir.replace('/Volumes/', 'dbfs:/Volumes/'))
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Warning creating directory {output_dir}: {e}")

    # Build conversion command (removed --verbose flag as it's not supported)
    force_flag = '--force' if FORCE_CONVERSION else ''
    pyspark_flag = '--force-pyspark' if USE_PYSPARK_FOR_CSV and file_ext == '.csv' else ''
    excel_flag = '--separate' if file_ext in ['.xlsx', '.xls'] else ''

    cmd = [
        'pyforge', 'convert', file_path, output_path,
        '--format', 'parquet', force_flag, pyspark_flag, excel_flag
    ]
    cmd = [arg for arg in cmd if arg]  # Remove empty strings

    print(f"\nüîÑ Converting {file_name} ({file_type}) with TestPyPI version...")
    print(f"   File size: {file_info.get('size_readable', 'Unknown')}")
    print(f"   Output dir: {output_dir}")
    print(f"   PyForge version: {PYFORGE_VERSION} (TestPyPI)")
    print(f"   Command: {' '.join(cmd)}")

    # Skip PDF files if they're known to have issues
    if file_ext == '.pdf':
        print(f"   ‚ö†Ô∏è  Skipping PDF file - known conversion issues")
        skip_observation = {
            'file': file_name,
            'type': file_type,
            'status': 'SKIPPED',
            'reason': 'PDF conversion issues',
            'testpypi_version': PYFORGE_VERSION
        }
        return _build_test_result(
            file_info, 'SKIPPED', 0,
            'PDF conversion temporarily disabled due to known issues',
            None, cmd, 'N/A', skip_observation
        )

    # Log test observation
    observation = {
        'file': file_name,
        'type': file_type,
        'size': file_info.get('size_readable', 'Unknown'),
        'start_time': datetime.now().strftime('%H:%M:%S'),
        'testpypi_version': PYFORGE_VERSION
    }

    # Set timeout based on file size
    if file_size_mb > 100:
        timeout = 600  # 10 minutes for large files
    elif file_size_mb > 10:
        timeout = 300  # 5 minutes for medium files
    else:
        timeout = 120  # 2 minutes for small files

    try:
        start_time = time.time()

        print(f"   Timeout: {timeout}s")

        # Run conversion
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout
        )

        end_time = time.time()
        duration = round(end_time - start_time, 2)

        if result.returncode == 0:
            status = 'SUCCESS'
            error_message = None
            # PySpark usage is inferred from the CLI's stdout for CSV files
            converter_used = 'PySpark' if (file_ext == '.csv' and 'Using PySpark' in result.stdout) else 'Standard'
            print(f"   ‚úÖ Success ({duration}s) - {converter_used} converter - TestPyPI {PYFORGE_VERSION}")

            # Log observation
            observation['status'] = 'SUCCESS'
            observation['duration'] = f"{duration}s"
            observation['converter'] = converter_used

            # Verify output file exists in volume
            try:
                dbutils.fs.ls(output_path.replace('/Volumes/', 'dbfs:/Volumes/'))
                print(f"   ‚úÖ Output file verified in volume")
                observation['output_verified'] = True
            except Exception:
                print(f"   ‚ö†Ô∏è  Output file not found in volume")
                observation['output_verified'] = False

        else:
            status = 'FAILED'
            # Prefer stderr; fall back to stdout when stderr is empty
            error_message = result.stderr.strip() if result.stderr else result.stdout.strip()
            converter_used = 'Unknown'
            print(f"   ‚ùå Failed ({duration}s) - TestPyPI {PYFORGE_VERSION}")
            print(f"   Error: {error_message[:200]}...")

            # Log observation
            observation['status'] = 'FAILED'
            observation['duration'] = f"{duration}s"
            observation['error'] = error_message[:200]

        # Print detailed observation
        print(f"\nüìù TestPyPI Test Observation:")
        for key, value in observation.items():
            print(f"   {key}: {value}")

        return _build_test_result(
            file_info, status, duration, error_message,
            output_path if status == 'SUCCESS' else None,
            cmd, converter_used, observation
        )

    except subprocess.TimeoutExpired:
        observation['status'] = 'TIMEOUT'
        observation['duration'] = f"{timeout}s"
        print(f"   ‚è∞ Timeout after {timeout}s - TestPyPI {PYFORGE_VERSION}")

        return _build_test_result(
            file_info, 'TIMEOUT', timeout,
            f'Conversion timed out after {timeout} seconds',
            None, cmd, 'Unknown', observation
        )
    except Exception as e:
        observation['status'] = 'ERROR'
        observation['error'] = str(e)
        print(f"   üö´ Error: {str(e)} - TestPyPI {PYFORGE_VERSION}")

        return _build_test_result(
            file_info, 'ERROR', 0, str(e), None, cmd, 'Unknown', observation
        )

def run_bulk_testpypi_tests():
    """Run the conversion test for every entry of ``files_catalog``.

    Returns:
        A ``(test_results, total_duration)`` tuple: the list of per-file
        result dicts in catalog order and the elapsed time of the loop in
        seconds, rounded to two decimals.
    """
    print(f"\nüöÄ Starting TestPyPI conversion tests...")
    print(f"üì¶ PyForge Version: {PYFORGE_VERSION} (TestPyPI)")
    print(f"üìÅ Output directory: {CONVERTED_OUTPUT_PATH}")
    print(f"üìä Test mode: {'Smallest files only' if TEST_SMALLEST_FILES_ONLY else 'All files'}")
    print(f"üîß Force conversion: {FORCE_CONVERSION}")
    print(f"üöÄ Use PySpark for CSV: {USE_PYSPARK_FOR_CSV}")

    # Ensure base output directory exists
    try:
        dbutils.fs.mkdirs(CONVERTED_OUTPUT_PATH.replace('/Volumes/', 'dbfs:/Volumes/'))
        print(f"‚úÖ Created base output directory: {CONVERTED_OUTPUT_PATH}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Base output directory may already exist: {e}")

    test_results = []
    test_observations = []
    total_start_time = time.time()

    for i, file_info in enumerate(files_catalog, 1):
        print(f"\n{'='*60}")
        print(f"üìù TestPyPI Test {i}/{len(files_catalog)}")
        result = run_testpypi_conversion_test(file_info)
        test_results.append(result)
        test_observations.append(result['observation'])

    total_end_time = time.time()
    total_duration = round(total_end_time - total_start_time, 2)

    # Print test observations summary
    print(f"\n{'='*60}")
    print("üìä TESTPYPI TEST OBSERVATIONS SUMMARY:")
    print(f"{'='*60}")
    print(f"PyForge Version: {PYFORGE_VERSION} (TestPyPI)")
    for obs in test_observations:
        print(f"\n{obs['file']} ({obs['type']}, {obs.get('size', 'Unknown')}):")
        print(f"   Status: {obs['status']}")
        if 'duration' in obs:
            print(f"   Duration: {obs.get('duration', 'N/A')}")
        if 'converter' in obs:
            print(f"   Converter: {obs['converter']}")
        if 'reason' in obs:
            print(f"   Reason: {obs['reason']}")
        if 'error' in obs:
            print(f"   Error: {obs['error'][:100]}...")
        print(f"   TestPyPI Version: {obs['testpypi_version']}")

    return test_results, total_duration

# Run the bulk conversion tests with TestPyPI version
print("üéØ Executing TestPyPI conversion tests...")
test_results, total_test_duration = run_bulk_testpypi_tests()

print(f"\nüèÅ TestPyPI conversion testing completed in {total_test_duration} seconds!")

In [ ]:
# DBTITLE 1,Select Files for Testing
# =============================================================================
# FILE SELECTION FOR TESTING
# =============================================================================

def select_files_for_testing(all_files, files_by_type, test_smallest_only=True):
    """Pick the catalog entries that the conversion tests will run on.

    With ``test_smallest_only`` false, ``all_files`` itself is returned.
    Otherwise the first entry of every non-empty list in ``files_by_type``
    is collected, visiting the types in sorted order; the lists are expected
    to be ordered by ascending size already.
    """
    if not test_smallest_only:
        print("üìã Selecting ALL files for TestPyPI testing...")
        print(f"   Total files selected: {len(all_files)}")
        return all_files

    print("üéØ Selecting SMALLEST file of each type for TestPyPI testing...")

    picked = []
    for type_name in sorted(files_by_type.keys()):
        candidates = files_by_type[type_name]
        if not candidates:
            continue
        first_entry = candidates[0]  # lists are pre-sorted by size
        picked.append(first_entry)
        print(f"   {type_name}: {first_entry['file_name']} ({first_entry['size_readable']})")

    return picked

# Apply the widget-driven selection mode
files_for_testing = select_files_for_testing(all_files, files_by_type, TEST_SMALLEST_FILES_ONLY)

# Show what was selected
print(f"\nüìä Files Selected for TestPyPI Testing: {len(files_for_testing)}")
if not files_for_testing:
    print("‚ö†Ô∏è  No files selected for testing!")
else:
    df_selected = pd.DataFrame(files_for_testing)
    display(df_selected[['file_type', 'file_name', 'size_readable', 'category', 'file_path']])

    # Rough estimate: 30 seconds per file on average
    total_size_mb = sum(f['size_mb'] for f in files_for_testing)
    estimated_time = len(files_for_testing) * 30

    print(f"\nüìà TestPyPI Test Estimation:")
    print(f"   Files to process: {len(files_for_testing)}")
    print(f"   Total data size: {format_file_size(total_size_mb * 1024 * 1024)}")
    print(f"   Estimated time: ~{estimated_time // 60} minutes {estimated_time % 60} seconds")
    print(f"   PyForge version: {PYFORGE_VERSION} (TestPyPI)")

# The conversion cell iterates over files_catalog, so point it at the selection
files_catalog = files_for_testing
print(f"\n‚úÖ File selection completed. {len(files_catalog)} files ready for TestPyPI conversion testing.")

In [ ]:
# DBTITLE 1,Discover and Display Downloaded Files
# =============================================================================
# FILE DISCOVERY AND DETAILED DISPLAY
# =============================================================================

def discover_and_display_files():
    """Discover all supported files under SAMPLE_DATASETS_PATH.

    Walks the volume recursively with ``dbutils.fs.ls`` and keeps files
    whose extension is one of the supported types.

    Returns:
        A ``(all_files, files_by_type)`` tuple. ``all_files`` is a list of
        catalog dicts (name, type, extension, category, paths and sizes);
        ``files_by_type`` maps each file type label to its catalog dicts,
        sorted by ascending size in bytes. Any exception during discovery
        is reported and whatever was collected so far is returned.
    """
    print("üîç Discovering all downloaded files in sample datasets...")
    print(f"   Using PyForge CLI {PYFORGE_VERSION} from TestPyPI")

    all_files = []
    files_by_type = {}
    supported_extensions = {
        '.csv': 'CSV',
        '.xlsx': 'Excel',
        '.xls': 'Excel',
        '.xml': 'XML',
        '.pdf': 'PDF',
        '.dbf': 'DBF',
        '.mdb': 'MDB',
        '.accdb': 'ACCDB'
    }

    try:
        # Use dbutils to list files in volume
        def list_files_recursive(path, prefix=""):
            items = []
            try:
                files = dbutils.fs.ls(path)
                for file_info in files:
                    if file_info.isDir():
                        # dbutils reports directory names with a trailing
                        # slash; strip it so the relative path does not end
                        # up with doubled separators ('csv//file.csv').
                        dir_name = file_info.name.rstrip('/')
                        subdir_items = list_files_recursive(file_info.path, prefix + dir_name + "/")
                        items.extend(subdir_items)
                    else:
                        # Add file info
                        items.append({
                            'path': file_info.path,
                            'name': file_info.name,
                            'size': file_info.size,
                            'relative_path': prefix + file_info.name
                        })
            except Exception as e:
                # An unreadable directory is skipped, not fatal
                print(f"   Warning: Could not list {path}: {e}")
            return items

        # Get all files from the sample datasets path
        volume_path = SAMPLE_DATASETS_PATH.replace('/Volumes/', 'dbfs:/Volumes/')
        all_files_raw = list_files_recursive(volume_path)

        # Process and categorize files
        for file_info in all_files_raw:
            file_name = file_info['name']
            file_ext = '.' + file_name.split('.')[-1].lower() if '.' in file_name else ''

            if file_ext not in supported_extensions:
                continue

            # Convert dbfs path back to /Volumes/ path
            file_path = file_info['path'].replace('dbfs:/Volumes/', '/Volumes/')

            # The first folder of the relative path is the category;
            # files directly under the base path are categorized as 'root'.
            rel_path_parts = file_info['relative_path'].split('/')
            folder_category = rel_path_parts[0] if len(rel_path_parts) > 1 else 'root'

            file_dict = {
                'file_name': file_name,
                'file_type': supported_extensions[file_ext],
                'extension': file_ext,
                'category': folder_category,
                'file_path': file_path,
                'relative_path': file_info['relative_path'],
                'size_bytes': file_info['size'],
                'size_mb': round(file_info['size'] / (1024*1024), 3) if file_info['size'] > 0 else 0,
                'size_readable': format_file_size(file_info['size'])
            }

            all_files.append(file_dict)

            # Group by file type
            files_by_type.setdefault(file_dict['file_type'], []).append(file_dict)

        # Sort files by size within each type
        for file_type in files_by_type:
            files_by_type[file_type].sort(key=lambda x: x['size_bytes'])

    except Exception as e:
        print(f"   Error discovering files: {e}")
        print("   Proceeding with empty file catalog")

    return all_files, files_by_type

def format_file_size(size_bytes):
    """Format a byte count as a two-decimal string in B, KB, MB, GB or TB."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.2f} TB"

# Discover files
all_files, files_by_type = discover_and_display_files()

# Display summary statistics
print(f"\nüìä Downloaded Files Summary (TestPyPI Testing):")
print(f"   Total files found: {len(all_files)}")
print(f"   Total size: {format_file_size(sum(f['size_bytes'] for f in all_files))}")
print(f"   File types: {', '.join(sorted(files_by_type.keys()))}")
print(f"   PyForge version: {PYFORGE_VERSION} (from TestPyPI)")

# Display files by type
print("\nüìã Files by Type (sorted by size):")
for file_type, files in sorted(files_by_type.items()):
    print(f"\n{file_type} Files ({len(files)} files):")
    for i, file_info in enumerate(files[:5]):  # Show first 5 files of each type
        print(f"   {i+1}. {file_info['file_name']} - {file_info['size_readable']} - {file_info['relative_path']}")
    if len(files) > 5:
        print(f"   ... and {len(files) - 5} more {file_type} files")

# Create DataFrame for display
if all_files:
    df_all_files = pd.DataFrame(all_files)

    # Summary by file type
    print("\nüìä Detailed Summary by File Type:")
    summary_by_type = df_all_files.groupby('file_type').agg({
        'file_name': 'count',
        'size_mb': ['sum', 'mean', 'min', 'max']
    }).round(3)
    summary_by_type.columns = ['file_count', 'total_size_mb', 'avg_size_mb', 'min_size_mb', 'max_size_mb']
    display(summary_by_type)

    # Show smallest file of each type
    print("\nüéØ Smallest File of Each Type (for testing):")
    smallest_files = []
    for file_type in files_by_type:
        if files_by_type[file_type]:
            smallest = files_by_type[file_type][0]  # Already sorted by size
            smallest_files.append(smallest)

    df_smallest = pd.DataFrame(smallest_files)
    display(df_smallest[['file_type', 'file_name', 'size_readable', 'category', 'file_path']])

    # Full file listing: sort on the full frame first, because 'size_bytes'
    # is not among the displayed columns.
    print("\nüìÅ Complete File Listing:")
    df_sorted = df_all_files.sort_values(['file_type', 'size_bytes'])
    display(df_sorted[['file_name', 'file_type', 'size_readable', 'category', 'relative_path']])

else:
    print("\n‚ö†Ô∏è  No files found in the sample datasets directory.")
    print("   Please check if the sample datasets were downloaded successfully.")

# Store the catalog for later use
files_catalog = all_files
print(f"\n‚úÖ File discovery completed with TestPyPI version. Found {len(files_catalog)} files ready for testing.")

In [ ]:
# DBTITLE 1,Setup Sample Datasets in Volume
# =============================================================================
# SAMPLE DATASETS SETUP IN UNITY CATALOG VOLUME
# =============================================================================

print(f"üì• Setting up sample datasets in volume: {SAMPLE_DATASETS_PATH}")
print(f"   Using PyForge CLI version {PYFORGE_VERSION} from TestPyPI")

# dbutils is addressed through the dbfs:/Volumes/ form of both paths
volume_datasets_path = SAMPLE_DATASETS_PATH.replace('/Volumes/', 'dbfs:/Volumes/')
volume_output_path = CONVERTED_OUTPUT_PATH.replace('/Volumes/', 'dbfs:/Volumes/')

# Step 1: make sure the dataset and output directories exist (best effort)
try:
    dbutils.fs.mkdirs(volume_datasets_path)
    print(f"‚úÖ Created sample datasets directory: {SAMPLE_DATASETS_PATH}")

    dbutils.fs.mkdirs(volume_output_path)
    print(f"‚úÖ Created output directory: {CONVERTED_OUTPUT_PATH}")

except Exception as mkdir_error:
    print(f"‚ö†Ô∏è  Directory creation warning: {mkdir_error}")
    print("   Directories may already exist")

# Step 2: let the PyForge CLI download its sample datasets into the volume
print("\nüì¶ Installing sample datasets using PyForge CLI (TestPyPI version)...")
try:
    result = subprocess.run(
        ['pyforge', 'install', 'sample-datasets', SAMPLE_DATASETS_PATH, '--force'],
        capture_output=True,
        text=True,
        timeout=300,
    )

    if result.returncode != 0:
        print(f"‚ö†Ô∏è  Sample datasets installation had issues: {result.stderr}")
        print("   Proceeding with available data...")
    else:
        print("‚úÖ Sample datasets installed successfully using TestPyPI version!")
        print(f"   Output: {result.stdout}")

except subprocess.TimeoutExpired:
    print("‚ö†Ô∏è  Sample datasets installation timed out, creating minimal test datasets...")
except Exception as install_error:
    print(f"‚ö†Ô∏è  Sample datasets installation failed: {install_error}")
    print("   Creating minimal test datasets in volume...")

# Step 3: write two small hand-made fixtures (CSV and XML) into the volume.
# This step runs on every execution, whatever the outcome of the install step.
test_csv_data = """id,name,category,value,date
1,Sample Item 1,Category A,100.50,2023-01-01
2,Sample Item 2,Category B,250.75,2023-01-02
3,Sample Item 3,Category A,175.25,2023-01-03
4,Sample Item 4,Category C,90.00,2023-01-04
5,Sample Item 5,Category B,320.80,2023-01-05"""

test_xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <items>
        <item id="1">
            <name>Sample Item 1</name>
            <category>Category A</category>
            <value>100.50</value>
            <date>2023-01-01</date>
        </item>
        <item id="2">
            <name>Sample Item 2</name>
            <category>Category B</category>
            <value>250.75</value>
            <date>2023-01-02</date>
        </item>
    </items>
</data>"""

try:
    # CSV fixture
    csv_path = f"{SAMPLE_DATASETS_PATH}/csv/test_data.csv"
    dbutils.fs.mkdirs(f"{volume_datasets_path}/csv")
    dbutils.fs.put(csv_path.replace('/Volumes/', 'dbfs:/Volumes/'), test_csv_data, overwrite=True)
    print(f"‚úÖ Created test CSV file: {csv_path}")

    # XML fixture (not attempted if the CSV step above raised)
    xml_path = f"{SAMPLE_DATASETS_PATH}/xml/test_data.xml"
    dbutils.fs.mkdirs(f"{volume_datasets_path}/xml")
    dbutils.fs.put(xml_path.replace('/Volumes/', 'dbfs:/Volumes/'), test_xml_data, overwrite=True)
    print(f"‚úÖ Created test XML file: {xml_path}")

except Exception as fixture_error:
    print(f"‚ö†Ô∏è  Error creating test files: {fixture_error}")

print("\n‚úÖ Sample datasets setup completed with TestPyPI version!")
print(f"üîç Ready to test PyForge CLI {PYFORGE_VERSION} development features!")

# PyForge CLI End-to-End Testing - Databricks Serverless (TestPyPI)

This notebook tests PyForge CLI functionality in Databricks Serverless environment by installing directly from TestPyPI repository.

## Databricks Widgets
This notebook uses Databricks widgets for easy parameter configuration. The widgets will appear at the top of the notebook after running the first cell:

- **sample_datasets_base_path**: Base path for sample datasets installation
  - Default: `/Volumes/cortex_dev_catalog/0000_santosh/volume_sandbox/sample-datasets/`
  - Type: Text input widget
  
- **pyforge_version**: PyForge CLI version to test from TestPyPI
  - Default: `1.0.8.dev8`
  - Type: Text input widget
  
- **databricks_username**: Your Databricks username
  - Default: `usa-sdandey@deloitte.com`
  - Type: Text input widget
  
- **force_conversion**: Whether to force overwrite existing conversions
  - Default: `True`
  - Type: Dropdown (True/False)
  
- **use_pyspark_for_csv**: Enable PySpark converter for CSV files
  - Default: `True`
  - Type: Dropdown (True/False)
  
- **test_smallest_files_only**: Test only the smallest file of each type
  - Default: `True`
  - Type: Dropdown (True/False)

## Test Configuration
- **Environment**: Databricks Serverless Compute
- **Installation Source**: TestPyPI Repository (https://test.pypi.org/simple/)
- **Sample Data**: Real sample datasets from v1.0.5 release
- **Output Format**: Parquet (optimized for Databricks)

## Prerequisites
1. TestPyPI deployment of PyForge CLI (automatically deployed from main branch)
2. Unity Catalog access permissions to the specified volume path
3. Workspace access to CoreDataEngineers folder

## ⚠️ Important: TestPyPI Installation Configuration
**All `%pip install` commands in this notebook use TestPyPI with fallback to PyPI:**

```python
%pip install package --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ --no-cache-dir --quiet
```

**Required flags:**
- `--index-url https://test.pypi.org/simple/`: Primary index for PyForge CLI development versions
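- `--extra-index-url https://pypi.org/simple/`: Additional index so that dependencies not published on TestPyPI can be resolved from PyPI (pip searches both indexes together; it does not prefer one over the other)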
- `--no-cache-dir`: Ensures fresh installation without cached packages
- `--quiet`: Reduces installation output verbosity

## How to Use This Notebook
1. Run the first cell to initialize the widgets
2. Modify widget values as needed (they appear at the top of the notebook)
3. Run all remaining cells in sequence
4. Review the test results and summary report

## Key Features of This Notebook
1. **TestPyPI Integration**: Tests latest development versions before production release
2. **Improved File Discovery**: Displays all downloaded files with sizes using `dbutils.fs.ls`
3. **Smart File Selection**: Option to test only smallest files or all files
4. **Detailed Observations**: Logs detailed test observations for each conversion
5. **Better Error Handling**: Enhanced error messages and timeout management
6. **Development Version Testing**: Validates pre-release functionality

In [None]:
# =============================================================================
# DATABRICKS WIDGETS INITIALIZATION
# =============================================================================
#
# Declares every notebook parameter as a Databricks widget and echoes the
# current values. The widgets are described as data and created in loops, so
# the creation step and the echo step are driven by the same tables.

# Start from a clean slate: drop any widgets left over from earlier runs.
dbutils.widgets.removeAll()

# (name, default value, label) for the free-text widgets.
_TEXT_WIDGETS = [
    (
        "sample_datasets_base_path",
        "/Volumes/cortex_dev_catalog/0000_santosh/volume_sandbox/sample-datasets/",
        "Sample Datasets Base Path",
    ),
    ("pyforge_version", "1.0.8.dev8", "PyForge Version (TestPyPI)"),
    ("databricks_username", "usa-sdandey@deloitte.com", "Databricks Username"),
]

# (name, label) for the True/False dropdowns; each one defaults to "True".
_BOOLEAN_WIDGETS = [
    ("force_conversion", "Force Conversion"),
    ("use_pyspark_for_csv", "Use PySpark for CSV"),
    ("test_smallest_files_only", "Test Smallest Files Only"),
]

for _name, _default, _label in _TEXT_WIDGETS:
    dbutils.widgets.text(_name, _default, _label)

for _name, _label in _BOOLEAN_WIDGETS:
    dbutils.widgets.dropdown(_name, "True", ["True", "False"], _label)

# Echo the values in declaration order (text widgets first, then dropdowns).
print("üìã Widget Parameters Initialized:")
_all_widgets = [(_name, _label) for _name, _default, _label in _TEXT_WIDGETS]
_all_widgets += _BOOLEAN_WIDGETS
for _name, _label in _all_widgets:
    print(f"   {_label}: {dbutils.widgets.get(_name)}")

for _message in (
    "\n‚úÖ Widgets created successfully! You can modify the parameters using the widgets above.",
    "üìù Note: Widget values will persist across cell executions until changed.",
    "üîç This notebook will install PyForge CLI from TestPyPI for development testing.",
):
    print(_message)

In [None]:
# =============================================================================
# CONFIGURATION SECTION - Using Widget Values
# =============================================================================
#
# Reads the widget values into module-level constants, derives the dataset and
# output paths from them, and prints the resulting configuration.


def _widget_is_true(widget_name):
    """Return True when the named widget holds the text "true" (any casing)."""
    return dbutils.widgets.get(widget_name).lower() == "true"


# Raw widget values (dropdown values arrive as strings and are turned into bools).
SAMPLE_DATASETS_BASE_PATH = dbutils.widgets.get("sample_datasets_base_path")
PYFORGE_VERSION = dbutils.widgets.get("pyforge_version")
DATABRICKS_USERNAME = dbutils.widgets.get("databricks_username")
FORCE_CONVERSION = _widget_is_true("force_conversion")
USE_PYSPARK_FOR_CSV = _widget_is_true("use_pyspark_for_csv")
TEST_SMALLEST_FILES_ONLY = _widget_is_true("test_smallest_files_only")

# Derived paths: the dataset path is the base path without trailing slashes,
# and the output path swaps the '/sample-datasets' segment for the output folder.
SAMPLE_DATASETS_PATH = SAMPLE_DATASETS_BASE_PATH.rstrip('/')
CONVERTED_OUTPUT_PATH = SAMPLE_DATASETS_PATH.replace(
    '/sample-datasets', '/converted_output_testpypi'
)

# Package indexes used for the installation.
TESTPYPI_URL = "https://test.pypi.org/simple/"
PYPI_FALLBACK_URL = "https://pypi.org/simple/"

print("üîß Configuration (from widgets):")
for _label, _value in (
    ("PyForge Version (TestPyPI)", PYFORGE_VERSION),
    ("Databricks Username", DATABRICKS_USERNAME),
    ("Sample Datasets Base Path", SAMPLE_DATASETS_BASE_PATH),
    ("Sample Datasets Path", SAMPLE_DATASETS_PATH),
    ("Output Path", CONVERTED_OUTPUT_PATH),
    ("Force Conversion", FORCE_CONVERSION),
    ("Use PySpark for CSV", USE_PYSPARK_FOR_CSV),
    ("Test Smallest Files Only", TEST_SMALLEST_FILES_ONLY),
):
    print(f"   {_label}: {_value}")

print("\nüì¶ TestPyPI Configuration:")
print(f"   Primary Index: {TESTPYPI_URL}")
print(f"   Fallback Index: {PYPI_FALLBACK_URL}")
print(f"   Installation Command: pyforge-cli=={PYFORGE_VERSION}")

# Soft check only: a path outside /Volumes/ produces a warning, not an error.
_is_volume_path = SAMPLE_DATASETS_BASE_PATH.startswith("/Volumes/")
if not _is_volume_path:
    print("‚ö†Ô∏è  Warning: Sample datasets path should start with /Volumes/ for Unity Catalog volumes")

print("\nüìù Tip: You can change these values using the widgets at the top of the notebook!")
print("üöÄ This notebook will test development versions from TestPyPI before they're released to production.")

### TestPyPI Development Testing

This notebook installs PyForge CLI from **TestPyPI** which contains development versions that are automatically deployed from the main branch.

**Benefits of TestPyPI Testing:**
- 🚀 **Early Access**: Test latest features before production release
- 🔍 **Quality Assurance**: Validate development versions in real environment
- 🛠️ **Issue Detection**: Catch problems before they reach production PyPI
- 📈 **Continuous Integration**: Part of the automated deployment pipeline

**Version Format**: Development versions follow the pattern `1.0.X.devN` where:
- `1.0.X` is the base version
- `devN` indicates the development iteration (auto-incremented)

**Deployment Pipeline**:
```
Code Push → GitHub Actions → Build → TestPyPI → This Notebook → Production PyPI
```

In [None]:
# =============================================================================
# WIDGET PARAMETER VALIDATION
# =============================================================================
#
# Each parameter has one checker that returns at most one message. Messages
# that start with the cross mark are critical and abort the run; the others
# are warnings that are only printed.


def _check_datasets_path(path):
    """Return a problem message for the sample datasets base path, or None."""
    if not path:
        return "‚ùå Sample datasets base path cannot be empty"
    if not path.startswith("/Volumes/"):
        return "‚ö†Ô∏è  Sample datasets path should start with /Volumes/ for Unity Catalog volumes"
    return None


def _check_pyforge_version(version):
    """Return a problem message for the requested PyForge version, or None."""
    if not version:
        return "‚ùå PyForge version cannot be empty"
    if not any(char.isdigit() for char in version):
        return "‚ùå PyForge version should contain version numbers"
    if "dev" not in version:
        return "‚ö†Ô∏è  Expected development version format (e.g., 1.0.8.dev8)"
    return None


def _check_username(username):
    """Return a problem message for the Databricks username, or None."""
    if not username:
        return "‚ùå Databricks username cannot be empty"
    if not ("@" in username or "-" in username):
        return "‚ö†Ô∏è  Username format may be incorrect (expected email or ID format)"
    return None


# Collect the findings in a fixed order: path, version, username.
validation_errors = [
    message
    for message in (
        _check_datasets_path(SAMPLE_DATASETS_BASE_PATH),
        _check_pyforge_version(PYFORGE_VERSION),
        _check_username(DATABRICKS_USERNAME),
    )
    if message is not None
]

if not validation_errors:
    print("‚úÖ All widget parameters validated successfully!")
else:
    print("‚ö†Ô∏è  PARAMETER VALIDATION WARNINGS:")
    for finding in validation_errors:
        print(f"   {finding}")
    print("\nüìù Please review the widget parameters above and update if needed.")

    # Only findings marked as critical stop the notebook.
    critical_errors = [e for e in validation_errors if e.startswith("‚ùå")]
    if critical_errors:
        raise ValueError(f"Critical validation errors found: {critical_errors}")

print(f"\nüì¶ Will install PyForge CLI {PYFORGE_VERSION} from TestPyPI")
print(f"üîó TestPyPI URL: {TESTPYPI_URL}")
print(f"üíæ Output will be saved to: {CONVERTED_OUTPUT_PATH}")
print("\nüöÄ Ready to proceed with TestPyPI installation!")

In [None]:
# =============================================================================
# ENVIRONMENT VERIFICATION
# =============================================================================
#
# Confirms the notebook runs on Databricks, reports the current user, and
# probes the directory that contains the sample datasets folder.
# Only a missing `dbutils` aborts the run (RuntimeError). The user lookup and
# the volume probe are informational: a failure there is printed and the
# notebook continues.

import os
import subprocess
import json
from datetime import datetime

print("üîç Verifying Databricks Serverless environment...")

# `dbutils` is a global provided by the notebook environment; if the bare name
# cannot be resolved we are not on Databricks. The try body is kept to this
# single lookup so that only this condition is reported as "not Databricks".
try:
    dbutils
except NameError:
    print("‚ùå Not running in Databricks environment")
    print("   This notebook is designed for Databricks Serverless only")
    raise RuntimeError("This notebook requires Databricks environment")

print("‚úÖ Running in Databricks environment")

# Current user: purely informational, so a failure of this lookup must not
# stop the notebook.
try:
    current_user = (
        dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
    )
    print(f"   Current user: {current_user}")
except Exception as e:
    current_user = None
    print(f"   Current user: unavailable ({e})")

# Verify Unity Catalog volume access
try:
    # Drop the last path component to get the directory that holds the
    # datasets folder. Trailing slashes are stripped first: with one present
    # (as in the widget default) split('/') ends in '' and [:-1] would remove
    # only that empty element, leaving the path unchanged.
    volume_root = SAMPLE_DATASETS_BASE_PATH.rstrip('/').split('/')[:-1]
    volume_path = '/'.join(volume_root) + '/'
    # Rewrite only the leading '/Volumes/' (count=1), never a later occurrence.
    dbutils.fs.ls(volume_path.replace('/Volumes/', 'dbfs:/Volumes/', 1))
    print(f"‚úÖ Unity Catalog volume access confirmed: {volume_path}")
except Exception as e:
    print(f"‚ö†Ô∏è  Unity Catalog volume access warning: {e}")
    print(f"   Will attempt to create directories during setup")

print(f"\nüïê TestPyPI testing started at: {datetime.now()}")
print(f"üì¶ Target PyForge version: {PYFORGE_VERSION}")
print(f"üîó Installation source: TestPyPI (development repository)")

In [None]:
# =============================================================================
# INSTALLATION FROM TESTPYPI WITH FALLBACK TO PYPI
# =============================================================================
#
# Announces the installation, runs a single `%pip install` of the requested
# pyforge-cli version, and prints a summary. The Python restart announced
# below is NOT performed in this cell; it is the `dbutils.library.restartPython()`
# call in the next cell.

print(f"üì¶ Installing PyForge CLI from TestPyPI...")
print(f"   Target version: {PYFORGE_VERSION}")
print(f"   Primary source: {TESTPYPI_URL}")
print(f"   Fallback source: {PYPI_FALLBACK_URL}")
print(f"   Using --no-cache-dir to ensure fresh installation")

# Install PyForge CLI from TestPyPI with fallback to PyPI for dependencies
# The {NAME} placeholders refer to variables set in the configuration cell.
# NOTE(review): this relies on the notebook expanding {VAR} inside magic
# commands and on `%pip` being accepted after other statements in the same
# cell -- confirm both on the target Databricks runtime.
%pip install pyforge-cli=={PYFORGE_VERSION} --index-url {TESTPYPI_URL} --extra-index-url {PYPI_FALLBACK_URL} --no-cache-dir --quiet

# NOTE(review): the success message is printed unconditionally; it is only
# truthful if a failed `%pip install` aborts the cell before this line -- confirm.
print(f"‚úÖ PyForge CLI {PYFORGE_VERSION} installed successfully from TestPyPI!")
print("üîÑ Restarting Python environment to ensure clean import...")

# Display installation info
# These lines are static text; nothing here inspects what pip actually installed.
print(f"\nüìã Installation Summary:")
print(f"   Package: pyforge-cli=={PYFORGE_VERSION}")
print(f"   Source: TestPyPI (development repository)")
print(f"   Dependencies: Resolved from PyPI fallback")
print(f"   Environment: Databricks Serverless")
print(f"   Cache: Disabled for fresh installation")

In [None]:
# Restart Python to ensure clean environment.
# Variables defined by earlier cells are gone after this call, which is why the
# following cell reads the configuration from the widgets again.
dbutils.library.restartPython()

In [None]:
# =============================================================================
# VARIABLE RE-INITIALIZATION AFTER PYTHON RESTART
# =============================================================================
#
# The Python restart discarded every variable, but the widgets are still
# there, so the configuration constants are rebuilt from them here.


def _widget_is_true(widget_name):
    """Return True when the named widget holds the text "true" (any casing)."""
    return dbutils.widgets.get(widget_name).lower() == "true"


# Raw widget values (dropdown values arrive as strings and are turned into bools).
SAMPLE_DATASETS_BASE_PATH = dbutils.widgets.get("sample_datasets_base_path")
PYFORGE_VERSION = dbutils.widgets.get("pyforge_version")
DATABRICKS_USERNAME = dbutils.widgets.get("databricks_username")
FORCE_CONVERSION = _widget_is_true("force_conversion")
USE_PYSPARK_FOR_CSV = _widget_is_true("use_pyspark_for_csv")
TEST_SMALLEST_FILES_ONLY = _widget_is_true("test_smallest_files_only")

# Derived paths: the dataset path is the base path without trailing slashes,
# and the output path swaps the '/sample-datasets' segment for the output folder.
SAMPLE_DATASETS_PATH = SAMPLE_DATASETS_BASE_PATH.rstrip('/')
CONVERTED_OUTPUT_PATH = SAMPLE_DATASETS_PATH.replace(
    '/sample-datasets', '/converted_output_testpypi'
)

# Package indexes used for the installation.
TESTPYPI_URL = "https://test.pypi.org/simple/"
PYPI_FALLBACK_URL = "https://pypi.org/simple/"

print("üîÑ Re-initialized configuration variables from widgets after Python restart:")
for _label, _value in (
    ("PyForge Version (TestPyPI)", PYFORGE_VERSION),
    ("Databricks Username", DATABRICKS_USERNAME),
    ("Sample Datasets Base Path", SAMPLE_DATASETS_BASE_PATH),
    ("Sample Datasets Path", SAMPLE_DATASETS_PATH),
    ("Output Path", CONVERTED_OUTPUT_PATH),
    ("Force Conversion", FORCE_CONVERSION),
    ("Use PySpark for CSV", USE_PYSPARK_FOR_CSV),
    ("Test Smallest Files Only", TEST_SMALLEST_FILES_ONLY),
):
    print(f"   {_label}: {_value}")

print("\n‚úÖ Configuration restored from widgets successfully!")
print(f"üöÄ Ready to test PyForge CLI {PYFORGE_VERSION} from TestPyPI!")

In [None]:
# =============================================================================
# VERIFICATION SECTION
# =============================================================================
#
# Imports the freshly installed package, reports where it was loaded from and
# compares its version with the one requested through the widget. A failed
# import is reported and re-raised; a version mismatch only prints a warning.

import subprocess
import time
import os
import pandas as pd
from datetime import datetime
import json

print("üîç Verifying PyForge CLI installation from TestPyPI...")

try:
    import pyforge_cli
    print("‚úÖ PyForge CLI module imported successfully")
    print(f"   Module location: {pyforge_cli.__file__}")
    installed_version = pyforge_cli.__version__
    print(f"   Installed version: {installed_version}")

    # Compare the installed version with the requested one.
    if installed_version != PYFORGE_VERSION:
        print(f"‚ö†Ô∏è  Version mismatch: installed {installed_version}, expected {PYFORGE_VERSION}")
        print("   This might indicate a caching issue or different available version on TestPyPI")
    else:
        print(f"‚úÖ Version verification: {installed_version} matches expected {PYFORGE_VERSION}")

except ImportError as e:
    print(f"‚ùå Failed to import PyForge CLI: {e}")
    print("   Try resetting the environment from the Environment panel")
    raise

for _line in (
    "\nüì¶ TestPyPI Installation Successful!",
    "   Installed from: TestPyPI (development repository)",
    f"   Version: {pyforge_cli.__version__}",
    "   Dependencies: Resolved from PyPI fallback",
):
    print(_line)

In [None]:
%%sh
# Prints a heading, then runs the installed `pyforge` executable with --help.
# This cell only succeeds if `pyforge` can be found by the shell.
echo "üìã PyForge CLI Help Information (TestPyPI Version):"
pyforge --help

In [None]:
%%sh
# Prints a heading, then asks the installed `pyforge` executable for its version,
# as a command-line counterpart to the Python-level version check above.
echo "üìä PyForge CLI Version Information (TestPyPI):"
pyforge --version

In [ ]:
# DBTITLE 1,Check PySpark Availability in Serverless
# =============================================================================
# PYSPARK AVAILABILITY CHECK FOR SERVERLESS
# =============================================================================
#
# This cell was stored as a single line with literal "\n" and "\"" escape
# sequences, which made the whole cell one long comment: the function was never
# defined and the fallback below never ran. It is restored here as real code.


def check_pyspark_availability():
    """Check if PySpark is available in the Databricks Serverless environment.

    Prints a short report while checking.

    Returns:
        bool: True when `pyspark` can be imported and a Spark session can be
        obtained; False when the import fails or the session cannot be created.
    """
    # Keep the try body to the imports so that only a missing PySpark is
    # reported as "not available".
    try:
        import pyspark
        from pyspark.sql import SparkSession
    except ImportError:
        print("‚ùå PySpark is NOT available in this environment")
        print("   CSV files will be converted using pandas")
        return False

    print("‚úÖ PySpark is available in this Databricks Serverless environment")
    print(f"   PySpark Version: {pyspark.__version__}")

    # Try to get or create a Spark session
    try:
        spark = SparkSession.builder.getOrCreate()
        print("   Spark Session: Active")
        print(f"   Spark Version: {spark.version}")

        # Check if it's Spark Connect (serverless): when reading the master
        # through sparkContext fails, the session is reported as serverless.
        try:
            master = spark.sparkContext.master
            print(f"   Spark Master: {master}")
        except Exception:
            print("   Spark Mode: Serverless (Spark Connect)")

        return True
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Could not create Spark session: {e}")
        return False


# Check PySpark availability
pyspark_available = check_pyspark_availability()

# Update USE_PYSPARK_FOR_CSV based on availability: the flag is only ever
# switched off here, never on.
if not pyspark_available and USE_PYSPARK_FOR_CSV:
    print("\n‚ö†Ô∏è  Note: PySpark not available, CSV conversion will fall back to pandas")
    USE_PYSPARK_FOR_CSV = False
elif pyspark_available:
    print("\nüöÄ PySpark is available! PyForge CLI will auto-detect and use PySpark for CSV conversions")
    print("   This is perfect for testing TestPyPI versions with PySpark integration!")