In [1]:
# Bismillah 
# ============================================================================
# Cell 1: Imports and Setup
# ============================================================================
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path

# Set to True to see detailed debug information
DEBUG = True

  from pandas.core import (


In [2]:
# ============================================================================
# Cell 2: Core Geometry Validation Functions
# ============================================================================

def _evaluate_geo_expression(expression: str, variables: dict) -> float:
    """
    Evaluate one coordinate expression taken from a .geo ``Point`` definition.

    Args:
        expression: Raw text of a single coordinate (e.g. ``"L/2"`` or ``"0.5"``).
        variables: Mapping of variable name -> numeric value parsed from the file.

    Returns:
        The coordinate as a float.

    Raises:
        Exception: Whatever ``eval``/``float`` raise when the expression cannot be
        resolved (unknown identifier, malformed arithmetic, ...). The caller
        decides how to handle the failure.
    """
    expression = expression.strip()

    # Fast paths: a bare variable name or a plain numeric literal.
    if expression in variables:
        return float(variables[expression])
    try:
        return float(expression)
    except ValueError:
        pass

    def _substitute(match):
        name = match.group(0)
        if name in variables:
            # Parenthesise so negative values keep their sign inside arithmetic.
            return f"({variables[name]!r})"
        return name

    # Replace whole identifiers only. Plain str.replace would rewrite the "W"
    # inside "W2" (or the "L" inside another name) and silently yield a wrong number.
    resolved = re.sub(r'\b[A-Za-z_]\w*\b', _substitute, expression)

    # NOTE(security): eval() runs on text read from the .geo file. Builtins are
    # removed and only ``sqrt`` is exposed, but this is not a hardened sandbox;
    # do not feed untrusted files to this function.
    return float(eval(resolved, {"__builtins__": {}}, {"sqrt": lambda x: x ** 0.5}))


def extract_points_from_geo(geo_content: str) -> list:
    """
    Extract point coordinates from the text of a .geo file for validation.

    Simple ``name = number;`` assignments are collected first so that point
    coordinates written as variables or arithmetic (``L/2``, ``-W``) can be
    resolved. Points whose first three coordinates cannot be evaluated are
    skipped (reported when DEBUG is on).

    Args:
        geo_content: Full text of the .geo file.

    Returns:
        List of ``[x, y, z]`` float coordinates, in file order.
    """
    # First, extract variable definitions (signed and exponent literals included).
    variables = {}
    var_pattern = r'\b([A-Za-z_]\w*)\s*=\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*;'
    for match in re.finditer(var_pattern, geo_content):
        var_name = match.group(1)
        var_value = float(match.group(2))
        variables[var_name] = var_value
        if DEBUG:
            print(f"    Variable {var_name} = {var_value}")

    # Lower-case aliases for the common dimension names. setdefault keeps an
    # explicit definition of l/w/h from the file instead of overwriting it.
    for upper_name in ('L', 'W', 'H'):
        if upper_name in variables:
            variables.setdefault(upper_name.lower(), variables[upper_name])

    # Extract points
    point_pattern = r'Point\s*\(\s*(\d+)\s*\)\s*=\s*\{([^}]+)\}'
    points = []

    for match in re.finditer(point_pattern, geo_content, re.IGNORECASE):
        point_id = match.group(1)
        coords_parts = match.group(2).split(',')
        if len(coords_parts) < 3:
            # Not a coordinate definition (e.g. a physical group listing tags).
            continue
        try:
            coords = [
                _evaluate_geo_expression(part, variables)
                for part in coords_parts[:3]
            ]
        except Exception as e:
            if DEBUG:
                print(f"    Failed to parse point {point_id}: {e}")
            continue

        points.append(coords)
        if DEBUG:
            print(f"    Point {point_id}: {coords}")

    return points


def _count_geo_keyword(content: str, keyword: str) -> int:
    """Count ``keyword(`` occurrences in .geo text, tolerating spaces before ``(``."""
    return len(re.findall(rf'{re.escape(keyword)}\s*\(', content))


def validate_square_bar_geometry(geo_path: str) -> dict:
    """
    Validates square bar geometry by checking:
    1. Structural completeness (points, lines, surfaces, volume)
    2. Dimensional accuracy (10cm x 1cm x 1cm)
    3. Geometric relationships

    Args:
        geo_path: Path to the .geo file to validate.

    Returns:
        dict with at least ``file_readable`` and ``score``. When the file is
        readable it also holds ``structural_elements``, ``dimensions``,
        ``issues``, ``structural_score``, ``dimension_score`` and ``category``.
        The overall score is 0.4 * structural + 0.6 * dimensional.
    """
    try:
        with open(geo_path, 'r') as f:
            content = f.read()
        if DEBUG:
            print(f"  Successfully read {len(content)} characters from {geo_path}")
    except Exception as e:
        if DEBUG:
            print(f"  Failed to read {geo_path}: {e}")
        return {
            'file_readable': False,
            'score': 0.0
        }

    validation = {
        'file_readable': True,
        'structural_elements': {},
        'dimensions': {'status': '✗ Not checked'},  # Initialize with default
        'issues': [],
        'score': 0.0,
        'structural_score': 0.0,  # Initialize
        'dimension_score': 0.0,   # Initialize
        'category': 'Poor'        # Default category
    }

    # Check structural elements.
    # "Plane Surface(" already contains "Surface(", so surfaces are counted with a
    # single pattern; adding a separate "Plane Surface(" count would count every
    # plane surface twice and let a half-finished box pass as complete.
    # NOTE(review): "Physical <Entity>(" groups still match these patterns, as
    # they did with the previous substring counts.
    elements = {
        'points': _count_geo_keyword(content, 'Point'),
        'lines': _count_geo_keyword(content, 'Line'),
        'surfaces': _count_geo_keyword(content, 'Surface'),
        'volumes': _count_geo_keyword(content, 'Volume')
    }

    expected = {
        'points': 8,    # 8 corners of a box
        'lines': 12,    # 12 edges of a box
        'surfaces': 6,  # 6 faces of a box
        'volumes': 1    # Single volume
    }

    # Calculate structural completeness: each element type is worth 0.25,
    # with proportional partial credit when some are missing.
    structural_score = 0
    for elem, count in elements.items():
        expected_count = expected[elem]
        if count >= expected_count:
            structural_score += 0.25
            validation['structural_elements'][elem] = f"✓ {count}/{expected_count}"
        else:
            structural_score += 0.25 * (count / expected_count)
            validation['structural_elements'][elem] = f"✗ {count}/{expected_count}"
            validation['issues'].append(f"Missing {expected_count - count} {elem}")

    # Check dimensions if we have enough points
    dimension_score = 0
    if elements['points'] >= 8:
        points = extract_points_from_geo(content)

        if DEBUG:
            print(f"  Found {len(points)} points for {geo_path}")
            if points:
                print(f"  First few points: {points[:3]}")

        if len(points) >= 8:
            # Calculate bounding box
            points_array = np.array(points)
            min_coords = points_array.min(axis=0)
            max_coords = points_array.max(axis=0)
            dimensions = max_coords - min_coords

            if DEBUG:
                print(f"  Min coords: {min_coords}")
                print(f"  Max coords: {max_coords}")
                print(f"  Dimensions: {dimensions}")

            # Sort dimensions so orientation of the bar does not matter
            sorted_dims = sorted(dimensions)
            target_dims = [1, 1, 10]  # cm

            # Heuristic: a longest side under 2 is taken to mean metres
            if max(sorted_dims) < 2:
                sorted_dims = [d * 100 for d in sorted_dims]  # Convert to cm

            # Check each side against its target with 10% relative tolerance
            dim_matches = [
                abs(actual - target) / target < 0.1
                for actual, target in zip(sorted_dims, target_dims)
            ]

            if all(dim_matches):
                dimension_score = 1.0
                validation['dimensions']['status'] = f"✓ Correct ({sorted_dims[0]:.1f} x {sorted_dims[1]:.1f} x {sorted_dims[2]:.1f} cm)"
            else:
                dimension_score = sum(dim_matches) / 3
                validation['dimensions']['status'] = f"✗ Got {sorted_dims[0]:.1f} x {sorted_dims[1]:.1f} x {sorted_dims[2]:.1f} cm"
                validation['issues'].append("Incorrect dimensions")
        else:
            # Enough "Point(" strings were present, but too few could be turned
            # into coordinates; report it instead of silently scoring zero.
            validation['dimensions']['status'] = f"✗ Cannot check - only {len(points)} points could be parsed"
            validation['issues'].append("Could not parse enough point coordinates to verify dimensions")
    else:
        validation['dimensions']['status'] = "✗ Cannot check - insufficient points"
        validation['issues'].append("Too few points to verify dimensions")

    # Overall score with clear weights
    validation['structural_score'] = structural_score
    validation['dimension_score'] = dimension_score
    validation['score'] = 0.4 * structural_score + 0.6 * dimension_score

    # Categorize result
    if validation['score'] >= 0.9:
        validation['category'] = 'Excellent'
    elif validation['score'] >= 0.7:
        validation['category'] = 'Good'
    elif validation['score'] >= 0.5:
        validation['category'] = 'Fair'
    else:
        validation['category'] = 'Poor'

    return validation


def _operand_references_volume(operand: str, volume_id: int = 1) -> bool:
    """
    Tell whether a ``Volume{...}`` tag list names ``volume_id`` exactly.

    Handles comma-separated tags and ``a:b`` / ``a:b:step`` ranges. Tags that
    are not integers (e.g. a variable such as ``v``) never match.
    """
    for token in operand.split(','):
        token = token.strip()
        if ':' in token:
            try:
                bounds = [int(part) for part in token.split(':')]
            except ValueError:
                continue
            if len(bounds) < 2:
                continue
            step = bounds[2] if len(bounds) > 2 else 1
            if step > 0 and volume_id in range(bounds[0], bounds[1] + 1, step):
                return True
        elif token == str(volume_id):
            return True
    return False


def _resolve_cylinder_radius(radius_expr: str, content: str):
    """
    Turn a cylinder radius parameter into a float.

    Tries a numeric literal first, then looks for ``<name> = <number>`` in the
    file text. Returns None when neither works.
    """
    try:
        return float(radius_expr)
    except ValueError:
        pass

    # Escape the name and anchor it on the left so that "r" does not match the
    # tail of "outer_r", and so regex metacharacters in the text cannot break
    # or alter the pattern.
    var_pattern = (
        rf'(?<!\w){re.escape(radius_expr)}\s*=\s*'
        r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
    )
    var_match = re.search(var_pattern, content)
    if var_match:
        return float(var_match.group(1))
    return None


def validate_wheel_axle_geometry(geo_path: str) -> dict:
    """
    Validates wheel & axle assembly geometry.
    Key requirements:
    1. Multiple cylinders (2 wheels + 1 axle)
    2. Boolean operations to merge components
    3. Single physical volume for BC application

    Args:
        geo_path: Path to the .geo file to validate.

    Returns:
        dict with at least ``file_readable`` and ``score``. When the file is
        readable it also holds ``components``, ``assembly``, ``issues``,
        ``component_score``, ``assembly_score``, ``category`` and ``note``.
        The overall score is the mean of component and assembly scores.
    """
    try:
        with open(geo_path, 'r') as f:
            content = f.read()
        if DEBUG:
            print(f"  Successfully read {len(content)} characters from {geo_path}")
    except Exception as e:
        if DEBUG:
            print(f"  Failed to read {geo_path}: {e}")
        return {
            'file_readable': False,
            'score': 0.0
        }

    validation = {
        'file_readable': True,
        'components': {},
        'assembly': {},
        'issues': [],
        'score': 0.0,
        'component_score': 0.0,
        'assembly_score': 0.0,
        'category': 'Poor',
        'note': ''
    }

    # Check for cylinder components
    cylinder_pattern = r'Cylinder\s*\(\s*\d+\s*\)\s*=\s*\{([^}]+)\}'
    cylinder_matches = list(re.finditer(cylinder_pattern, content, re.IGNORECASE))

    if DEBUG:
        print(f"  Found {len(cylinder_matches)} cylinder definitions")

    # Analyze cylinders to identify wheels vs axle
    wheel_count = 0
    axle_count = 0

    # Conventional variable names are classified by name alone.
    wheel_names = ('r_wheel', 'rw', 'wheelradius', 'wheel_radius')
    axle_names = ('r_axle', 'ra', 'axleradius', 'axle_radius')

    for match in cylinder_matches:
        params = match.group(1)
        if DEBUG:
            print(f"  Cylinder params: {params}")

        values = [x.strip() for x in params.split(',')]
        if len(values) < 7:
            # Radius is the 7th parameter; without it there is nothing to classify.
            continue
        radius_expr = values[6]

        try:
            if radius_expr.lower() in wheel_names:
                radius, kind = 5.0, 'wheel'  # Known wheel radius
            elif radius_expr.lower() in axle_names:
                radius, kind = 1.0, 'axle'   # Known axle radius
            else:
                radius = _resolve_cylinder_radius(radius_expr, content)
                if radius is None:
                    if DEBUG:
                        print(f"    Could not resolve variable: {radius_expr}")
                    continue
                # Classify by radius
                if 3 < radius < 7:
                    kind = 'wheel'
                elif 0.5 < radius < 2:
                    kind = 'axle'
                else:
                    kind = None
        except Exception as e:
            if DEBUG:
                print(f"    Failed to parse cylinder: {e}")
            continue

        if kind == 'wheel':
            wheel_count += 1
        elif kind == 'axle':
            axle_count += 1

        if DEBUG:
            # Report the classification that was actually applied.
            print(f"    Radius: {radius} -> {kind or 'unclassified'}")

    # Component scoring
    component_score = 0
    if wheel_count >= 2:
        component_score += 0.4
        validation['components']['wheels'] = f"✓ {wheel_count} wheels found"
    else:
        validation['components']['wheels'] = f"✗ Only {wheel_count} wheels found"
        validation['issues'].append(f"Need 2 wheels, found {wheel_count}")

    if axle_count >= 1:
        component_score += 0.3
        validation['components']['axle'] = f"✓ {axle_count} axle(s) found"
    else:
        validation['components']['axle'] = "✗ No axle found"
        validation['issues'].append("Missing axle component")

    # Check for volumes
    volume_count = content.count('Volume(')
    if volume_count >= 1:
        component_score += 0.3
        validation['components']['volumes'] = f"✓ {volume_count} volume(s)"
    else:
        validation['components']['volumes'] = "✗ No volumes defined"
        validation['issues'].append("No volumes defined")

    # Assembly scoring (Boolean operations and physical volume)
    assembly_score = 0

    has_boolean = 'BooleanUnion' in content or 'BooleanDifference' in content

    if has_boolean:
        # Find all BooleanUnion operations; groups are the two Volume{...} tag lists
        boolean_pattern = r'BooleanUnion\s*\{\s*Volume\s*\{([^}]+)\}[^}]*\}\s*\{\s*Volume\s*\{([^}]+)\}[^}]*\}'
        boolean_ops = re.findall(boolean_pattern, content)

        if len(boolean_ops) > 0:
            # Check for the GPT-4o error: Volume{1} as the object of several
            # operations. Tags are compared exactly, so Volume{10} or Volume{21}
            # are not mistaken for Volume{1}.
            volume_1_count = sum(
                1 for op in boolean_ops if _operand_references_volume(op[0], 1)
            )

            if volume_1_count > 1:
                # This is the GPT-4o error pattern
                assembly_score += 0.2  # Partial credit only
                validation['assembly']['boolean_ops'] = "⚠ Boolean present but fails (Volume{1} reused)"
                validation['issues'].append("Boolean will fail: Volume{1} reused after deletion")
                validation['note'] = "Boolean operations fail in Gmsh"
            else:
                # Boolean operations appear correct
                assembly_score += 0.6  # Full credit
                validation['assembly']['boolean_ops'] = "✓ Boolean operations present"
        else:
            # Has Boolean keyword but couldn't parse operations
            assembly_score += 0.1
            validation['assembly']['boolean_ops'] = "⚠ Boolean syntax unclear"
            validation['issues'].append("Boolean operations present but syntax unclear")
    else:
        validation['assembly']['boolean_ops'] = "✗ No Boolean operations"
        validation['issues'].append("No Boolean operations - components not merged")

    # Check for physical volume
    phys_vol_count = content.count('Physical Volume')
    if phys_vol_count == 1:
        assembly_score += 0.4
        validation['assembly']['physical_volume'] = "✓ Single physical volume"
    elif phys_vol_count > 1:
        assembly_score += 0.2
        validation['assembly']['physical_volume'] = f"⚠ {phys_vol_count} physical volumes"
        validation['issues'].append("Multiple physical volumes - should be merged")
    else:
        validation['assembly']['physical_volume'] = "✗ No physical volume"
        validation['issues'].append("No physical volume defined")

    # Overall scoring
    validation['component_score'] = component_score
    validation['assembly_score'] = assembly_score
    validation['score'] = 0.5 * component_score + 0.5 * assembly_score

    # Categorize
    if validation['score'] >= 0.9:
        validation['category'] = 'Excellent'
    elif validation['score'] >= 0.7:
        validation['category'] = 'Good'
    elif validation['score'] >= 0.5:
        validation['category'] = 'Fair'
    else:
        validation['category'] = 'Poor'

    # Special notes for specific cases (the trailing * marks a caveat)
    if has_boolean and 'Volume{1} reused' in str(validation['issues']):
        # GPT-4o case: has Boolean but they fail
        if validation['score'] > 0.5:
            validation['category'] = 'Fair*'
    elif not has_boolean and validation['score'] > 0.5:
        # Good components but no merge
        validation['category'] = 'Good*'
        validation['note'] = "Geometry incomplete without Boolean operations"

    return validation

In [3]:
# ============================================================================
# Cell 3: Process All LLMs and Generate Results
# ============================================================================

def evaluate_all_geometries():
    """
    Run both geometry validators over every LLM's .geo file.

    Each validator result is condensed into one table row per LLM. Unreadable
    files and unexpected exceptions are turned into "Failed" rows rather than
    aborting the run.

    Returns:
        (df_square, df_wheel): two DataFrames, one row per LLM, for the square
        bar and the wheel & axle test cases respectively.
    """
    separator = "=" * 60

    # (display name, .geo path relative to the working directory)
    square_bar_cases = [
        ("PHI-3 Mini", "PHI_3_MINI/square_bar_phi3.geo"),
        ("Mixtral 8X22B", "MIXTRAL_8X22B/square_bar_mixtral_8x22B_updated.geo"),
        ("Mixtral 8X7B", "MIXTRAL_8X7B/square_bar_mixtral_8x7b_updated.geo"),
        ("LLaMA-3-70B", "LLAMA3_70B/square_bar_llama3_70B.geo"),
        ("LLaMA-3-8B", "LLAMA3_8B/square_bar_llama3_8B_updated.geo"),
        ("LLaMA-2-70B", "LLAMA2_70B/square_bar_llama2_70B_updated.geo"),
        ("GPT-4o", "GPT_4o/square_bar_gpt_4o.geo"),
        ("GPT-4", "GPT_4/square_bar_gpt_4.geo"),
        ("GPT-3.5", "GPT_35/square_bar_gpt_35_rec_1.geo")
    ]

    wheel_axle_cases = [
        ("PHI-3 Mini", "PHI_3_MINI/wheel_axle_phi3.geo"),
        ("Mixtral 8X22B", "MIXTRAL_8X22B/wheel_mixtral_8X22B_updated.geo"),
        ("Mixtral 8X7B", "MIXTRAL_8X7B/wheel_mixtral_8X7B_updated.geo"),
        ("LLaMA-3-70B", "LLAMA3_70B/wheel_llama3_70B_updated.geo"),
        ("LLaMA-3-8B", "LLAMA3_8B/wheel_llama3_8B_updated.geo"),
        ("LLaMA-2-70B", "LLAMA2_70B/wheel_axle_llama2_70B.geo"),
        ("GPT-4o", "GPT_4o/wheel_axle_gpt_4o_rectified_2.geo"),
        ("GPT-4", "GPT_4/wheel_axle_gpt_4.geo"),
        ("GPT-3.5", "GPT_35/wheel_axle_gpt_35_rectified_2.geo")
    ]

    # Sanity check on the working directory; only warns, evaluation still runs
    if not os.path.exists("PHI_3_MINI"):
        subdirectories = [d for d in os.listdir('.') if os.path.isdir(d)]
        print("Warning: PHI_3_MINI directory not found. Please ensure you're running from the correct directory.")
        print(f"Current directory: {os.getcwd()}")
        print("Available directories:", subdirectories)

    # ---- Square bar ------------------------------------------------------
    print(separator)
    print("SQUARE BAR GEOMETRY EVALUATION")
    print(separator)

    square_results = []
    for llm_name, geo_path in square_bar_cases:
        print(f"\nEvaluating {llm_name}...")
        try:
            outcome = validate_square_bar_geometry(geo_path)

            if outcome['file_readable']:
                # Only the leading symbol of the status text goes in the table
                status_text = outcome.get('dimensions', {}).get('status', '✗ Unknown')
                row = {
                    'LLM': llm_name,
                    'Structure': f"{outcome['structural_score']:.0%}",
                    'Dimensions': status_text.split(' ')[0],
                    'Quality': outcome['category'],
                    'Score': f"{outcome['score']:.0%}",
                    'Key Issues': outcome['issues'][0] if outcome['issues'] else 'None'
                }
            else:
                row = {
                    'LLM': llm_name,
                    'Structure': 'File Error',
                    'Dimensions': '-',
                    'Quality': 'Failed',
                    'Score': '0%',
                    'Key Issues': 'Could not read file'
                }
            square_results.append(row)
        except Exception as err:
            print(f"  Error evaluating {llm_name}: {err}")
            square_results.append({
                'LLM': llm_name,
                'Structure': 'Error',
                'Dimensions': '-',
                'Quality': 'Failed',
                'Score': '0%',
                'Key Issues': f'Evaluation error: {str(err)[:50]}'
            })

    df_square = pd.DataFrame(square_results)

    # ---- Wheel & axle ----------------------------------------------------
    print("\n" + separator)
    print("WHEEL & AXLE GEOMETRY EVALUATION")
    print(separator)

    wheel_results = []
    for llm_name, geo_path in wheel_axle_cases:
        print(f"\nEvaluating {llm_name}...")
        try:
            outcome = validate_wheel_axle_geometry(geo_path)

            if DEBUG and outcome['file_readable']:
                print(f"  Components: {outcome.get('component_score', 0):.0%}")
                print(f"  Assembly: {outcome.get('assembly_score', 0):.0%}")
                if 'assembly' in outcome:
                    print(f"  Boolean status: {outcome['assembly'].get('boolean_ops', 'Unknown')}")

            if outcome['file_readable']:
                # Reduce the Boolean status text to its symbol:
                # working, present-but-broken, or absent
                status_text = outcome['assembly'].get('boolean_ops', '')
                boolean_symbol = next(
                    (mark for mark in ('✓', '⚠') if mark in status_text),
                    '✗'
                )
                row = {
                    'LLM': llm_name,
                    'Components': f"{outcome['component_score']:.0%}",
                    'Boolean Ops': boolean_symbol,
                    'Quality': outcome['category'],
                    'Score': f"{outcome['score']:.0%}",
                    'Key Issues': outcome['issues'][0] if outcome['issues'] else 'None'
                }
            else:
                row = {
                    'LLM': llm_name,
                    'Components': 'File Error',
                    'Boolean Ops': '-',
                    'Quality': 'Failed',
                    'Score': '0%',
                    'Key Issues': 'Could not read file'
                }
            wheel_results.append(row)
        except Exception as err:
            print(f"  Error evaluating {llm_name}: {err}")
            wheel_results.append({
                'LLM': llm_name,
                'Components': 'Error',
                'Boolean Ops': '-',
                'Quality': 'Failed',
                'Score': '0%',
                'Key Issues': f'Evaluation error: {str(err)[:50]}'
            })

    df_wheel = pd.DataFrame(wheel_results)

    return df_square, df_wheel

In [4]:
# ============================================================================
# Cell 4: Generate Summary and Analysis
# ============================================================================

def generate_summary_analysis(df_square, df_wheel):
    """
    Generate comprehensive analysis and insights.

    Combines the per-LLM square bar and wheel & axle tables into one summary
    table and prints the key findings.

    Args:
        df_square: Square bar results with 'LLM', 'Quality' and 'Score' columns
            ('Score' formatted as a percentage string such as '85%').
        df_wheel: Wheel & axle results with 'LLM', 'Quality', 'Score' and
            'Boolean Ops' columns. Must contain a row for every LLM that
            appears in ``df_square``.

    Returns:
        DataFrame with columns 'LLM', 'Square Bar', 'Wheel & Axle',
        'Boolean Ops', 'Overall' and 'Notes'.
    """

    print("\n" + "="*60)
    print("SUMMARY ANALYSIS")
    print("="*60)

    # Overall performance summary
    summary_data = []

    for llm in df_square['LLM'].unique():
        sq_row = df_square[df_square['LLM'] == llm].iloc[0]
        wh_row = df_wheel[df_wheel['LLM'] == llm].iloc[0]

        # Convert scores to float for comparison
        sq_score = float(sq_row['Score'].strip('%')) / 100
        wh_score = float(wh_row['Score'].strip('%')) / 100

        # Determine overall capability
        if sq_score >= 0.8 and wh_score >= 0.7:
            capability = 'High'
        elif sq_score >= 0.6 or wh_score >= 0.6:
            capability = 'Medium'
        else:
            capability = 'Low'

        # Check Boolean operation status
        boolean_status = wh_row['Boolean Ops']
        if boolean_status == '✓':
            notes = 'Full CAD capability'
        elif boolean_status == '⚠':
            notes = 'Boolean ops fail (Volume reuse)'
        else:
            notes = 'Limited to single parts'

        summary_data.append({
            'LLM': llm,
            'Square Bar': sq_row['Quality'],
            'Wheel & Axle': wh_row['Quality'],
            'Boolean Ops': boolean_status,
            'Overall': capability,
            'Notes': notes
        })

    # Explicit columns keep the lookups below valid even with no rows
    df_summary = pd.DataFrame(
        summary_data,
        columns=['LLM', 'Square Bar', 'Wheel & Axle', 'Boolean Ops', 'Overall', 'Notes']
    )

    # Key insights
    print("\nKEY FINDINGS:")
    print("-" * 40)

    # Boolean operations
    boolean_attempted = df_summary['Boolean Ops'].str.contains('✓|⚠').sum()
    boolean_working = (df_summary['Boolean Ops'] == '✓').sum()

    print(f"1. Boolean Operations:")
    print(f"   - {boolean_attempted}/{len(df_summary)} LLMs attempted Boolean operations")
    print(f"   - {boolean_working}/{len(df_summary)} LLMs have fully working Boolean operations")
    # df_summary has a default integer index, so LLMs must be selected through
    # the 'LLM' column; testing membership in the index can never find a name.
    failing_llms = df_summary.loc[df_summary['Boolean Ops'] == '⚠', 'LLM'].tolist()
    for failing_llm in failing_llms:
        print(f"   - {failing_llm} attempts Boolean but fails (Volume{{1}} reuse error)")
    # State the conclusion the numbers support rather than a fixed claim
    if boolean_working == 0:
        print(f"   - Result: Assembly merging requires manual intervention for all LLMs")
    else:
        needs_manual = len(df_summary) - boolean_working
        print(f"   - Result: Assembly merging requires manual intervention for {needs_manual}/{len(df_summary)} LLMs")

    # Simple vs complex geometry
    sq_excellent = (df_square['Quality'] == 'Excellent').sum()
    wh_excellent = (df_wheel['Quality'] == 'Excellent').sum()
    print(f"2. Geometry Complexity: {sq_excellent} LLMs excelled at simple geometry vs {wh_excellent} at assemblies")

    # Score distribution
    sq_scores = [float(s.strip('%'))/100 for s in df_square['Score']]
    wh_scores = [float(s.strip('%'))/100 for s in df_wheel['Score']]
    print(f"3. Average Performance: Square bar {np.mean(sq_scores):.0%} vs Wheel & axle {np.mean(wh_scores):.0%}")

    return df_summary

In [5]:
# ============================================================================
# Cell 5: Create Publication-Ready Tables
# ============================================================================

def create_publication_tables(df_square, df_wheel, df_summary):
    """
    Create formatted tables for the paper.

    Prints each table as plain text, writes each one to a CSV file in the
    current working directory, and prints LaTeX versions of the two
    per-geometry tables.

    Args:
        df_square: Square bar evaluation table.
        df_wheel: Wheel & axle evaluation table.
        df_summary: Combined capability summary table.

    Side effects:
        Writes 'square_bar_geometry_results.csv',
        'wheel_axle_geometry_results.csv' and 'geometry_summary.csv'.
    """

    print("\n" + "="*60)
    print("TABLES FOR PUBLICATION")
    print("="*60)

    # Table 2: Square Bar Geometry Results
    print("\nTable 2: Square Bar Geometry Evaluation")
    print("-" * 40)
    print(df_square.to_string(index=False))

    # Save as CSV
    df_square.to_csv('square_bar_geometry_results.csv', index=False)

    # Table 3: Wheel & Axle Geometry Results
    print("\n\nTable 3: Wheel & Axle Assembly Evaluation")
    print("-" * 40)
    print(df_wheel.to_string(index=False))

    # Save as CSV
    df_wheel.to_csv('wheel_axle_geometry_results.csv', index=False)

    # Table 4: Summary Comparison
    print("\n\nTable 4: Overall Geometry Generation Capability")
    print("-" * 40)
    print(df_summary.to_string(index=False))

    # Save as CSV
    df_summary.to_csv('geometry_summary.csv', index=False)

    # LaTeX versions.
    # Cells hold '%' (scores) and may hold '_', '&', '{' or '}' (issue text).
    # Unescaped, '%' starts a LaTeX comment and swallows the rest of the row,
    # so special characters must be escaped for the output to compile.
    # NOTE(review): the status symbols (check mark, cross, warning sign) are
    # emitted as-is and presumably need a Unicode-capable LaTeX setup - confirm.
    print("\n\nLaTeX Table for Square Bar:")
    print(df_square.to_latex(index=False, escape=True))

    print("\n\nLaTeX Table for Wheel & Axle:")
    print(df_wheel.to_latex(index=False, escape=True))

In [6]:
# ============================================================================
# Cell 6: Detailed Analysis of Specific Issues
# ============================================================================

def analyze_specific_issues():
    """
    Print a fixed write-up of the common failure modes.

    The text is static: it lists what the scoring detects, then for each
    failure mode its description, impact, suggested fix and affected models.
    Nothing is computed from evaluation results and nothing is returned.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("DETAILED ISSUE ANALYSIS")
    print(rule)

    print("\nOur scoring correctly identifies:")
    for finding in (
        "1. Syntax errors (PHI-3 Mini)",
        "2. Wrong dimensions (LLaMA-3-8B)",
        "3. Missing Boolean operations (GPT-4, others)",
        "4. Broken Boolean operations (GPT-4o)",
    ):
        print(finding)

    # Common failure modes, in print order:
    # (title, description, impact, fix, affected models)
    failure_modes = [
        (
            'Dimension Errors',
            'Incorrect size or proportions',
            'Geometry unusable for intended analysis',
            'Specify units clearly in prompt',
            ['Mixtral variants', 'PHI-3'],
        ),
        (
            'Missing Boolean Operations',
            'Components not merged into single volume',
            'Cannot apply boundary conditions correctly',
            'Requires manual CAD intervention',
            ['GPT-4, Mixtral variants, LLaMA variants, GPT-3.5, PHI-3'],
        ),
        (
            'Boolean Operation Errors',
            'Boolean syntax present but fails (Volume{1} reused after deletion)',
            'Gmsh error: "Unknown OpenCASCADE entity"',
            'Merge all volumes in single operation: BooleanUnion{Volume{1};Delete;}{Volume{2,3};Delete;}',
            ['GPT-4o'],
        ),
        (
            'Incomplete Structure',
            'Missing required geometric elements',
            'Cannot generate mesh',
            'Add missing elements manually',
            ['PHI-3', 'LLaMA-2-70B'],
        ),
    ]

    for title, description, impact, fix, affected_models in failure_modes:
        affected_text = ', '.join(affected_models)
        print(f"\n{title}:")
        print(f"  Description: {description}")
        print(f"  Impact: {impact}")
        print(f"  Fix: {fix}")
        print(f"  Affected: {affected_text}")

In [7]:
# ============================================================================
# Cell 7: Generate Recommendations
# ============================================================================

def generate_recommendations():
    """
    Print the static list of recommendations for practitioners.

    Emits a banner followed by a fixed block of guidance text. Nothing is
    computed from evaluation results and nothing is returned.
    """
    rule = "=" * 60

    # Banner: blank line, rule, title, rule
    print("\n" + rule, "RECOMMENDATIONS FOR PRACTITIONERS", rule, sep="\n")

    guidance_text = """
    1. FOR SIMPLE GEOMETRIES (Single Parts):
       - Use: GPT-4, GPT-4o, or LLaMA-3-70B
       - These consistently produce correct dimensions and complete structures
       - Minimal post-processing required
    
    2. FOR ASSEMBLIES (Multiple Components):
       - Challenge: No LLM correctly implements Boolean operations
       - GPT-4o attempts but fails (Volume{1} reuse error)
       - GPT-4 generates clean components but doesn't merge
       - Best approach: Use GPT-4 for clean component generation, merge manually
       
       Manual fix options:
       a) Single operation: BooleanUnion{Volume{1};Delete;}{Volume{2,3};Delete;}
       b) With intermediate: v()=BooleanUnion{Volume{1};Delete;}{Volume{2};Delete;}
                            BooleanUnion{Volume{v};Delete;}{Volume{3};Delete;}
    
    3. PROMPT ENGINEERING TIPS:
       - Always specify units explicitly (e.g., "10 centimeters" not "10")
       - Request "BooleanUnion" explicitly for assemblies
       - Include "Physical Volume" requirement for FEA compatibility
    
    4. VALIDATION CHECKLIST:
       ✓ Check element counts match expected topology
       ✓ Verify dimensions are in correct units
       ✓ Confirm single volume for FEA (assemblies)
       ✓ Test mesh generation before simulation
    
    5. COMMON FIXES:
       - Unit conversion: Multiply by 100 if dimensions seem off by 10^2
       - Missing elements: Usually surfaces need "Line Loop" definitions
       - Boolean failures: Check component ordering and overlap
    """

    print(guidance_text)

In [8]:
# ============================================================================
# Cell 8: Main Execution
# ============================================================================

def main():
    """
    Run complete geometry evaluation pipeline.

    Evaluates every geometry, prints the summary analysis, publication tables,
    issue analysis and recommendations, then prints final success-rate
    statistics.

    Returns:
        (df_square, df_wheel, df_summary): the two per-geometry result tables
        and the combined summary table.
    """

    def _rate(count, total):
        # "count/total (NN%)". The ':.0%' format already multiplies by 100, so
        # the ratio is passed unscaled; an empty run prints n/a rather than
        # dividing by zero.
        if total == 0:
            return f"{count}/{total} (n/a)"
        return f"{count}/{total} ({count / total:.0%})"

    # Run evaluations
    df_square, df_wheel = evaluate_all_geometries()

    # Generate summary
    df_summary = generate_summary_analysis(df_square, df_wheel)

    # Create publication tables
    create_publication_tables(df_square, df_wheel, df_summary)

    # Analyze issues
    analyze_specific_issues()

    # Generate recommendations
    generate_recommendations()

    # Final summary statistics
    print("\n" + "="*60)
    print("FINAL STATISTICS")
    print("="*60)

    total_llms = len(df_square)

    # Success rates: only unqualified 'Excellent' / 'Good' count as success
    # (starred categories such as 'Good*' are excluded)
    sq_good = ((df_square['Quality'] == 'Excellent') | (df_square['Quality'] == 'Good')).sum()
    wh_good = ((df_wheel['Quality'] == 'Excellent') | (df_wheel['Quality'] == 'Good')).sum()

    print(f"Square Bar Success Rate: {_rate(sq_good, total_llms)}")
    print(f"Wheel & Axle Success Rate: {_rate(wh_good, total_llms)}")

    # Count Boolean operation attempts and successes
    boolean_attempted = df_summary['Boolean Ops'].str.contains('✓|⚠').sum()
    boolean_working = (df_summary['Boolean Ops'] == '✓').sum()

    print(f"Boolean Operation Attempts: {_rate(boolean_attempted, total_llms)}")
    print(f"Boolean Operation Success: {_rate(boolean_working, total_llms)}")

    if boolean_attempted > 0 and boolean_working == 0:
        print("\nKey Finding: No LLM successfully implemented working Boolean operations for assemblies")

    # Final message
    # NOTE(review): the takeaways below are fixed text and are printed
    # regardless of the statistics computed above.
    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
    print("\nKey Takeaways:")
    print("1. Most modern LLMs can generate valid single-part geometries")
    print("2. NO LLM successfully implements working Boolean operations for assemblies")
    print("3. GPT-4o attempts Boolean operations but fails with Volume{1} reuse error")
    print("4. Manual intervention required for all multi-component assemblies")

    return df_square, df_wheel, df_summary

In [9]:
# ============================================================================
# Cell 9: Test Boolean Validation
# ============================================================================

def test_boolean_validation(test_cases=None):
    """
    Test the Boolean operation validation on known examples.

    For every test case the ``BooleanUnion{ Volume{...}; ... }{ Volume{...}; ... }``
    statements are extracted with a regular expression, and the number of
    operations whose *object* list (the first brace group) contains volume
    tag ``1`` is counted.  More than one such operation is reported as FAIL,
    at least one parsed operation otherwise is reported as PASS, and no
    parsed operation at all is reported as UNKNOWN.

    Args:
        test_cases: Optional list of dicts with the keys ``'name'``,
            ``'content'`` (the .geo snippet to inspect) and ``'expected'``
            (free text printed next to the result).  When omitted, the three
            built-in examples are used.

    Returns:
        None.  Results are printed to stdout.
    """
    print("TESTING BOOLEAN OPERATION VALIDATION")
    print("="*60)
    
    default_cases = [
        {
            'name': 'GPT-4o (Volume reuse error)',
            'content': '''
BooleanUnion{ Volume{1}; Delete; }{ Volume{2}; Delete; }
BooleanUnion{ Volume{1}; Delete; }{ Volume{3}; Delete; }
            ''',
            'expected': 'Should fail - Volume{1} reused'
        },
        {
            'name': 'Correct (single operation)',
            'content': '''
BooleanUnion{ Volume{1}; Delete; }{ Volume{2,3}; Delete; }
            ''',
            'expected': 'Should pass - all volumes merged at once'
        },
        {
            'name': 'Correct (with intermediate)',
            'content': '''
v() = BooleanUnion{ Volume{1}; Delete; }{ Volume{2}; Delete; };
BooleanUnion{ Volume{v}; Delete; }{ Volume{3}; Delete; }
            ''',
            'expected': 'Should pass - uses intermediate variable'
        }
    ]
    cases = default_cases if test_cases is None else test_cases
    
    # Group 1 captures the object volume list, group 2 the tool volume list.
    # The pattern does not depend on the test case, so build it once.
    boolean_pattern = r'BooleanUnion\s*\{\s*Volume\s*\{([^}]+)\}[^}]*\}\s*\{\s*Volume\s*\{([^}]+)\}[^}]*\}'
    
    for test in cases:
        print(f"\n{test['name']}:")
        
        boolean_ops = re.findall(boolean_pattern, test['content'])
        
        # Check for Volume{1} reuse.  Compare whole comma-separated tags
        # rather than substrings, otherwise Volume{10} or Volume{21} would be
        # mistaken for Volume{1}.
        vol1_count = sum(
            1
            for object_tags, _tool_tags in boolean_ops
            if '1' in [tag.strip() for tag in object_tags.split(',')]
        )
        
        if vol1_count > 1:
            print(f"  Result: FAIL - Volume{{1}} appears {vol1_count} times")
        elif len(boolean_ops) > 0:
            print("  Result: PASS - Valid Boolean operations")
        else:
            print("  Result: UNKNOWN - Could not parse Boolean operations")
            
        print(f"  Expected: {test['expected']}")

In [10]:
# ============================================================================
# Cell 10: Main Execution
# ============================================================================

# Run everything
# The guard keeps the evaluation from running if this code is imported as a
# module; the output recorded below this cell shows the guard passing when
# the cell is executed directly.
if __name__ == "__main__":
    # Optional self-check of the Boolean-operation parser; uncomment to run it
    # before the main analysis.
    # test_boolean_validation()
    
    # Run main analysis. main() returns the square-bar results, the
    # wheel-and-axle results and the summary table, in that order; keeping
    # them as module-level names makes them available to later cells.
    df_square, df_wheel, df_summary = main()

SQUARE BAR GEOMETRY EVALUATION

Evaluating PHI-3 Mini...
  Successfully read 1868 characters from PHI_3_MINI/square_bar_phi3.geo
    Variable L = 2.0
    Point 1: [0.0, 0.0, 0.0]
    Point 2: [0.0, 1.0, 0.0]
    Point 3: [-1.414213562373095, -1.414213562373095, 0.0]
  Found 3 points for PHI_3_MINI/square_bar_phi3.geo
  First few points: [[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [-1.414213562373095, -1.414213562373095, 0.0]]

Evaluating Mixtral 8X22B...
  Successfully read 1309 characters from MIXTRAL_8X22B/square_bar_mixtral_8x22B_updated.geo
    Variable L = 10.0
    Variable W = 1.0
    Variable H = 1.0
    Variable lc = 0.1
    Point 1: [0.0, 0.0, 0.0]
    Point 2: [1.0, 0.0, 0.0]
    Point 3: [1.0, 1.0, 0.0]
    Point 4: [0.0, 1.0, 0.0]
    Point 5: [0.0, 0.0, 10.0]
    Point 6: [1.0, 0.0, 10.0]
    Point 7: [1.0, 1.0, 10.0]
    Point 8: [0.0, 1.0, 10.0]
  Found 8 points for MIXTRAL_8X22B/square_bar_mixtral_8x22B_updated.geo
  First few points: [[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 1.0