### Load ML Results

In [8]:
import pandas as pd
import numpy as np

print("="*70)
print("PHASE 4: SIMPLE IMPLEMENTATION")
print("="*70)

# Location of the Phase 3 output that this notebook builds on.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the Phase 3 CSV (ideally a path relative to the repo) before running.
PHASE3_RESULTS_PATH = r'C:\Users\omgha\OneDrive\Documents\GitHub\Libaspace-AI-intern-challenge\Phase3\logistic_regression_results.csv'

# Load Phase 3 results. A missing file is reported and leaves phase3_results
# as None, which the later cells check before doing any work.
try:
    phase3_results = pd.read_csv(PHASE3_RESULTS_PATH)
    print(f"\n Loaded {len(phase3_results)} fields from Phase 3")
except FileNotFoundError:
    # Report the path that was actually tried so the message cannot drift
    # out of sync with the file name being read.
    print(f"Could not find {PHASE3_RESULTS_PATH}")
    print("   Make sure you saved Phase 3 results to CSV")
    phase3_results = None

if phase3_results is not None:
    print("\nPhase 3 Summary:")
    accuracy = phase3_results['lr_correct'].mean()
    print(f"  Accuracy: {accuracy*100:.1f}%")
    # Denominator comes from the data instead of a hard-coded field count.
    print(f"  Correct: {phase3_results['lr_correct'].sum()}/{len(phase3_results)}")

PHASE 4: SIMPLE IMPLEMENTATION

 Loaded 23 fields from Phase 3

Phase 3 Summary:
  Accuracy: 87.0%
  Correct: 20/23


### Check Confidence Levels (Does ML Need Help?)

In [9]:
# Summarise the logistic-regression confidence scores and collect the fields
# whose confidence falls below LOW_CONF_THRESHOLD into `uncertain_fields`.
if phase3_results is not None:
    print("\n" + "="*70)
    print("CONFIDENCE ANALYSIS")
    print("="*70)

    confidence_col = phase3_results['lr_confidence']

    print("\nConfidence Statistics:")
    print(f"  Min: {confidence_col.min():.3f}")
    print(f"  Max: {confidence_col.max():.3f}")
    print(f"  Mean: {confidence_col.mean():.3f}")
    print(f"  Median: {confidence_col.median():.3f}")

    # Find low-confidence cases (strictly below the threshold).
    LOW_CONF_THRESHOLD = 0.85
    uncertain_fields = phase3_results[confidence_col < LOW_CONF_THRESHOLD].copy()

    print(f"\n🔍 Low-Confidence Cases (< {LOW_CONF_THRESHOLD}): {len(uncertain_fields)}")

    if len(uncertain_fields) == 0:
        print("\nALL PREDICTIONS ARE CONFIDENT!")
        print("   → Phase 3 is already good")
        print("   → No LLM needed!")
        print("   → Just submit Phase 3 results")
    else:
        print("\nThese fields need Claude's help:")
        for idx, field in uncertain_fields.iterrows():
            print(f" {field['field_id']:20} | Conf: {field['lr_confidence']:.3f}")


CONFIDENCE ANALYSIS

Confidence Statistics:
  Min: 0.545
  Max: 0.995
  Mean: 0.824
  Median: 0.853

🔍 Low-Confidence Cases (< 0.85): 10

These fields need Claude's help:
 first_name           | Conf: 0.807
 last_name            | Conf: 0.817
 question_7968643005  | Conf: 0.762
 question_7968646005  | Conf: 0.826
 question_7968647005  | Conf: 0.738
 question_7968651005  | Conf: 0.624
 question_7968652005  | Conf: 0.593
 question_7968655005  | Conf: 0.545
 question_7968656005  | Conf: 0.609
 4014112005           | Conf: 0.845


###  Create Simple Prompts (NOT Confusing Few-Shot Examples)

In [10]:
def create_simple_prompt(field_data):
    """
    Create a SIMPLE, CLEAR prompt for Claude
    NO confusing few-shot examples
    NO contradictory patterns

    Just: Field info + ask for classification

    Args:
        field_data: Mapping with a ``.get`` method (e.g. a DataFrame row
            converted with ``to_dict()``). Keys read: 'field_id',
            'options_count', 'has_options', 'is_yes_no_question',
            'lr_prediction' and 'lr_confidence'. Every key is optional.

    Returns:
        The prompt text as a single string.
    """

    def _int_feature(key):
        """Return field_data[key] as an int, or 0 when it is absent or unusable."""
        try:
            return int(field_data.get(key, 0))
        except (TypeError, ValueError, OverflowError):
            # None, NaN (how pandas represents an empty CSV cell), infinities
            # and non-numeric text cannot be converted; treat them as 0
            # instead of aborting prompt generation for the whole batch.
            return 0

    label = field_data.get('field_id', 'Unknown')
    options_count = _int_feature('options_count')
    has_options = _int_feature('has_options')
    is_yes_no = _int_feature('is_yes_no_question')
    predicted = field_data.get('lr_prediction', 'unknown')
    confidence = float(field_data.get('lr_confidence', 0))

    # DOM snapshot: the tag is derived from has_options, not read from real markup
    dom = f"tag:{'select' if has_options else 'input'}, opts:{options_count}, yes_no:{is_yes_no}"

    # Simple, direct prompt - NO EXAMPLES
    prompt = f"""Classify this form field as TEXT input or SELECT dropdown.

Field Label: {label}
DOM: {dom}

Our ML model predicted: {predicted} (confidence: {confidence:.2f})

Based on this field information, is this:
(A) TEXT input - user types a response
(B) SELECT dropdown - user picks from options

Respond with ONE WORD: TEXT or SELECT"""

    return prompt

# Print one copy-and-paste prompt per low-confidence field, preceded by
# instructions for collecting Claude's answers by hand.
if phase3_results is not None and len(uncertain_fields) > 0:
    print("\n" + "="*70)
    print(f"SIMPLE PROMPTS FOR {len(uncertain_fields)} UNCERTAIN FIELDS")
    print("="*70)

    print("\nInstructions:")
    print("1. Go to https://claude.ai")
    print("2. Copy EACH prompt below")
    print("3. Paste into Claude")
    print("4. Record Claude's one-word answer (TEXT or SELECT)")
    print("5. Come back and update Step 4 with responses")

    # Show each prompt, numbered from 1
    for i, (idx, field) in enumerate(uncertain_fields.iterrows(), 1):
        prompt = create_simple_prompt(field.to_dict())
        print(f"\n{'='*70}")
        print(f"PROMPT {i}: {field['field_id']}")
        print(f"{'='*70}")
        print(prompt)
        print("\n→ Copy above, paste into claude.ai, record response")
elif phase3_results is not None:
    # Results loaded, but nothing fell below the confidence threshold.
    print("\n✅ All fields have high confidence - no prompts needed!")


SIMPLE PROMPTS FOR 10 UNCERTAIN FIELDS

Instructions:
1. Go to https://claude.ai
2. Copy EACH prompt below
3. Paste into Claude
4. Record Claude's one-word answer (TEXT or SELECT)
5. Come back and update Step 4 with responses

PROMPT 1: first_name
Classify this form field as TEXT input or SELECT dropdown.

Field Label: first_name
DOM: tag:input, opts:0, yes_no:0

Our ML model predicted: text (confidence: 0.81)

Based on this field information, is this:
(A) TEXT input - user types a response
(B) SELECT dropdown - user picks from options

Respond with ONE WORD: TEXT or SELECT

→ Copy above, paste into claude.ai, record response

PROMPT 2: last_name
Classify this form field as TEXT input or SELECT dropdown.

Field Label: last_name
DOM: tag:input, opts:0, yes_no:0

Our ML model predicted: text (confidence: 0.82)

Based on this field information, is this:
(A) TEXT input - user types a response
(B) SELECT dropdown - user picks from options

Respond with ONE WORD: TEXT or SELECT

‚Üí Copy 

### Record Claude's Responses

In [11]:
# Claude's answers, collected by hand from claude.ai.
# Each entry maps a field_id to the answer given: 'text' or 'select'.
# Leave the dictionary empty to get the how-to instructions instead.

llm_responses = {
    'first_name': 'text',
    'last_name': 'text',
    'question_7968643005': 'text',
    'question_7968646005': 'text',
    'question_7968647005': 'text',
    'question_7968651005': 'text',
    'question_7968652005': 'text',
    'question_7968655005': 'text',
    'question_7968656005': 'text',
    '4014112005': 'text',
}

print("\n" + "=" * 70)
print("STEP 4: RECORD CLAUDE'S RESPONSES")
print("=" * 70)

if llm_responses:
    # Echo what was recorded so typos are easy to spot.
    print(f"\nRecorded {len(llm_responses)} responses from Claude")
    for field_id, response in llm_responses.items():
        print(f"  {field_id}: {response.upper()}")
else:
    # Nothing recorded yet: remind the user how to fill the dictionary in.
    print(
        "\nWaiting for Claude's responses...",
        "\nWhat to do:",
        "1. Go to https://claude.ai",
        "2. For EACH prompt above, ask Claude",
        "3. Record Claude's answer (TEXT or SELECT)",
        "4. Update the 'llm_responses' dictionary above",
        "5. Run this cell again",
        sep="\n",
    )


STEP 4: RECORD CLAUDE'S RESPONSES

Recorded 10 responses from Claude
  first_name: TEXT
  last_name: TEXT
  question_7968643005: TEXT
  question_7968646005: TEXT
  question_7968647005: TEXT
  question_7968651005: TEXT
  question_7968652005: TEXT
  question_7968655005: TEXT
  question_7968656005: TEXT
  4014112005: TEXT


### Apply Claude's Responses to Results

In [12]:
# Build `final_results`: Phase 3 predictions, overridden by Claude's answer for
# every field_id listed in `llm_responses`, plus a `final_correct` column.
if phase3_results is not None:
    # Confidence recorded for any prediction taken from Claude. This is a fixed,
    # hand-chosen value ("trust Claude's answer"), not a measured probability.
    CLAUDE_CONFIDENCE = 0.90

    # Copy Phase 3 results so the loaded frame stays untouched
    final_results = phase3_results.copy()

    print("\n" + "="*70)
    print("APPLYING CLAUDE'S RESPONSES")
    print("="*70)

    # Every field starts from its Phase 3 prediction; Claude's answers (if any)
    # overwrite the matching rows below.
    final_results['final_prediction'] = final_results['lr_prediction']
    final_results['final_confidence'] = final_results['lr_confidence']
    final_results['decision_source'] = 'phase3'

    if len(llm_responses) == 0:
        # No Claude responses yet, use Phase 3 as-is
        print("\n No Claude responses yet")
        print("   Using Phase 3 results as final predictions")
    else:
        print(f"\nApplying {len(llm_responses)} Claude responses...")

        # Compare ids as strings so a numeric-looking id such as '4014112005'
        # still matches if the CSV column was parsed as a number.
        field_ids = final_results['field_id'].astype(str)
        unmatched_ids = []

        for field_id, claude_pred in llm_responses.items():
            mask = field_ids == str(field_id)
            if not mask.any():
                # Most likely a typo in llm_responses; remember it and report below.
                unmatched_ids.append(field_id)
                continue

            # Use Claude's prediction, normalised like the lower-case labels
            final_results.loc[mask, 'final_prediction'] = claude_pred.strip().lower()
            final_results.loc[mask, 'final_confidence'] = CLAUDE_CONFIDENCE
            final_results.loc[mask, 'decision_source'] = 'claude'
            print(f"  Updated {field_id}: Claude said {claude_pred.strip().upper()}")

        if unmatched_ids:
            print(f"\nWARNING: {len(unmatched_ids)} response(s) matched no field_id and were ignored:")
            for field_id in unmatched_ids:
                print(f"  {field_id}")

    # Mark which are correct
    final_results['final_correct'] = final_results['final_prediction'] == final_results['true_kind']

    print("\nApplied all responses")


APPLYING CLAUDE'S RESPONSES

Applying 10 Claude responses...
  Updated first_name: Claude said TEXT
  Updated last_name: Claude said TEXT
  Updated question_7968643005: Claude said TEXT
  Updated question_7968646005: Claude said TEXT
  Updated question_7968647005: Claude said TEXT
  Updated question_7968651005: Claude said TEXT
  Updated question_7968652005: Claude said TEXT
  Updated question_7968655005: Claude said TEXT
  Updated question_7968656005: Claude said TEXT
  Updated 4014112005: Claude said TEXT

Applied all responses


###  Calculate Final Accuracy

In [13]:
# Report Phase 4 accuracy next to the earlier phases and save the final table.
if phase3_results is not None:
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    # Number of evaluated fields, taken from the data instead of hard-coded
    total_fields = len(final_results)

    # Calculate accuracy
    final_accuracy = final_results['final_correct'].mean()
    final_correct = int(final_results['final_correct'].sum())

    # Compare to Phase 3
    phase3_accuracy = phase3_results['lr_correct'].mean()
    phase3_correct = int(phase3_results['lr_correct'].sum())

    print("\nACCURACY PROGRESSION:")
    # Phase 1 and Phase 2 figures are fixed values carried over from earlier
    # notebooks; they are not recomputed here.
    print("  Phase 1 (Baseline):              73.9% (17/23)")
    print("  Phase 2 (Rules):                 87.0% (20/23)")
    print(f"  Phase 3 (Logistic Regression):   {phase3_accuracy*100:6.1f}% ({phase3_correct}/{total_fields})")
    print(f"  Phase 4 (Claude Help):           {final_accuracy*100:6.1f}% ({final_correct}/{total_fields})")

    improvement = (final_accuracy - phase3_accuracy) * 100
    # Branch on the integer count of correct fields, not on the float
    # difference, so "no change" never depends on exact float equality.
    correct_delta = final_correct - phase3_correct
    if correct_delta > 0:
        print(f"\nIMPROVEMENT: +{improvement:.1f}% from Phase 3")
    elif correct_delta == 0:
        print("\n→ No change from Phase 3 (still good!)")
    else:
        print(f"\nSlight decrease: {improvement:.1f}%")

    # Save results (written to the current working directory)
    final_results.to_csv('phase4_final_results.csv', index=False)
    print("\nSaved: phase4_final_results.csv")


FINAL RESULTS

ACCURACY PROGRESSION:
  Phase 1 (Baseline):              73.9% (17/23)
  Phase 2 (Rules):                 87.0% (20/23)
  Phase 3 (Logistic Regression):     87.0% (20/23)
  Phase 4 (Claude Help):             87.0% (20/23)

→ No change from Phase 3 (still good!)

Saved: phase4_final_results.csv


LLM-based prompts can be used to classify these fields, since they tend to be misclassified due to incomplete DOM elements.