In [1]:
import json
import pandas as pd

### Loading data

In [2]:
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
raw_data = data['raw_form']
standard_data = data['standard_form']

In [4]:
print(f"Raw form: {len(raw_data)} fields")
print(f"Standard data: {len(standard_data)} fields")

Raw form: 23 fields
Standard data: 26 fields


In [5]:
print("Example field raw_data[0]")
field = raw_data[0]
for key, value in field.items():
    if isinstance(value, list):
        print(f"{key}: {type(value).__name__}  (length = {len(value)})")
    else:
        print(f"{key}: {value}")
print("\n")
print("Example field standard_data[0]")
field = standard_data[0]
for key, value in field.items():
    if isinstance(value, list):
        print(f"{key}: {type(value).__name__}  (length = {len(value)})")
    else:
        print(f"{key}: {value}")

Example field raw_data[0]
tag: input
id: first_name
type: text
label: First Name
input_kind: text
options: None
required: True


Example field standard_data[0]
tag: input
id: first_name
type: text
label: First Name
input_kind: text
required: True


### Find Misclassifications

In [6]:
# Building lookup of correct types
correct_types = {}
for field in standard_data:
    field_id = field.get('id')
    input_kind = field.get('input_kind')
    correct_types[field_id] = input_kind

In [7]:
correct_types

{'first_name': 'text',
 'last_name': 'text',
 'email': 'text',
 'country': 'select',
 'phone': 'text',
 'resume': 'text',
 'question_7968643005': 'text',
 'question_7968644005': 'text',
 'question_7968645005': 'text',
 'question_7968646005': 'text',
 'question_7968647005': 'text',
 'question_7968648005': 'select',
 'question_7968649005': 'select',
 'question_7968650005': 'select',
 'question_7968651005': 'select',
 'question_7968652005': 'select',
 'question_7968653005': 'select',
 'question_7968654005': 'select',
 'question_7968655005': 'text',
 'question_7968656005': 'text',
 'question_7968657005': 'select',
 'question_7968658005': 'select',
 '4014112005': 'select',
 '4014113005': 'select',
 'question_7978023005': 'select',
 'question_7978031005': 'text'}

In [8]:
# find misclassifications
misclassifications = []
for field in raw_data:
    field_id = field.get('id')
    # check field that exist in standard form
    if field_id in correct_types:
        raw_kind = field.get('input_kind')
        standard_kind = correct_types[field_id]
    
    if raw_kind != standard_kind:
        misclassifications.append({
                'field_id': field_id,
                'label': field.get('label'),
                'raw_input_kind': raw_kind,
                'standard_input_kind': standard_kind,
            })

In [9]:
print(f" No. of Misclassifications: {len(misclassifications)}")

 No. of Misclassifications: 6


In [10]:
for i, m in enumerate(misclassifications, 1):
    print(f"{i}. Field ID: {m['field_id']}")
    print(f"   Label: {m['label'][:60]}")
    print(f"   Raw predicted: {m['raw_input_kind']:10s} → Should be: {m['standard_input_kind']}")
    print()

1. Field ID: question_7968648005
   Label: Have you ever applied for employment with us?
   Raw predicted: text       → Should be: select

2. Field ID: question_7968649005
   Label: Are you 18 years of age or older?
   Raw predicted: text       → Should be: select

3. Field ID: question_7968650005
   Label: Are you legally eligible for employment?
   Raw predicted: text       → Should be: select

4. Field ID: question_7968651005
   Label: hibu Applicant Statement
   Raw predicted: text       → Should be: select

5. Field ID: question_7968652005
   Label: Drug-Free Workplace Policy
   Raw predicted: text       → Should be: select

6. Field ID: 4014112005
   Label: Gender
   Raw predicted: text       → Should be: select



In [11]:
print(f"Summary: {len(raw_data)} fields, {len(misclassifications)} errors")
print(f"Accuracy: {(len(raw_data) - len(misclassifications)) / len(raw_data) * 100:.1f}%")

Summary: 23 fields, 6 errors
Accuracy: 73.9%


### Examine Misclassification

In [12]:
target_id = "question_7968648005"

In [13]:
raw_field = None
for f in raw_data:
    if f.get('id') == target_id:
        raw_field = f
        break

standard_field = None
for f in standard_data:
    if f.get('id') == target_id:
        standard_field = f
        break


In [14]:
if raw_field and standard_field:
    print("=" * 80)
    print("RAW FORM (What Playwright extracted)")
    print("=" * 80)
    print(f"label: {raw_field.get('label')}")
    print(f"type: {raw_field.get('type')}")
    print(f"input_kind: {raw_field.get('input_kind')}")
    options = raw_field.get('options')
    print(f"options: {options}")
    print()
    
    print("=" * 80)
    print("STANDARD FORM (What's actually correct)")
    print("=" * 80)
    print(f"label: {standard_field.get('label')}")
    print(f"type: {standard_field.get('type')}")
    print(f"input_kind: {standard_field.get('input_kind')}")
    options = standard_field.get('options')
    print(f"options: {options}")
    print()

RAW FORM (What Playwright extracted)
label: Have you ever applied for employment with us?
type: text
input_kind: text
options: None

STANDARD FORM (What's actually correct)
label: Have you ever applied for employment with us?
type: text
input_kind: select
options: ['Yes', 'No']



#### ANALYSIS
The PROBLEM:
- Playwright says it's 'text'
- But it's actually 'select'

The REASON:
- Raw form has 0 options
- Standard form has 2 options
- Standard options are: ['Yes', 'No']

CONCLUSION:
- Playwright didn't capture the dropdown options
- Without options, it looks like a text field
- But the label 'Have you ever applied...' is a yes/no question
- Yes/No questions should be 'select' fields with options

### Extract features from fields

In [15]:
def extract_features(field):
    """
    Convert field dictionary to feature dictionary

    Args:
        field: Dictionary containing field information
    """
    features = {}
    # Feature 1: HTML Tag
    features['tag'] = field.get('tag', '')

    # Feature 2: HTML Type attribute
    features['type'] = field.get('type', '')

    # Feature 3: Number of Options
    options = field.get('options')
    features['options_count'] = len(options) if options else 0
    features['has_options'] = 1 if options else 0

    # Feature 4: Text features - analyze label
    label = (field.get('label') or '').lower()

    # Select keywords
    select_keywords = ['select', 'choose', 'pick', 'option', 'dropdown']
    keyword_count = sum(1 for kw in select_keywords if kw in label)
    features['keyword_select'] = keyword_count

    # Yes/no question patterns - detect questions that expect yes/no answers
    yes_no_patterns = [
        'are you', 'do you', 'have you', 'did you', 'will you',
        'can you', 'would you', 'could you', 'should you'
    ]
    features['is_yes_no_question'] = 1 if any(pattern in label for pattern in yes_no_patterns) else 0

    # Consent/agreement patterns
    consent_keywords = ['accept', 'agree', 'consent', 'certify', 'authorize', 'acknowledge', 'statement', 'policy']
    features['is_consent'] = 1 if any(kw in label for kw in consent_keywords) else 0

    # Demographics patterns
    demographic_keywords = ['gender', 'ethnicity', 'race', 'veteran', 'disability', 'origin']
    features['is_demographic'] = 1 if any(kw in label for kw in demographic_keywords) else 0

    features['is_required'] = 1 if field.get('required') else 0
    features['label_length'] = len(label)

    return features

In [16]:
test_fields = ['first_name', 'country', 'phone', 'question_7968648005']

for field_id in test_fields:
    field = None
    for f in raw_data:
        if f.get('id') == field_id:
            field = f
            break
        
    if field:
        features = extract_features(field)
        print(f"\nField: {field.get('label')[:40]}")
        print(f"ID: {field_id}")
        print(f"Raw input_kind: {field.get('input_kind')}")
        print("Features:")
        for key, value in features.items():
            print(f"  {key:25s}: {value}")


Field: First Name
ID: first_name
Raw input_kind: text
Features:
  tag                      : input
  type                     : text
  options_count            : 0
  has_options              : 0
  keyword_select           : 0
  is_yes_no_question       : 0
  is_consent               : 0
  is_demographic           : 0
  is_required              : 1
  label_length             : 10

Field: Country*
ID: country
Raw input_kind: select
Features:
  tag                      : input
  type                     : text
  options_count            : 2
  has_options              : 1
  keyword_select           : 0
  is_yes_no_question       : 0
  is_consent               : 0
  is_demographic           : 0
  is_required              : 1
  label_length             : 8

Field: Phone
ID: phone
Raw input_kind: text
Features:
  tag                      : input
  type                     : tel
  options_count            : 0
  has_options              : 0
  keyword_select           : 0
  is_yes_no_question  

### Building a Dataframe

In [17]:
dataset = []

for field in raw_data:
    field_id = field.get('id')
    if field_id in correct_types:
        features = extract_features(field)
        features['field_id'] = field_id
        features['predicted_kind'] =  field.get('input_kind')
        features['true_kind'] = correct_types[field_id]
        features['is_correct'] = 1 if features['predicted_kind'] == features['true_kind'] else 0
        dataset.append(features)

In [18]:
df = pd.DataFrame(dataset)

In [19]:
df.shape

(23, 14)

In [20]:
len(df)

23

In [21]:
len(df.columns)

14

In [22]:
df.head()

Unnamed: 0,tag,type,options_count,has_options,keyword_select,is_yes_no_question,is_consent,is_demographic,is_required,label_length,field_id,predicted_kind,true_kind,is_correct
0,input,text,0,0,0,0,0,0,1,10,first_name,text,text,1
1,input,text,0,0,0,0,0,0,1,9,last_name,text,text,1
2,input,text,0,0,0,0,0,0,1,5,email,text,text,1
3,input,text,2,1,0,0,0,0,1,8,country,select,select,1
4,input,tel,0,0,0,0,0,0,1,5,phone,text,text,1


In [23]:
correct = df['is_correct'].sum()
total = len(df)
accuracy = correct/total * 100
accuracy

73.91304347826086

In [24]:
df['predicted_kind'].value_counts()

predicted_kind
text      17
select     6
Name: count, dtype: int64

In [25]:
df['true_kind'].value_counts()

true_kind
select    12
text      11
Name: count, dtype: int64

In [26]:
df.to_csv('dataset.csv', index=False)

### Finding important features

In [27]:
correct_df = df[df['is_correct'] == 1]
misclass_df = df[df['is_correct'] == 0]

In [28]:
print(len(correct_df))
print(len(misclass_df))

17
6


In [29]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [30]:
results = []
for cols in numeric_cols:
    if cols is ['is_correct']:
        continue
    
    correct_mean = correct_df[cols].mean()
    misclass_mean = misclass_df[cols].mean()
    diff = abs(correct_mean - misclass_mean)
    
    results.append({
        'feature': cols,
        'correct_mean': correct_mean,
        'misclass_mean': misclass_mean,
        'difference': diff
    })

In [31]:
results = sorted(results, key= lambda x: x['difference'], reverse=True)

In [32]:
# Print top features
for r in results[:10]:
    print(f"{r['feature']:25s}")
    print(f"  Correct avg:      {r['correct_mean']:8.3f}")
    print(f"  Misclassified avg: {r['misclass_mean']:8.3f}")
    print(f"  Difference:        {r['difference']:8.3f} ← IMPORTANCE")
    print()

label_length             
  Correct avg:        15.647
  Misclassified avg:   29.000
  Difference:          13.353 ← IMPORTANCE

is_correct               
  Correct avg:         1.000
  Misclassified avg:    0.000
  Difference:           1.000 ← IMPORTANCE

options_count            
  Correct avg:         0.706
  Misclassified avg:    0.000
  Difference:           0.706 ← IMPORTANCE

is_yes_no_question       
  Correct avg:         0.118
  Misclassified avg:    0.500
  Difference:           0.382 ← IMPORTANCE

has_options              
  Correct avg:         0.353
  Misclassified avg:    0.000
  Difference:           0.353 ← IMPORTANCE

is_consent               
  Correct avg:         0.000
  Misclassified avg:    0.333
  Difference:           0.333 ← IMPORTANCE

is_demographic           
  Correct avg:         0.059
  Misclassified avg:    0.167
  Difference:           0.108 ← IMPORTANCE

keyword_select           
  Correct avg:         0.000
  Misclassified avg:    0.000
  Difference

#### INTERPRETATION

The MOST IMPORTANT feature is the one with the BIGGEST difference.

Top finding: option_count
  - Correct predictions have options (avg ~0.7-1.0)
  - Misclassified have NO options (avg 0.0)
  - CONCLUSION: If a field has options → it's almost always 'select'

Second finding: has_options
  - Binary version of option_count
  - Same conclusion

Third finding: label_has_yes_no
  - Misclassified are MORE likely to have yes/no in label
  - Makes sense: yes/no questions should be select fields

### Discover Patterns in Errors

In [33]:
misclassifications

[{'field_id': 'question_7968648005',
  'label': 'Have you ever applied for employment with us?',
  'raw_input_kind': 'text',
  'standard_input_kind': 'select'},
 {'field_id': 'question_7968649005',
  'label': 'Are you 18 years of age or older?',
  'raw_input_kind': 'text',
  'standard_input_kind': 'select'},
 {'field_id': 'question_7968650005',
  'label': 'Are you legally eligible for employment?',
  'raw_input_kind': 'text',
  'standard_input_kind': 'select'},
 {'field_id': 'question_7968651005',
  'label': 'hibu Applicant Statement',
  'raw_input_kind': 'text',
  'standard_input_kind': 'select'},
 {'field_id': 'question_7968652005',
  'label': 'Drug-Free Workplace Policy',
  'raw_input_kind': 'text',
  'standard_input_kind': 'select'},
 {'field_id': '4014112005',
  'label': 'Gender',
  'raw_input_kind': 'text',
  'standard_input_kind': 'select'}]

In [34]:
misclassified_ids = df[df['is_correct'] == 0]['field_id'].tolist()
misclassified_ids

['question_7968648005',
 'question_7968649005',
 'question_7968650005',
 'question_7968651005',
 'question_7968652005',
 '4014112005']

In [35]:
patterns = {
    'yes_or_no_questions': [],
    'consent_questions': [],
    'demographics': [],
    'other': []
}

In [36]:
for misclass_id in misclassified_ids:
    # Find in both datasets
    raw_field = next((f for f in raw_data if f['id'] == misclass_id), None)
    std_field = next((f for f in standard_data if f['id'] == misclass_id), None)
    
    if raw_field and std_field:
        label = raw_field.get('label', '')
        raw_kind = raw_field.get('input_kind')
        std_kind = std_field.get('input_kind')
        std_options = std_field.get('options', [])
        
        print(f"Field ID: {misclass_id}")
        print(f"  Label: {label[:60]}")
        print(f"  Classification: {raw_kind} → {std_kind}")
        print(f"  Options: {std_options}")
        
        # Classify the pattern
        label_lower = label.lower()
        
        if any(q in label_lower for q in ['are you', 'do you']):
            print(f"  PATTERN: YES/NO QUESTION")
            patterns['yes_or_no_questions'].append(misclass_id)
        elif any(word in label_lower for word in ['agree', 'accept', 'certify', 'consent', 'authorize']):
            print(f"  PATTERN: CONSENT/AGREEMENT")
            patterns['consent_agreement'].append(misclass_id)
        elif any(demo in label_lower for demo in ['gender', 'ethnic', 'race', 'origin']):
            print(f"  PATTERN: DEMOGRAPHIC")
            patterns['demographics'].append(misclass_id)
        else:
            print(f"  PATTERN: OTHER")
            patterns['other'].append(misclass_id)
        
        print()

Field ID: question_7968648005
  Label: Have you ever applied for employment with us?
  Classification: text → select
  Options: ['Yes', 'No']
  PATTERN: OTHER

Field ID: question_7968649005
  Label: Are you 18 years of age or older?
  Classification: text → select
  Options: ['Yes', 'No']
  PATTERN: YES/NO QUESTION

Field ID: question_7968650005
  Label: Are you legally eligible for employment?
  Classification: text → select
  Options: ['Yes', 'No']
  PATTERN: YES/NO QUESTION

Field ID: question_7968651005
  Label: hibu Applicant Statement
  Classification: text → select
  Options: ['I Accept', 'I Decline']
  PATTERN: OTHER

Field ID: question_7968652005
  Label: Drug-Free Workplace Policy
  Classification: text → select
  Options: ['I Accept', 'I Decline']
  PATTERN: OTHER

Field ID: 4014112005
  Label: Gender
  Classification: text → select
  Options: ['Male', 'Female', 'I do not wish to answer']
  PATTERN: DEMOGRAPHIC



#### PATTERN SUMMARY

- Yes/No Questions: 3 cases
    - Have you ever applied for employment with us?
    - Are you 18 years of age or older?
    - Are you legally eligible for employment in the United States?

- Consent/Agreement: 2 cases
    - hibu Applicant Statement PLEASE READ CAREFULLY...
    - To View a copy of the hibu Drug-Free Workplace/Use of Alcohol Policy...

- Demographics: 1 case
    - Gender*

- Other: 0 cases

#### KEY INSIGHT

All 6 misclassifications follow PREDICTABLE PATTERNS:
  1. Yes/No questions (3 fields) - have labels like 'Are you...?'
  2. Consent fields (2 fields) - have labels with 'agree', 'accept', 'certify'
  3. Demographics (1 field) - gender/ethnicity fields