### Load the dataset

In [50]:
import os

import numpy as np
import pandas as pd

In [51]:
# Location of the Phase 1 dataset. The default is the original absolute,
# machine-specific path; set the DATASET_PATH environment variable to point the
# notebook at a copy stored elsewhere without editing this cell.
DATASET_PATH = os.environ.get(
    'DATASET_PATH',
    r'C:\Users\omgha\OneDrive\Documents\GitHub\Libaspace-AI-intern-challenge\Phase1\dataset.csv',
)
df = pd.read_csv(DATASET_PATH)

In [52]:
# Preview the first rows to sanity-check that the CSV parsed as expected
df.head()

Unnamed: 0,tag,type,options_count,has_options,keyword_select,is_yes_no_question,is_consent,is_demographic,is_required,label_length,field_id,predicted_kind,true_kind,is_correct
0,input,text,0,0,0,0,0,0,1,10,first_name,text,text,1
1,input,text,0,0,0,0,0,0,1,9,last_name,text,text,1
2,input,text,0,0,0,0,0,0,1,5,email,text,text,1
3,input,text,2,1,0,0,0,0,1,8,country,select,select,1
4,input,tel,0,0,0,0,0,0,1,5,phone,text,text,1


In [53]:
# Column names as a plain Python list (features, prediction, ground truth, correctness flag)
df.columns.tolist()

['tag',
 'type',
 'options_count',
 'has_options',
 'keyword_select',
 'is_yes_no_question',
 'is_consent',
 'is_demographic',
 'is_required',
 'label_length',
 'field_id',
 'predicted_kind',
 'true_kind',
 'is_correct']

In [54]:
# Baseline: share of rows already flagged as correctly classified (is_correct == 1).
total = len(df)
correct = df.loc[df['is_correct'].eq(1)]
baseline_accuracy = len(correct) / total * 100  # expressed on a 0-100 scale
print(f"Overall Accuracy: {baseline_accuracy:.2f}%")

Overall Accuracy: 73.91%


In [55]:
# Isolate the misclassified rows (is_correct == 0) and report how many there are.
is_error = df['is_correct'].eq(0)
errors = df.loc[is_error]
errors_count = len(errors)
error_rate = errors_count / total  # a 0-1 fraction; the :.2% format below scales it
print(f"Total Errors: {errors_count}")
print(f"Error Rate: {error_rate:.2%}")

Total Errors: 6
Error Rate: 26.09%


In [56]:
# Show up to 10 misclassified rows, restricted to the columns used to diagnose them
errors[['field_id', 'predicted_kind', 'true_kind', 'options_count', 'is_yes_no_question']].head(10)


Unnamed: 0,field_id,predicted_kind,true_kind,options_count,is_yes_no_question
10,question_7968648005,text,select,0,1
11,question_7968649005,text,select,0,1
12,question_7968650005,text,select,0,1
13,question_7968651005,text,select,0,0
14,question_7968652005,text,select,0,0
21,4014112005,text,select,0,0


In [57]:
# What the baseline predicted for the misclassified rows
errors['predicted_kind'].value_counts()

predicted_kind
text    6
Name: count, dtype: int64

In [58]:
# What the misclassified rows should have been labelled as
errors['true_kind'].value_counts()

true_kind
select    6
Name: count, dtype: int64

In [59]:
# Count every distinct (predicted -> true) confusion among the errors, so the
# "ALL errors are ..." claim is only printed when exactly one pattern exists.
error_patterns = errors.groupby(['predicted_kind', 'true_kind']).size()
if error_patterns.empty:
    # No misclassified rows: nothing to index into, so say so instead of raising.
    print("\nPattern: no errors to analyse")
elif len(error_patterns) == 1:
    predicted_label, true_label = error_patterns.index[0]
    print(f"\nPattern: ALL errors are {predicted_label} -> {true_label}")
else:
    print("\nPattern: errors fall into several categories:")
    for (predicted_label, true_label), pattern_count in error_patterns.items():
        print(f"  {predicted_label} -> {true_label}: {pattern_count}")


Pattern: ALL errors are text -> select


#### Analysis of Errors
Predicted types in errors (`predicted_kind`):<br>
`text`: 6<br>

Correct types should be (`true_kind`):<br>
`select`: 6<br>
<br>
Pattern: ALL errors are text -> select<br>
<br>
Feature analysis of MISCLASSIFIED fields:<br>
  - Average options_count: 0.00
  - Average is_yes_no_question: 0.50
  - Average has_options: 0.00

Feature analysis of CORRECT fields:<br>
  - Average options_count: 0.71
  - Average is_yes_no_question: 0.06
  - Average has_options: 0.61

### Create Rule Function

In [60]:
def apply_rules_v1(row):
    """
    Classify one form field as 'select' or 'text' using hand-written rules.

    `row` is indexed by column name (a DataFrame row or any mapping works).
    The signals are checked in priority order and evaluation stops at the
    first one that fires:

    1. options_count > 0
    2. has_options == 1
    3. is_yes_no_question == 1
    4. keyword_select > 0

    Returns 'select' if any signal fires, otherwise the default 'text'.
    """
    # `or` short-circuits, so later columns are only read when earlier signals are absent.
    looks_like_select = (
        row['options_count'] > 0
        or row['has_options'] == 1
        or row['is_yes_no_question'] == 1
        or row['keyword_select'] > 0
    )
    return 'select' if looks_like_select else 'text'


# Smoke test: classify the first row of the dataset and print the verdict
# next to its field_id so the result can be checked by eye.
test_row = df.iloc[0]
prediction = apply_rules_v1(test_row)
print(f"Test prediction for {test_row['field_id']}: {prediction}")

Test prediction for first_name: text


In [61]:
# Smoke-test the classifier on a real row and print the verdict.
test_row = df.iloc[0]
prediction = apply_rules_v1(test_row)
print(f"Test prediction for {test_row['field_id']}: {prediction}")

# Exercise every rule branch with synthetic rows: no signal must give 'text',
# and switching on any single signal must give 'select'.
_no_signal = {'options_count': 0, 'has_options': 0, 'is_yes_no_question': 0, 'keyword_select': 0}
assert apply_rules_v1(_no_signal) == 'text'
for _feature in _no_signal:
    assert apply_rules_v1({**_no_signal, _feature: 1}) == 'select', _feature

Test prediction for first_name: text


In [62]:
# Run the rule engine row by row (axis=1 hands each row to apply_rules_v1)
# and store its verdict in a new column on df.
df['rule_based_prediction'] = df.apply(apply_rules_v1, axis=1)

In [63]:
# Side-by-side view of the baseline and rule-engine predictions against the ground truth.
# Left as the cell's final expression so Jupyter renders the frame as a table
# instead of the plain-text dump that print() produces.
df[['field_id', 'predicted_kind', 'rule_based_prediction', 'true_kind']]

               field_id predicted_kind rule_based_prediction true_kind
0            first_name           text                  text      text
1             last_name           text                  text      text
2                 email           text                  text      text
3               country         select                select    select
4                 phone           text                  text      text
5   question_7968643005           text                  text      text
6   question_7968644005           text                  text      text
7   question_7968645005           text                  text      text
8   question_7968646005           text                  text      text
9   question_7968647005           text                  text      text
10  question_7968648005           text                select    select
11  question_7968649005           text                select    select
12  question_7968650005           text                select    select
13  qu

In [64]:
# Calculate accuracy
df['rule_correct'] = (df['rule_based_prediction'] == df['true_kind']).astype(int)
rule_accuracy = df['rule_correct'].mean()*100
correct_count = df['rule_correct'].sum()

In [65]:
# Report rule-engine accuracy and its gain over the baseline.
# The gain is the difference of two percentages, so it is labelled in percentage
# points; a bare "%" would read as a relative improvement, which is a different number.
print(f"Accuracy: {correct_count}/{len(df)} = {rule_accuracy:.1f}%")
print(f"Improvement from baseline: {rule_accuracy - baseline_accuracy:.1f} percentage points")

Accuracy: 20/23 = 87.0%
Improvement from baseline: 13.0%


In [66]:
# Persist df with whatever columns it carries at this point (the rule_based_prediction
# and rule_correct columns are added in earlier cells); index=False omits the row index.
df.to_csv('dataset_with_rules.csv', index=False)

In [67]:
# Export a compact comparison table with shorter column names.
# The path is defined once so the file written and the message printed cannot drift apart.
results_path = 'rule_engine_results.csv'

# rename() maps each column by name, so the new labels cannot be misaligned
# if the selection list above is ever reordered.
output_df = df[
    ['field_id', 'predicted_kind', 'rule_based_prediction', 'true_kind', 'rule_correct']
].rename(columns={
    'predicted_kind': 'playwright',
    'rule_based_prediction': 'rule_engine',
    'true_kind': 'true',
    'rule_correct': 'correct',
})

output_df.to_csv(results_path, index=False)
print(f"\nResults saved to: {results_path}")


Results saved to: rule_engine_results.csv


#### Rule Engine Results
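Baseline (Phase 1): 73.9%<br>
Rule Engine (v1): 87.0%<br>
Improvement: 13.0 percentage points<br>
<br>
Target was: 75-80%<br>
STATUS: Exceeded target!<br>
<br>
Caveat: the rules were designed by inspecting the same 23 rows they are scored on, so 87.0% is an in-sample figure; it should be confirmed on fields that were not used to write the rules.<br>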
