### ML predictor

In [30]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(r'C:\Users\omgha\OneDrive\Documents\GitHub\Libaspace-AI-intern-challenge\Phase2\dataset_with_rules.csv')

print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nBaseline accuracy: {df['is_correct'].mean()*100:.1f}%")
print(f"\nRule engine accuracy: {df['rule_correct'].mean()*100:.1f}%")
errors = df[df['rule_correct'] == 0]
for idx, row in errors.iterrows():
    print(f"  {row['field_id']}: Predicted {row['rule_based_prediction']} but was {row['true_kind']}")

Shape: (23, 16)

Columns: ['tag', 'type', 'options_count', 'has_options', 'keyword_select', 'is_yes_no_question', 'is_consent', 'is_demographic', 'is_required', 'label_length', 'field_id', 'predicted_kind', 'true_kind', 'is_correct', 'rule_based_prediction', 'rule_correct']

Baseline accuracy: 73.9%

Rule engine accuracy: 87.0%
  question_7968651005: Predicted text but was select
  question_7968652005: Predicted text but was select
  4014112005: Predicted text but was select


In [13]:
df.head()

Unnamed: 0,tag,type,options_count,has_options,keyword_select,is_yes_no_question,is_consent,is_demographic,is_required,label_length,field_id,predicted_kind,true_kind,is_correct,rule_based_prediction,rule_correct
0,input,text,0,0,0,0,0,0,1,10,first_name,text,text,1,text,1
1,input,text,0,0,0,0,0,0,1,9,last_name,text,text,1,text,1
2,input,text,0,0,0,0,0,0,1,5,email,text,text,1,text,1
3,input,text,2,1,0,0,0,0,1,8,country,select,select,1,select,1
4,input,tel,0,0,0,0,0,0,1,5,phone,text,text,1,text,1


### Prepare Features for ML model

In [20]:
feature_cols = [
    'options_count',     #How many options
    'has_options',       #Has options flag
    'is_yes_no_question',#Yes/No label
    'keyword_select',    #Select Keyword
    'is_required',       #Required field
    'label_length',      #Length of label
    # Additional features can be added here
    # 'is_consent'       #Consent keyword flag
    # 'is_demographic'   #Demographic keyword flag
    ]


print(f"Using {len(feature_cols)} features:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i}. {col}")

x = df[feature_cols].copy()
y = df['true_kind'].copy()

print(f"\nFeature matrix shape: {x.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"\nFeature statistics:")
print(x.describe())

Using 6 features:
  1. options_count
  2. has_options
  3. is_yes_no_question
  4. keyword_select
  5. is_required
  6. label_length

Feature matrix shape: (23, 6)
Target shape: (23,)

Target distribution:
true_kind
select    12
text      11
Name: count, dtype: int64

Feature statistics:
       options_count  has_options  ...  is_required  label_length
count      23.000000    23.000000  ...         23.0     23.000000
mean        0.521739     0.260870  ...          1.0     19.130435
std         0.897956     0.448978  ...          0.0     12.491104
min         0.000000     0.000000  ...          1.0      4.000000
25%         0.000000     0.000000  ...          1.0      8.000000
50%         0.000000     0.000000  ...          1.0     16.000000
75%         1.000000     0.500000  ...          1.0     27.000000
max         2.000000     1.000000  ...          1.0     45.000000

[8 rows x 6 columns]


### Decision Tree Classifier

In [23]:
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=2)
dt_model.fit(x, y)

dt_preds = dt_model.predict(x)
dt_prob_preds = dt_model.predict_proba(x)

dt_accuracy = accuracy_score(y, dt_preds)
dt_correct = (dt_preds == y).sum()

print(f"\nDecision Tree Training Accuracy: {dt_accuracy*100:.2f}% ({dt_correct}/{len(y)})")
print("\nClassification Report:")
print(classification_report(y, dt_preds))
print("Confusion Matrix:")
print(confusion_matrix(y, dt_preds))
print("\nFeature Importances:")
dt_importances = pd.Series(dt_model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print(dt_importances)


Decision Tree Training Accuracy: 100.00% (23/23)

Classification Report:
              precision    recall  f1-score   support

      select       1.00      1.00      1.00        12
        text       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Confusion Matrix:
[[12  0]
 [ 0 11]]

Feature Importances:
label_length          0.768368
has_options           0.207431
is_yes_no_question    0.024200
options_count         0.000000
keyword_select        0.000000
is_required           0.000000
dtype: float64


### Random Forest Classifier

In [24]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=10, max_depth=5, min_samples_split=2)
rf_model.fit(x, y)

rf_preds = rf_model.predict(x)
rf_prob_preds = rf_model.predict_proba(x)

rf_accuracy = accuracy_score(y, rf_preds)
rf_correct = (rf_preds == y).sum()

print(f"\nRandom Forest Training Accuracy: {rf_accuracy*100:.2f}% ({rf_correct}/{len(y)})")
print("\nClassification Report:")
print(classification_report(y, rf_preds))
print("Confusion Matrix:")
print(confusion_matrix(y, rf_preds))
print("\nFeature Importances:")
rf_importances = pd.Series(rf_model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print(rf_importances)


Random Forest Training Accuracy: 100.00% (23/23)

Classification Report:
              precision    recall  f1-score   support

      select       1.00      1.00      1.00        12
        text       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Confusion Matrix:
[[12  0]
 [ 0 11]]

Feature Importances:
label_length          0.570758
is_yes_no_question    0.202151
options_count         0.149450
has_options           0.077641
keyword_select        0.000000
is_required           0.000000
dtype: float64


### Progress

In [26]:
# Phase 1: Baseline (Playwright predictions)
phase1_accuracy = (df['predicted_kind'] == df['true_kind']).mean()
phase1_correct = (df['predicted_kind'] == df['true_kind']).sum()

print(f"\nPhase 1 (Baseline - Playwright):")
print(f"  Accuracy: {phase1_accuracy*100:.1f}% ({int(phase1_correct)}/{len(df)})")

# Phase 2: Rules based engine
phase2_accuracy = df['rule_correct'].mean()
phase2_correct = df['rule_correct'].sum()

print(f"\nPhase 2 (Rules):")
print(f"  Accuracy: {phase2_accuracy*100:.1f}% ({int(phase2_correct)}/{len(df)})")
print(f"  Improvement from Phase 1: +{(phase2_accuracy - phase1_accuracy)*100:.1f}%")

# Phase 3: ML Models
print(f"\nPhase 3 (Decision Tree):")
print(f"  Accuracy: {dt_accuracy*100:.1f}% ({int(dt_correct)}/{len(y)})")
print(f"  Improvement from Phase 2: {(dt_accuracy - phase2_accuracy)*100:+.1f}%")

print(f"\nPhase 3 (Random Forest):")
print(f"  Accuracy: {rf_accuracy*100:.1f}% ({int(rf_correct)}/{len(y)})")
print(f"  Improvement from Phase 2: {(rf_accuracy - phase2_accuracy)*100:+.1f}%")

# Overall summary
print(f"\n" + "="*70)
print(f"OVERALL PROGRESSION")
print(f"="*70)
print(f"Phase 1 (Baseline):        {phase1_accuracy*100:6.1f}%")
print(f"Phase 2 (Rules):           {phase2_accuracy*100:6.1f}% (↑ {(phase2_accuracy-phase1_accuracy)*100:+.1f}%)")
print(f"Phase 3 (Decision Tree):   {dt_accuracy*100:6.1f}% (↑ {(dt_accuracy-phase2_accuracy)*100:+.1f}%)")
print(f"Phase 3 (Random Forest):   {rf_accuracy*100:6.1f}% (↑ {(rf_accuracy-phase2_accuracy)*100:+.1f}%)")
print(f"\nBest: {'Decision Tree' if dt_accuracy >= rf_accuracy else 'Random Forest'} at {max(dt_accuracy, rf_accuracy)*100:.1f}%")


Phase 1 (Baseline - Playwright):
  Accuracy: 73.9% (17/23)

Phase 2 (Rules):
  Accuracy: 87.0% (20/23)
  Improvement from Phase 1: +13.0%

Phase 3 (Decision Tree):
  Accuracy: 100.0% (23/23)
  Improvement from Phase 2: +13.0%

Phase 3 (Random Forest):
  Accuracy: 100.0% (23/23)
  Improvement from Phase 2: +13.0%

OVERALL PROGRESSION
Phase 1 (Baseline):          73.9%
Phase 2 (Rules):             87.0% (↑ +13.0%)
Phase 3 (Decision Tree):    100.0% (↑ +13.0%)
Phase 3 (Random Forest):    100.0% (↑ +13.0%)

Best: Decision Tree at 100.0%


In [28]:
# Choose best model
best_model = rf_model if rf_accuracy >= dt_accuracy else dt_model
best_predictions = rf_preds if rf_accuracy >= dt_accuracy else dt_predictions
best_accuracy = max(rf_accuracy, dt_accuracy)
best_name = 'Random Forest' if rf_accuracy >= dt_accuracy else 'Decision Tree'

print(f"Best Model: {best_name} ({best_accuracy*100:.1f}%)")
print(f"\nRemaining Errors: {(best_predictions != y).sum()}")

errors_phase3 = df.copy()
errors_phase3['phase3_pred'] = best_predictions
errors_phase3['phase3_correct'] = (best_predictions == y)

errors_only = errors_phase3[errors_phase3['phase3_correct'] == False]

if len(errors_only) > 0:
    print(f"\nFields still misclassified by Phase 3:")
    for idx, row in errors_only.iterrows():
        print(f"\n  {row['field_id']}")
        print(f"    True: {row['true_kind']}")
        print(f"    Phase 3 predicted: {row['phase3_pred']}")
        print(f"    Phase 2 predicted: {row['rule_based_prediction']}")
        print(f"    Key features:")
        print(f"      options_count: {row['options_count']}")
        print(f"      is_yes_no_question: {row['is_yes_no_question']}")
else:
    print(f"\nPerfect! No errors in Phase 3!")

Best Model: Random Forest (100.0%)

Remaining Errors: 0

Perfect! No errors in Phase 3!


In [32]:
# Stratified K-Fold for small dataset
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Decision Tree CV
dt_cv_scores = cross_val_score(dt_model, x, y, cv=skf, scoring='accuracy')
print(f"\nDecision Tree:")
print(f"  Fold scores: {[f'{s*100:.1f}%' for s in dt_cv_scores]}")
print(f"  Mean: {dt_cv_scores.mean()*100:.1f}%")
print(f"  Std: {dt_cv_scores.std()*100:.1f}%")

# Random Forest CV
rf_cv_scores = cross_val_score(rf_model, x, y, cv=skf, scoring='accuracy')
print(f"\nRandom Forest:")
print(f"  Fold scores: {[f'{s*100:.1f}%' for s in rf_cv_scores]}")
print(f"  Mean: {rf_cv_scores.mean()*100:.1f}%")
print(f"  Std: {rf_cv_scores.std()*100:.1f}%")

print(f"\nNote: Small dataset (23 samples) means CV scores will vary.")
print(f"In-sample accuracy is more stable and reliable here.")


Decision Tree:
  Fold scores: ['100.0%', '60.0%', '80.0%', '100.0%', '75.0%']
  Mean: 83.0%
  Std: 15.4%

Random Forest:
  Fold scores: ['100.0%', '60.0%', '80.0%', '100.0%', '75.0%']
  Mean: 83.0%
  Std: 15.4%

Note: Small dataset (23 samples) means CV scores will vary.
In-sample accuracy is more stable and reliable here.


In [39]:
# Create results dataframe
results_df = df.copy()
results_df['dt_prediction'] = dt_preds
results_df['rf_prediction'] = rf_preds
results_df['dt_confidence'] = dt_prob_preds.max(axis=1)
results_df['rf_confidence'] = rf_prob_preds.max(axis=1)
results_df['dt_correct'] = (dt_preds == y)
results_df['rf_correct'] = (rf_preds == y)

# Save results
results_df.to_csv('ML_results.csv', index=False)
print("Saved: ML_results.csv")

print(f"\nResults preview:")
print(results_df[['field_id', 'true_kind', 'rule_based_prediction', 'dt_prediction', 'rf_prediction', 'rule_correct', 'dt_correct', 'rf_correct']].head(10).to_string())

Saved: ML_results.csv

Results preview:
              field_id true_kind rule_based_prediction dt_prediction rf_prediction  rule_correct  dt_correct  rf_correct
0           first_name      text                  text          text          text             1        True        True
1            last_name      text                  text          text          text             1        True        True
2                email      text                  text          text          text             1        True        True
3              country    select                select        select        select             1        True        True
4                phone      text                  text          text          text             1        True        True
5  question_7968643005      text                  text          text          text             1        True        True
6  question_7968644005      text                  text          text          text             1        True     