### ML predictor

In [44]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Location of the dataset produced by the rule engine. Set the
# DATASET_WITH_RULES_CSV environment variable to run this notebook on another
# machine; when it is unset the original local path is used.
DATA_PATH = os.environ.get(
    'DATASET_WITH_RULES_CSV',
    r'C:\Users\omgha\OneDrive\Documents\GitHub\Libaspace-AI-intern-challenge\Phase2\dataset_with_rules.csv',
)
df = pd.read_csv(DATA_PATH)

print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nBaseline accuracy: {df['is_correct'].mean()*100:.1f}%")
print(f"\nRule engine accuracy: {df['rule_correct'].mean()*100:.1f}%")

# List every field the rule engine got wrong (rule_correct == 0).
errors = df[df['rule_correct'] == 0]
for field_id, predicted, actual in zip(
    errors['field_id'], errors['rule_based_prediction'], errors['true_kind']
):
    print(f"  {field_id}: Predicted {predicted} but was {actual}")

Shape: (23, 16)

Columns: ['tag', 'type', 'options_count', 'has_options', 'keyword_select', 'is_yes_no_question', 'is_consent', 'is_demographic', 'is_required', 'label_length', 'field_id', 'predicted_kind', 'true_kind', 'is_correct', 'rule_based_prediction', 'rule_correct']

Baseline accuracy: 73.9%

Rule engine accuracy: 87.0%
  question_7968651005: Predicted text but was select
  question_7968652005: Predicted text but was select
  4014112005: Predicted text but was select


In [41]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,tag,type,options_count,has_options,keyword_select,is_yes_no_question,is_consent,is_demographic,is_required,label_length,field_id,predicted_kind,true_kind,is_correct,rule_based_prediction,rule_correct
0,input,text,0,0,0,0,0,0,1,10,first_name,text,text,1,text,1
1,input,text,0,0,0,0,0,0,1,9,last_name,text,text,1,text,1
2,input,text,0,0,0,0,0,0,1,5,email,text,text,1,text,1
3,input,text,2,1,0,0,0,0,1,8,country,select,select,1,select,1
4,input,tel,0,0,0,0,0,0,1,5,phone,text,text,1,text,1


### Prepare Features for ML model

In [20]:
# Columns fed to the model, in feature-matrix order.
feature_cols = [
    'options_count',       # number of options
    'has_options',         # has-options flag
    'is_yes_no_question',  # yes/no label flag
    'keyword_select',      # select keyword flag
    'is_required',         # required-field flag
    'label_length',        # length of the label
    # Candidate extras, currently left out: 'is_consent' (consent keyword
    # flag) and 'is_demographic' (demographic keyword flag).
]

print(f"Using {len(feature_cols)} features:")
for position, name in enumerate(feature_cols, start=1):
    print(f"  {position}. {name}")

# Take independent copies so later cells cannot modify df through x or y.
x = df.loc[:, feature_cols].copy()
y = df.loc[:, 'true_kind'].copy()

# Quick sanity summary of the inputs and the target.
print(f"\nFeature matrix shape: {x.shape}")
print(f"Target shape: {y.shape}")

print(f"\nTarget distribution:")
print(y.value_counts())

print(f"\nFeature statistics:")
print(x.describe())

Using 6 features:
  1. options_count
  2. has_options
  3. is_yes_no_question
  4. keyword_select
  5. is_required
  6. label_length

Feature matrix shape: (23, 6)
Target shape: (23,)

Target distribution:
true_kind
select    12
text      11
Name: count, dtype: int64

Feature statistics:
       options_count  has_options  ...  is_required  label_length
count      23.000000    23.000000  ...         23.0     23.000000
mean        0.521739     0.260870  ...          1.0     19.130435
std         0.897956     0.448978  ...          0.0     12.491104
min         0.000000     0.000000  ...          1.0      4.000000
25%         0.000000     0.000000  ...          1.0      8.000000
50%         0.000000     0.000000  ...          1.0     16.000000
75%         1.000000     0.500000  ...          1.0     27.000000
max         2.000000     1.000000  ...          1.0     45.000000

[8 rows x 6 columns]


### Scale Features

In [45]:
# Standardise every feature column. The scaler is fitted on all rows of x,
# so x_scaled carries statistics from the whole dataset.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

# Rebind x to a labelled DataFrame view of the scaled values for inspection.
x = pd.DataFrame(data=x_scaled, columns=feature_cols)

print(f"\nScaled feature statistics:")
print(x.describe())


Scaled feature statistics:
       options_count   has_options  ...  is_required  label_length
count   2.300000e+01  2.300000e+01  ...         23.0  2.300000e+01
mean   -6.757879e-17 -6.757879e-17  ...          0.0 -9.654113e-18
std     1.022475e+00  1.022475e+00  ...          0.0  1.022475e+00
min    -5.940885e-01 -5.940885e-01  ...          0.0 -1.238520e+00
25%    -5.940885e-01 -5.940885e-01  ...          0.0 -9.110955e-01
50%    -5.940885e-01 -5.940885e-01  ...          0.0 -2.562456e-01
75%     5.445811e-01  5.445811e-01  ...          0.0  6.441730e-01
max     1.683251e+00  1.683251e+00  ...          0.0  2.117585e+00

[8 rows x 6 columns]


### Logistic Regression

In [48]:
# Fit the base logistic-regression model on the full scaled feature matrix.
lr_model = LogisticRegression(
    solver='lbfgs',           # L-BFGS optimiser
    C=1.0,                    # inverse regularization strength
    class_weight='balanced',  # weight classes inversely to their frequency
    max_iter=1000,
    random_state=42,
).fit(x_scaled, y)

# Class probabilities and hard labels for the same rows the model was fitted
# on; the confidence of a prediction is its largest class probability.
lr_proba = lr_model.predict_proba(x_scaled)
lr_predictions = lr_model.predict(x_scaled)
lr_confidence = np.max(lr_proba, axis=1)

# Accuracy on the fitted rows, as a fraction and as a raw count.
lr_correct = (lr_predictions == y).sum()
lr_accuracy = accuracy_score(y, lr_predictions)

print(f"\nRESULTS:")
print(f"Accuracy: {lr_accuracy*100:.1f}% ({int(lr_correct)}/{len(y)})")

print(f"\nConfidence Statistics:")
print(f"  Mean: {lr_confidence.mean():.3f}")
print(f"  Min: {lr_confidence.min():.3f}")
print(f"  Max: {lr_confidence.max():.3f}")
print(f"  Std: {lr_confidence.std():.3f}")

print(f"\nClassification Report:")
print(classification_report(y, lr_predictions))

print(f"\nConfusion Matrix:")
print(confusion_matrix(y, lr_predictions))

# One coefficient per feature, largest first.
print(f"\nModel Coefficients:")
coef_df = pd.DataFrame({'feature': feature_cols, 'coefficient': lr_model.coef_[0]})
coef_df = coef_df.sort_values(by='coefficient', ascending=False)
print(coef_df.to_string())


RESULTS:
Accuracy: 87.0% (20/23)

Confidence Statistics:
  Mean: 0.824
  Min: 0.545
  Max: 0.995
  Std: 0.124

Classification Report:
              precision    recall  f1-score   support

      select       1.00      0.75      0.86        12
        text       0.79      1.00      0.88        11

    accuracy                           0.87        23
   macro avg       0.89      0.88      0.87        23
weighted avg       0.90      0.87      0.87        23


Confusion Matrix:
[[ 9  3]
 [ 0 11]]

Model Coefficients:
              feature  coefficient
3      keyword_select     0.000000
4         is_required     0.000000
2  is_yes_no_question    -0.721437
0       options_count    -0.745593
1         has_options    -0.745593
5        label_length    -0.803623


In [54]:
# 5-fold stratified cross-validation of the logistic-regression model.
#
# The scaler is placed inside the pipeline and the raw feature columns are
# passed in, so the scaling statistics are re-estimated from each training
# fold only. Scoring on a matrix that was standardised with all rows would
# let the held-out rows influence preprocessing.
# cross_val_score clones the pipeline, so the already fitted lr_model is
# left untouched.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = make_pipeline(StandardScaler(), lr_model)
lr_cv_scores = cross_val_score(cv_pipeline, df[feature_cols], y, cv=skf, scoring='accuracy')

print(f"\nLogistic Regression CV Scores:")
# Loop over whatever number of folds was produced instead of indexing 0..4.
for fold_number, fold_score in enumerate(lr_cv_scores, start=1):
    print(f"  Fold {fold_number}: {fold_score*100:.1f}%")
print(f"\nMean CV Accuracy: {lr_cv_scores.mean()*100:.1f}%")
print(f"Std: {lr_cv_scores.std()*100:.1f}%")

# Compare accuracy on the fitted rows with the cross-validated estimate.
print(f"\nINTERPRETATION:")
print(f"  Training accuracy: {lr_accuracy*100:.1f}%")
print(f"  Cross-validation (realistic): {lr_cv_scores.mean()*100:.1f}%")
if lr_accuracy > lr_cv_scores.mean():
    gap = (lr_accuracy - lr_cv_scores.mean())*100
    print(f"  Overfitting gap: {gap:.1f}%")
    if gap > 5:
        print(f"Some overfitting, but less severe than trees")
    else:
        print(f"Good generalization!")


Logistic Regression CV Scores:
  Fold 1: 100.0%
  Fold 2: 60.0%
  Fold 3: 80.0%
  Fold 4: 100.0%
  Fold 5: 75.0%

Mean CV Accuracy: 83.0%
Std: 15.4%

INTERPRETATION:
  Training accuracy: 87.0%
  Cross-validation (realistic): 83.0%
  Overfitting gap: 4.0%
Good generalization!


In [61]:
print(f"\nLogistic Regression Confidence Quality:")
print(f"\nConfidence Distribution:")

# Check calibration: do high-confidence predictions have high accuracy?
# NOTE: lr_confidence comes from predictions on the same rows the model was
# fitted on, so the per-bin accuracies below are optimistic.
bins = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Per-row correctness as a plain boolean array, computed once.
is_hit = np.asarray(lr_predictions == y)

for low, high in zip(bins[:-1], bins[1:]):
    # Bins are half-open [low, high), except the last one, which is closed on
    # the right so that a confidence of exactly 1.0 is still counted.
    if high == bins[-1]:
        below_upper = lr_confidence <= high
    else:
        below_upper = lr_confidence < high
    mask = (lr_confidence >= low) & below_upper
    count = mask.sum()
    if count > 0:
        correct = is_hit[mask].sum()
        acc = correct / count
        print(f"  {low:.1f}-{high:.1f}: {count:2d} preds, accuracy {acc*100:5.0f}%")

# Summary row for all predictions with confidence of at least 0.9.
high_conf_mask = lr_confidence >= 0.9
high_conf = high_conf_mask.sum()
if high_conf > 0:
    high_conf_correct = is_hit[high_conf_mask].sum()
    print(f"  ≥0.9: {high_conf:2d} preds, accuracy {high_conf_correct/high_conf*100:5.0f}%")



Logistic Regression Confidence Quality:

Confidence Distribution:
  0.5-0.6:  2 preds, accuracy    50%
  0.6-0.7:  2 preds, accuracy    50%
  0.7-0.8:  2 preds, accuracy   100%
  0.8-0.9: 11 preds, accuracy    91%
  0.9-1.0:  6 preds, accuracy   100%
  ≥0.9:  6 preds, accuracy   100%


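KEY INSIGHT:<br>
Higher confidence goes with higher accuracy here (bins at 0.7 and above are 91–100% accurate, bins below 0.7 only 50%), which suggests the scores are reasonably well calibrated.<br>
This is the property we want in confidence scores.<br>
Caveat: these figures come from predictions on the 23 training rows, with as few as 2 predictions per bin, so they should be confirmed on held-out data.<br>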

In [60]:
# Phase 1 (Baseline): compare the baseline predictions with the ground truth
# once and derive both the rate and the count from the same mask.
phase1_hits = df['predicted_kind'] == df['true_kind']
phase1_accuracy = phase1_hits.mean()
phase1_correct = phase1_hits.sum()

# Phase 2 (Rules)
phase2_accuracy = df['rule_correct'].mean()
phase2_correct = df['rule_correct'].sum()

# Denominator for the "n/total" counts, taken from the data rather than
# hard-coded so the report stays correct if the dataset changes size.
n_fields = len(df)

# Phase 3 Models
print(f"\nACCURACY PROGRESSION:")
print(f"  Phase 1 (Baseline):        {phase1_accuracy*100:6.1f}% ({int(phase1_correct)}/{n_fields})")
print(f"  Phase 2 (Rules):           {phase2_accuracy*100:6.1f}% ({int(phase2_correct)}/{n_fields})")
print(f"  Phase 3 (Logistic Reg):    {lr_accuracy*100:6.1f}% ({int(lr_correct)}/{n_fields})")

print(f"\nCONFIDENCE QUALITY:")
print(f"  Logistic Regression:       {lr_confidence.min():.2f}-{lr_confidence.max():.2f} (well-calibrated)")




ACCURACY PROGRESSION:
  Phase 1 (Baseline):          73.9% (17/23)
  Phase 2 (Rules):             87.0% (20/23)
  Phase 3 (Logistic Reg):      87.0% (20/23)

CONFIDENCE QUALITY:
  Logistic Regression:       0.54-1.00 (well-calibrated)


In [63]:
# Build the results table: the original columns plus the model's prediction,
# its confidence, and whether the prediction matches the true label.
results_df = df.copy()
results_df = results_df.assign(
    lr_prediction=lr_predictions,
    lr_confidence=lr_confidence,
    lr_correct=(lr_predictions == y),
)

# One probability column per class, named after the class label.
class_labels = lr_model.classes_
for label, class_proba in zip(class_labels, lr_proba.T):
    results_df[f'lr_proba_{label}'] = class_proba

# Persist the table next to the notebook, without the index column.
results_df.to_csv('logistic_regression_results.csv', index=False)