# Step 2 – Feature Engineering

Basic imports


In [17]:
import pandas as pd, numpy as np
train = pd.read_csv('../data/train.csv')
test  = pd.read_csv('../data/test.csv')

Handle literal 

In [18]:
for col in ['ethnicity', 'relation']:
    for df in (train, test):
        df[col] = df[col].astype(str).replace('?', np.nan)

Build feature matrix

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# Columns to use
num_cols   = ['age', 'result']          # we’ll test with/without 'result'
cat_cols   = ['gender','ethnicity','jaundice','austim','contry_of_res','used_app_before','relation']
a_cols     = [f'A{i}_Score' for i in range(1,11)]

X = train[a_cols + num_cols + cat_cols]
y = train['Class/ASD']

pre = ColumnTransformer([
        ('a', 'passthrough', a_cols),
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
      ])

model = Pipeline([('prep', pre),
                  ('clf',  LogisticRegression(max_iter=1000, class_weight='balanced'))])

# 80/20 stratified split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

 Fit & score

In [20]:
model.fit(X_tr, y_tr)
pred = model.predict_proba(X_val)[:,1]
auc_score = roc_auc_score(y_val, pred)
print(f'Validation AUC: {auc_score:.3f}')

Validation AUC: 0.900


Remove leaking result

In [21]:
# Create dataset without the 'result' column
X_no_result = train[a_cols + ['age'] + cat_cols]

# Create new pipeline without 'result' feature
pre2 = ColumnTransformer([
        ('a', 'passthrough', a_cols),
        ('num', 'passthrough', ['age']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
      ])

model2 = Pipeline([('prep', pre2),
                   ('clf',  LogisticRegression(max_iter=1000, class_weight='balanced'))])

# Split data properly
X_tr2, X_val2, y_tr2, y_val2 = train_test_split(X_no_result, y, 
                                                test_size=0.2, 
                                                random_state=42, 
                                                stratify=y)

# Fit and evaluate model
model2.fit(X_tr2, y_tr2)
pred2 = model2.predict_proba(X_val2)[:,1]
auc_score2 = roc_auc_score(y_val2, pred2)
print(f'AUC (no result): {auc_score2:.3f}')

AUC (no result): 0.917


In [22]:
auc = roc_auc_score(y_val, pred2)
print('New AUC (no result):', round(auc, 3))

New AUC (no result): 0.917


Fairness snapshot

In [23]:
import pandas as pd
from sklearn.metrics import roc_auc_score
val_df = X_val.copy()
val_df['true'] = y_val
val_df['prob'] = pred2
for grp in ['gender','ethnicity']:
    print('\nAUC by', grp)
    print(val_df.groupby(grp).apply(lambda d: roc_auc_score(d['true'], d['prob'])).round(3))


AUC by gender
gender
f    0.896
m    0.920
dtype: float64

AUC by ethnicity
ethnicity
Asian              1.000
Black                NaN
Hispanic             NaN
Latino               NaN
Middle Eastern     0.947
Others             0.667
Pasifika           0.562
South Asian        0.750
Turkish              NaN
White-European     0.806
dtype: float64


  print(val_df.groupby(grp).apply(lambda d: roc_auc_score(d['true'], d['prob'])).round(3))
  print(val_df.groupby(grp).apply(lambda d: roc_auc_score(d['true'], d['prob'])).round(3))


In [24]:
auc = roc_auc_score(y_val, pred2)
print('New AUC (no result):', round(auc, 3))

New AUC (no result): 0.917


In [25]:
val_df['ethnicity'].value_counts()

ethnicity
White-European     54
Middle Eastern     20
Asian              15
Pasifika           10
South Asian         9
Black               8
Others              7
Latino              2
Hispanic            1
Turkish             1
Name: count, dtype: int64

collapse rare ethnicities

In [26]:
for df in (train, test):
    mask = df['ethnicity'].isin(['Hispanic','Latino','Turkish','others']) | \
           (df['ethnicity'].isna()) | \
           (df['ethnicity'] == '?')
    df['eth_group'] = df['ethnicity'].where(~mask, 'Other')

# sanity counts on full train
print('Full-train counts:')
print(train['eth_group'].value_counts())

Full-train counts:
eth_group
White-European     257
Other              237
Middle Eastern      97
Asian               67
Black               47
South Asian         34
Pasifika            32
Others              29
Name: count, dtype: int64


 final grouping

In [27]:
for df in (train, test):
    df['eth_group'] = df['ethnicity'].astype(str).replace({'?':'Other','others':'Other'})
    # combine the two Others
    df['eth_group'] = np.where(df['eth_group'].isin(['Others','Other']),'Other',df['eth_group'])

In [28]:
val_df['eth_group'] = val_df['ethnicity'].astype(str).replace({'?':'Other','others':'Other'})
val_df['eth_group'] = np.where(val_df['eth_group'].isin(['Others','Other']),'Other',val_df['eth_group'])
print('\nAUC by eth_group')
print(val_df.groupby('eth_group').apply(lambda d: roc_auc_score(d['true'], d['prob'])).round(3))


AUC by eth_group
eth_group
Asian              1.000
Black                NaN
Hispanic             NaN
Latino               NaN
Middle Eastern     0.947
Other              0.667
Pasifika           0.562
South Asian        0.750
Turkish              NaN
White-European     0.806
nan                  NaN
dtype: float64


  print(val_df.groupby('eth_group').apply(lambda d: roc_auc_score(d['true'], d['prob'])).round(3))


full-train confusion-matrix counts by group

In [29]:
from sklearn.metrics import confusion_matrix

# fit on full train, predict on full train (quick sanity)
model2.fit(X_no_result, y)
y_pred_full = model2.predict(X_no_result)

def cm_stats(group_col, group_val):
    mask = train[group_col] == group_val
    cm = confusion_matrix(y[mask], y_pred_full[mask])
    tn, fp, fn, tp = cm.ravel()
    return {'N': mask.sum(), 'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
            'Recall': tp/(tp+fn+1e-9), 'FPR': fp/(fp+tn+1e-9)}

# eth_group counts
stats = pd.DataFrame([cm_stats('eth_group', g) for g in train['eth_group'].unique()])
stats.index = train['eth_group'].unique()
stats.round(2)

Unnamed: 0,N,TP,FP,FN,TN,Recall,FPR
,203,4,4,3,192,0.57,0.02
White-European,257,116,70,5,66,0.96,0.51
Middle Eastern,97,5,4,1,87,0.83,0.04
Pasifika,32,6,8,0,18,1.0,0.31
Black,47,5,6,1,35,0.83,0.15
Other,32,1,3,1,27,0.5,0.1
Hispanic,9,2,2,0,5,1.0,0.29
Asian,67,3,2,1,61,0.75,0.03
Turkish,5,0,1,0,4,0.0,0.2
South Asian,34,2,1,1,30,0.67,0.03


Interpretation of the full-train table

- **White-European** dominates (257 rows) but has **very high FPR (0.51)** → many false alarms.  
- **Small groups** (Turkish, Other, South-Asian) have **≤ 5 ASD cases** → metrics unstable.  
- **Pasifika & Hispanic** show **perfect recall** but tiny N, so not reliable.

Decision  
Because sample sizes are **too small for stable per-ethnicity thresholds**, we will:

1. **Keep a single global model** (no per-group tuning).  
2. **Flag small-group uncertainty** in the interface.  
3. **Proceed to final model training + calibration** on the entire train set.


# Step 3 – Final Model & Calibration

full-pipeline with calibration

In [30]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

# Re-fit on full training data
model2.fit(X_no_result, y)

# Calibrate via Platt scaling
cal = CalibratedClassifierCV(model2, method='sigmoid', cv=5)
cal.fit(X_no_result, y)

# Predict probabilities on training data for sanity
proba_train = cal.predict_proba(X_no_result)[:,1]
brier = brier_score_loss(y, proba_train)
print(f'Brier score (lower = better): {brier:.3f}')

Brier score (lower = better): 0.081


# Save Calibrated Model

In [None]:
import os
import joblib

# Create models directory if it doesn't exist
models_dir = os.path.join(os.path.dirname(os.path.abspath('__file__')), '..', 'models')
os.makedirs(models_dir, exist_ok=True)

# Save the calibrated model
model_path = os.path.join(models_dir, 'calibrated_lr.pkl')
joblib.dump(cal, model_path)
print(f'Model saved to {model_path}')

Model saved to models/calibrated_lr.pkl
