Model fine-tune

The baseline model performance(Train+Val)



Fianl feature count 389

Weighted validation metrics (LogReg baseline):
 - accuracy : 0.9548147958276519
 - precision: 0.7283222226373169
 - recall   : 0.4084099405205401
 - f1       : 0.5233493786271743
 - roc_auc  : 0.9442860360723193

Weighted confusion matrix [[TN, FP],[FN, TP]]:
 - [[32035699.94000005   318739.25      ]
 - [ 1237740.37         854486.08      ]]

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
df = pd.read_csv('cleaned_data.csv')

df.head()

Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran_s_admin,veterans_benefits,weeks_worked_in_year,year,label
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,0
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,0
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,0
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,0
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,0


In [3]:
# same train test split strategy as the previous model

y = df['label']
w = df['weight']

X = df.drop(columns=['label','weight'])

# build the test set
test_mask = X['year'] == 95

X_test = X.loc[test_mask].drop(columns=['year'])
y_test = y.loc[test_mask]
w_test = w.loc[test_mask]

# build the train test
trainval_mask = X['year'] == 94

X_94 = X.loc[trainval_mask].drop(columns=["year"])
y_94 = y.loc[trainval_mask]
w_94 = w.loc[trainval_mask]

X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(X_94, y_94, w_94,test_size=0.2,
                                                                  random_state=42,stratify=y_94)

print('Train:',X_train.shape, 'Val:',X_val.shape,'Test(95)', X_test.shape)
print('Train y%', y_train.value_counts(normalize=True).round(4))
print('Val y%', y_val.value_counts(normalize=True).round(4))
print('Test y%', y_test.value_counts(normalize=True).round(4))

Train: (79861, 39) Val: (19966, 39) Test(95) (99696, 39)
Train y% label
0    0.9415
1    0.0585
Name: proportion, dtype: float64
Val y% label
0    0.9415
1    0.0585
Name: proportion, dtype: float64
Test y% label
0    0.9344
1    0.0656
Name: proportion, dtype: float64


Try to tune the decision to max out threshold for F1

In [4]:
numerical_col = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_col = [col for col in X_train.columns if col not in numerical_col]

numerical_tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('sclar', StandardScaler(with_mean=False))
])

cate_tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',  sparse_output=True))
])

preprocess = ColumnTransformer(transformers=[
    ('num',numerical_tran,numerical_col),
    ('cat',cate_tran,categorical_col),],
    remainder='drop')

preprocess.fit(X_train)
n_features = preprocess.transform(X_train.iloc[:5]).shape[1]
print('Fianl feature count', n_features)

log_reg = LogisticRegression()

baseline_model = Pipeline(steps=[
    ('preporcess',preprocess),
    ('logreg',log_reg)
])

baseline_model.fit(X_train,y_train,logreg__sample_weight = w_train)


proba_val = baseline_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.5).astype(int)

print("\nWeighted validation metrics (LogReg baseline):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)

Fianl feature count 389

Weighted validation metrics (LogReg baseline):
accuracy : 0.9548147958276519
precision: 0.7283222226373169
recall   : 0.4084099405205401
f1       : 0.5233493786271743
roc_auc  : 0.9442860360723193

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[32035699.94000005   318739.25      ]
 [ 1237740.37         854486.08      ]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
proba_val = baseline_model.predict_proba(X_val)[:, 1]

thresholds = np.linspace(0.01, 0.99, 99)
f1s = []
for t in thresholds:
    pred = (proba_val >= t).astype(int)
    f1s.append(f1_score(y_val, pred, sample_weight=w_val, zero_division=0))

best_idx = int(np.argmax(f1s))
best_t = float(thresholds[best_idx])
best_f1 = float(f1s[best_idx])

print("Best threshold:", best_t, "Best weighted F1:", best_f1)

Best threshold: 0.28 Best weighted F1: 0.5899422344742793


In [6]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
f1_weighted_scorer = make_scorer(f1_score, greater_is_better=True)


param_grid = {
    'logreg__C': [0.1, 0.3, 1.0, 3.0, 10.0],
    'logreg__penalty': ['l2', 'l1'],
    'logreg__class_weight': [None, 'balanced'],
}


grid = GridSearchCV(estimator=baseline_model,param_grid=param_grid,scoring=f1_weighted_scorer, 
    cv=5,n_jobs=-1,verbose=1)

# IMPORTANT: pass weights so each fold training is weighted
grid.fit(X_train, y_train, logreg__sample_weight=w_train)

print('\nBest params:', grid.best_params_)
print('Best CV F1:', grid.best_score_)

best_model = grid.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Suzreal\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Suzreal\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Suzreal\App


Best params: {'logreg__C': 0.3, 'logreg__class_weight': None, 'logreg__penalty': 'l2'}
Best CV F1: 0.49950140658498005


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
proba_val = best_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.28).astype(int) # -> the best threshold we find before

print("\nWeighted validation metrics (LogReg with threshold tune):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)


Weighted validation metrics (LogReg with threshold tune):
accuracy : 0.9475604074751891
precision: 0.5621708969064181
recall   : 0.6177203237249969
f1       : 0.5886379733209084
roc_auc  : 0.9442704549355191

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[31347885.73000008  1006553.46      ]
 [  799815.65        1292410.8       ]]


In [8]:
thresholds = np.linspace(0.01, 0.99, 99)
f1s = []
for t in thresholds:
    pred = (proba_val >= t).astype(int)
    f1s.append(f1_score(y_val, pred, sample_weight=w_val, zero_division=0))

best_idx = int(np.argmax(f1s))
best_t = float(thresholds[best_idx])
best_f1 = float(f1s[best_idx])

print("Best threshold:", best_t, "Best weighted F1:", best_f1)

Best threshold: 0.27 Best weighted F1: 0.588675573346752


The best log-reg model

In [9]:
proba_val = best_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.27).astype(int) # -> the best threshold we find before

print("\nWeighted validation metrics (LogReg Best):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)


Weighted validation metrics (LogReg Best):
accuracy : 0.9468017401407912
precision: 0.5549586715196956
recall   : 0.626754479659695
f1       : 0.588675573346752
roc_auc  : 0.9442704549355191

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[31302850.67000009  1051588.52      ]
 [  780914.15        1311312.3       ]]


XGBoost

In [10]:

numerical_col = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_col = [col for col in X_train.columns if col not in numerical_col]

numerical_tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

cate_tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',  sparse_output=True))
])

preprocess = ColumnTransformer(transformers=[
    ('num',numerical_tran,numerical_col),
    ('cat',cate_tran,categorical_col),],
    remainder='drop')

xgb = XGBClassifier(n_estimators=600,learning_rate=0.05,max_depth=6,subsample=0.8,colsample_bytree=0.8,
    reg_lambda=1.0,min_child_weight=1,objective='binary:logistic',eval_metric='logloss',tree_method='hist',
    random_state=42,n_jobs=-1)

xgb_model  = Pipeline(steps=[
    ('preporcess',preprocess),
    ('xgb',xgb)
])

xgb_model.fit(X_train,y_train,xgb__sample_weight=w_train)


proba_val = xgb_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.5).astype(int)

print("\nWeighted validation metrics (xgb baseline):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)


Weighted validation metrics (xgb baseline):
accuracy : 0.9583445801403262
precision: 0.7325156020783501
recall   : 0.49489560749984857
f1       : 0.5907046490446546
roc_auc  : 0.9530064617470841

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[31976341.64000006   378097.55      ]
 [ 1056792.77        1035433.68      ]]


In [11]:
thresholds = np.linspace(0.01, 0.99, 99)
f1s = []
for t in thresholds:
    pred = (proba_val >= t).astype(int)
    f1s.append(f1_score(y_val, pred, sample_weight=w_val, zero_division=0))

best_idx = int(np.argmax(f1s))
best_t = float(thresholds[best_idx])
print('Best threshold (XGB):', best_t)
print('Best weighted F1 (XGB):', float(f1s[best_idx]))

Best threshold (XGB): 0.35000000000000003
Best weighted F1 (XGB): 0.6146306619467286


In [12]:

proba_val = xgb_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.35).astype(int)

print("\nWeighted validation metrics (XGB with threshold tune):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)


Weighted validation metrics (XGB with threshold tune):
accuracy : 0.9552375322443544
precision: 0.644141644612749
recall   : 0.5877052792253922
f1       : 0.6146306619467286
roc_auc  : 0.9530064617470841

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[31675135.35000006   679303.84      ]
 [  862613.92        1229612.53      ]]


Grid for xgb

In [13]:
candidates = [
    {'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8},
    {'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8},
    {'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.8},
    {'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.8},
]

best = None

for params in candidates:
    xgb = XGBClassifier(n_estimators=800,learning_rate=0.05,reg_lambda=1.0,objective='binary:logistic',
        eval_metric='logloss',tree_method='hist',random_state=42,n_jobs=-1,**params)

    model = Pipeline(steps=[
        ('preprocess', preprocess),
        ('xgb', xgb)
    ])

    model.fit(X_train, y_train, xgb__sample_weight=w_train)
    p = model.predict_proba(X_val)[:, 1]

    thresholds = np.linspace(0.01, 0.99, 99)
    f1s = []
    for t in thresholds:
        pred = (p >= t).astype(int)
        f1s.append(f1_score(y_val, pred, sample_weight=w_val, zero_division=0))

    idx = int(np.argmax(f1s))
    t_best = float(thresholds[idx])
    f1_best = float(f1s[idx])

    print('params:', params, '| best_t:', t_best, '| weighted_f1:', f1_best)

    if (best is None) or (f1_best > best['f1']):
        best = {'model': model, 'params': params, 'threshold': t_best, 'f1': f1_best}

print('\nBest XGB config:', best['params'])
print('Best XGB threshold:', best['threshold'])
print('Best XGB weighted F1:', best['f1'])
best_xgb_model = best['model']
best_xgb_threshold = best['threshold']


params: {'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8} | best_t: 0.3 | weighted_f1: 0.6154688239623524
params: {'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8} | best_t: 0.3 | weighted_f1: 0.6117843544607207
params: {'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.8} | best_t: 0.36000000000000004 | weighted_f1: 0.6127647959570288
params: {'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.8} | best_t: 0.29000000000000004 | weighted_f1: 0.603853138445889

Best XGB config: {'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8}
Best XGB threshold: 0.3
Best XGB weighted F1: 0.6154688239623524


In [14]:
proba_val = best_xgb_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >=best_xgb_threshold).astype(int)

print("\nWeighted validation metrics (XGB best):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")


Weighted validation metrics (XGB best):
accuracy : 0.9520629573481121
precision: 0.6001251932845427
recall   : 0.6316176339325035
f1       : 0.6154688239623524
roc_auc  : 0.9538848152081866

Weighted confusion matrix [[TN, FP],[FN, TP]]:


In [15]:
# Refit best model on full 1994 pool
best_xgb_model.fit(X_94, y_94, xgb__sample_weight=w_94)

# Then run the same test evaluation again
proba_test = best_xgb_model.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= best_xgb_threshold).astype(int)

print('\n=== FINAL TEST (1995) METRICS after refit on all 1994 ===')
print('threshold:', best_xgb_threshold)
print('accuracy :', accuracy_score(y_test, pred_test, sample_weight=w_test))
print('precision:', precision_score(y_test, pred_test, sample_weight=w_test, zero_division=0))
print('recall   :', recall_score(y_test, pred_test, sample_weight=w_test, zero_division=0))
print('f1       :', f1_score(y_test, pred_test, sample_weight=w_test, zero_division=0))
print('roc_auc  :', roc_auc_score(y_test, proba_test, sample_weight=w_test))
print('\nWeighted confusion matrix [[TN, FP],[FN, TP]]:')
print(confusion_matrix(y_test, pred_test, sample_weight=w_test))



=== FINAL TEST (1995) METRICS after refit on all 1994 ===
threshold: 0.3
accuracy : 0.9485960213365202
precision: 0.6168715359516521
recall   : 0.626852113732052
f1       : 0.6218217789953105
roc_auc  : 0.9509990638915975

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[1.58314410e+08 4.58476792e+06]
 [4.39423785e+06 7.38189170e+06]]
