In [None]:
import os

import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

from scripts.model_training import train_logistic_regression

In [None]:
model_prefix = 'lr_clf'

In [None]:
training_filepath_X = '/app/datamart/gold/train_view/gold_train_2023-11-01_30dpd_6_ready_X.parquet'
training_filepath_Y = '/app/datamart/gold/train_view/gold_train_2023-11-01_30dpd_6_ready_Y.parquet'
validation_filepath_X = '/app/datamart/gold/validation_view/gold_valid_2023-11-01_30dpd_6_ready_X.parquet'
validation_filepath_Y = '/app/datamart/gold/validation_view/gold_valid_2023-11-01_30dpd_6_ready_Y.parquet'
testing_filepath_X = '/app/datamart/gold/test_view/gold_test_2023-11-01_30dpd_6_ready_X.parquet'
testing_filepath_Y = '/app/datamart/gold/test_view/gold_test_2023-11-01_30dpd_6_ready_Y.parquet'
oot_filepath_X = '/app/datamart/gold/oot_view/gold_oot_2023-11-01_30dpd_6_ready_X.parquet'
oot_filepath_Y = '/app/datamart/gold/oot_view/gold_oot_2023-11-01_30dpd_6_ready_Y.parquet'

current_date = '2023-11-01'

In [None]:
df_train_X = pd.read_parquet(training_filepath_X)
df_train_Y = pd.read_parquet(training_filepath_Y)
df_val_X = pd.read_parquet(validation_filepath_X)
df_val_Y = pd.read_parquet(validation_filepath_Y)
df_test_X = pd.read_parquet(testing_filepath_X)
df_test_Y = pd.read_parquet(testing_filepath_Y)
df_oot_X = pd.read_parquet(oot_filepath_X)
df_oot_Y = pd.read_parquet(oot_filepath_Y)

In [114]:
# Meaningless to maintain differentiation between training and validation because we will be using Search CV.
df_train_val_X = pd.concat([df_train_X, df_val_X], axis=0, ignore_index=True)
df_train_val_Y = pd.concat([df_train_Y, df_val_Y], axis=0, ignore_index=True)

In [115]:
lr_clf = LogisticRegression(solver='liblinear', tol=1e-4, fit_intercept=True, class_weight='balanced', random_state=42)

# Hyperparameter space
param_distributions = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 100, 1000],  # Regularization strength
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 500, 1000, 10000]
}

# Set up the random search with cross-validation
random_search = RandomizedSearchCV(
    estimator=lr_clf,
    param_distributions=param_distributions,
    scoring=metrics.make_scorer(metrics.recall_score),
    n_iter=100,  # Number of iterations for random search
    cv=5,       # Number of folds in cross-validation
    random_state=42,
    n_jobs=-1
)

In [116]:
# Perform the random search
random_search.fit(df_train_val_X, df_train_val_Y.values.ravel())



0,1,2
,estimator,LogisticRegre...r='liblinear')
,param_distributions,"{'C': [0.0001, 0.001, ...], 'max_iter': [100, 200, ...], 'penalty': ['l1', 'l2']}"
,n_iter,100
,scoring,make_scorer(r...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.001
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,100


In [111]:
# Output the best parameters and best score
print("Logistic Regression: Best parameters: ", random_search.best_params_)
print("Logistic Regression: Best Recall: ", random_search.best_score_)

# Evaluate the model on the train set
best_model = random_search.best_estimator_

y_pred = best_model.predict(df_train_val_X)
train_f1_score = metrics.f1_score(df_train_val_Y.values.ravel(), y_pred)
train_auc_score = metrics.roc_auc_score(df_train_val_Y.values.ravel(), y_pred)
print("Logistic Regression: Train Recall score: ", random_search.best_score_)
print("Logistic Regression: Train F1 score: ", train_f1_score)
print("Logistic Regression: Train GINI score: ", round(2*train_auc_score-1,3))


y_pred = best_model.predict(df_test_X)
test_recall_score = metrics.recall_score(df_test_Y.values.ravel(), y_pred)
test_f1_score = metrics.f1_score(df_test_Y.values.ravel(), y_pred)
test_auc_score = metrics.roc_auc_score(df_test_Y.values.ravel(), y_pred)
print("Logistic Regression: Test Recall score: ", test_recall_score)
print("Logistic Regression: Test F1 score: ", test_f1_score)
print("Logistic Regression: Test GINI score: ", round(2*test_auc_score-1,3))

y_pred = best_model.predict(df_oot_X)
oot_recall_score = metrics.recall_score(df_oot_Y.values.ravel(), y_pred)
oot_f1_score = metrics.f1_score(df_oot_Y.values.ravel(), y_pred)
oot_auc_score = metrics.roc_auc_score(df_oot_Y.values.ravel(), y_pred)
print("Logistic Regression: OOT Recall score: ", oot_recall_score)
print("Logistic Regression: OOT F1 score: ", oot_f1_score)
print("Logistic Regression: OOT GINI score: ", round(2*oot_auc_score-1,3))

Logistic Regression: Best parameters:  {'penalty': 'l1', 'max_iter': 100, 'C': 0.001}
Logistic Regression: Best Recall:  0.702221830365015
Logistic Regression: Train Recall score:  0.702221830365015
Logistic Regression: Train F1 score:  0.5282758620689655
Logistic Regression: Train GINI score:  0.31
Logistic Regression: Test Recall score:  0.7457627118644068
Logistic Regression: Test F1 score:  0.5751633986928104
Logistic Regression: Test GINI score:  0.403
Logistic Regression: OOT Recall score:  0.794392523364486
Logistic Regression: OOT F1 score:  0.5501618122977346
Logistic Regression: OOT GINI score:  0.348


In [None]:
# Save to model bank
filename = model_prefix + '_' + current_date + '_' + 'label_version_suffix' + '.joblib'
filepath = os.path.join(model_bank_directory, filename)
joblib.dump(best_model, filepath)

In [55]:
results = {
    'run_date': [current_date],
    'model_type': ['Logistic Regression'],
    'label_version' : ['label_version'],
    'model_params': [random_search.best_params_],
    'train_X_path': [training_filepath_X],
    'train_Y_path': [training_filepath_Y],
    'val_X_path': [validation_filepath_X],
    'val_Y_path': [validation_filepath_Y],
    'test_X_path': [testing_filepath_X],
    'test_Y_path': [testing_filepath_Y],
    'oot_X_path': [oot_filepath_X],
    'oot_Y_path': [oot_filepath_Y],
    'train_recall' : [random_search.best_score_],
    'train_f1' : [train_f1_score],
    'train_gini' : [round(2*train_auc_score-1,3)],
    'test_recall' : [test_recall_score],
    'test_f1' : [test_f1_score],
    'test_gini' : [round(2*test_auc_score-1,3)],
    'oot_recall' : [oot_recall_score],
    'oot_f1' : [oot_f1_score],
    'oot_gini' : [round(2*oot_auc_score-1,3)]
}

df_results = pd.DataFrame(results)

In [56]:
df_results

Unnamed: 0,run_date,model_type,label_version,model_params,train_X_path,train_Y_path,val_X_path,val_Y_path,test_X_path,test_Y_path,...,oot_Y_path,train_recall,train_f1,train_gini,test_recall,test_f1,test_gini,oot_recall,oot_f1,oot_gini
0,2023-11-01,Logistic Regression,label_version,"{'penalty': 'l1', 'max_iter': 100, 'C': 0.001}",/app/datamart/gold/train_view/gold_train_2023-...,/app/datamart/gold/train_view/gold_train_2023-...,/app/datamart/gold/validation_view/gold_valid_...,/app/datamart/gold/validation_view/gold_valid_...,/app/datamart/gold/test_view/gold_test_2023-11...,/app/datamart/gold/test_view/gold_test_2023-11...,...,/app/datamart/gold/oot_view/gold_oot_2023-11-0...,0.702222,0.528276,0.31,0.745763,0.575163,0.403,0.794393,0.550162,0.348


In [57]:
all_results = df_results
all_results = pd.concat([all_results, df_results], axis=0, ignore_index=True)

In [58]:
all_results

Unnamed: 0,run_date,model_type,label_version,model_params,train_X_path,train_Y_path,val_X_path,val_Y_path,test_X_path,test_Y_path,...,oot_Y_path,train_recall,train_f1,train_gini,test_recall,test_f1,test_gini,oot_recall,oot_f1,oot_gini
0,2023-11-01,Logistic Regression,label_version,"{'penalty': 'l1', 'max_iter': 100, 'C': 0.001}",/app/datamart/gold/train_view/gold_train_2023-...,/app/datamart/gold/train_view/gold_train_2023-...,/app/datamart/gold/validation_view/gold_valid_...,/app/datamart/gold/validation_view/gold_valid_...,/app/datamart/gold/test_view/gold_test_2023-11...,/app/datamart/gold/test_view/gold_test_2023-11...,...,/app/datamart/gold/oot_view/gold_oot_2023-11-0...,0.702222,0.528276,0.31,0.745763,0.575163,0.403,0.794393,0.550162,0.348
1,2023-11-01,Logistic Regression,label_version,"{'penalty': 'l1', 'max_iter': 100, 'C': 0.001}",/app/datamart/gold/train_view/gold_train_2023-...,/app/datamart/gold/train_view/gold_train_2023-...,/app/datamart/gold/validation_view/gold_valid_...,/app/datamart/gold/validation_view/gold_valid_...,/app/datamart/gold/test_view/gold_test_2023-11...,/app/datamart/gold/test_view/gold_test_2023-11...,...,/app/datamart/gold/oot_view/gold_oot_2023-11-0...,0.702222,0.528276,0.31,0.745763,0.575163,0.403,0.794393,0.550162,0.348


In [117]:
import xgboost as xgb

In [121]:
xgb_clf = xgb.XGBClassifier(n_jobs=-1, random_state=42)

y = df_train_val_Y.values.ravel()
scale_pos_weight = np.sum(y == 0) / np.sum(y == 1)

# Hyperparameter space
param_distributions = {
    'n_estimators': [25, 50, 75, 100, 200],
    'scale_pos_weight': [scale_pos_weight, scale_pos_weight*2, scale_pos_weight*3, scale_pos_weight*4, scale_pos_weight*0.7, scale_pos_weight*0.5, scale_pos_weight*0.2],
    'max_depth': [2, 3, 5, 7, 9, 15, 20],  # lower max_depth to simplify the model
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.5, 1, 3, 5],
    'min_child_weight': [1, 3, 5, 7, 9],
    'reg_alpha': [0, 0.1, 0.5, 0.7, 1],
    'reg_lambda': [0.1, 0.5, 1, 1.5, 2, 5]
}

# Set up the random search with cross-validation
xgb_clf_random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_distributions,
    scoring=metrics.make_scorer(metrics.f1_score),
    n_iter=100,  # Number of iterations for random search
    cv=5,       # Number of folds in cross-validation
    random_state=42,
    n_jobs=-1
)

In [122]:
xgb_clf_random_search.fit(df_train_val_X, df_train_val_Y.values.ravel())

0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': [0.6, 0.7, ...], 'gamma': [0, 0.1, ...], 'learning_rate': [0.01, 0.05, ...], 'max_depth': [2, 3, ...], ...}"
,n_iter,100
,scoring,make_scorer(f...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6
,device,
,early_stopping_rounds,
,enable_categorical,False


In [120]:
# Output the best parameters and best score
print("Logistic Regression: Best parameters: ", xgb_clf_random_search.best_params_)
print("Logistic Regression: Best Recall: ", xgb_clf_random_search.best_score_)

# Evaluate the model on the train set
best_model = xgb_clf_random_search.best_estimator_

y_pred = best_model.predict(df_train_val_X)
train_f1_score = metrics.f1_score(df_train_val_Y.values.ravel(), y_pred)
train_auc_score = metrics.roc_auc_score(df_train_val_Y.values.ravel(), y_pred)
print("Logistic Regression: Train Recall score: ", xgb_clf_random_search.best_score_)
print("Logistic Regression: Train F1 score: ", train_f1_score)
print("Logistic Regression: Train GINI score: ", round(2*train_auc_score-1,3))


y_pred = best_model.predict(df_test_X)
test_recall_score = metrics.recall_score(df_test_Y.values.ravel(), y_pred)
test_f1_score = metrics.f1_score(df_test_Y.values.ravel(), y_pred)
test_auc_score = metrics.roc_auc_score(df_test_Y.values.ravel(), y_pred)
print("Logistic Regression: Test Recall score: ", test_recall_score)
print("Logistic Regression: Test F1 score: ", test_f1_score)
print("Logistic Regression: Test GINI score: ", round(2*test_auc_score-1,3))

y_pred = best_model.predict(df_oot_X)
oot_recall_score = metrics.recall_score(df_oot_Y.values.ravel(), y_pred)
oot_f1_score = metrics.f1_score(df_oot_Y.values.ravel(), y_pred)
oot_auc_score = metrics.roc_auc_score(df_oot_Y.values.ravel(), y_pred)
print("Logistic Regression: OOT Recall score: ", oot_recall_score)
print("Logistic Regression: OOT F1 score: ", oot_f1_score)
print("Logistic Regression: OOT GINI score: ", round(2*oot_auc_score-1,3))

Logistic Regression: Best parameters:  {'subsample': 0.8, 'scale_pos_weight': 2.449438202247191, 'reg_lambda': 0.1, 'reg_alpha': 0, 'n_estimators': 25, 'min_child_weight': 9, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 1, 'colsample_bytree': 0.6}
Logistic Regression: Best Recall:  0.6094855173305744
Logistic Regression: Train Recall score:  0.6094855173305744
Logistic Regression: Train F1 score:  0.6170940170940171
Logistic Regression: Train GINI score:  0.466
Logistic Regression: Test Recall score:  0.7627118644067796
Logistic Regression: Test F1 score:  0.6870229007633588
Logistic Regression: Test GINI score:  0.578
Logistic Regression: OOT Recall score:  0.7476635514018691
Logistic Regression: OOT F1 score:  0.6694560669456067
Logistic Regression: OOT GINI score:  0.549


In [80]:
from sklearn.svm import SVC

In [92]:
svc_clf = SVC(max_iter=100000, class_weight='balanced', gamma='auto', probability=True, random_state=42)

# Hyperparameter space
param_distributions = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4, 5],
    'coef0': [0, 0.01, 0.1, 0.3, 0.5, 0.7, 1]
}

# Set up the random search with cross-validation
svm_clf_random_search = RandomizedSearchCV(
    estimator=svc_clf,
    param_distributions=param_distributions,
    scoring=metrics.make_scorer(metrics.recall_score),
    n_iter=100,  # Number of iterations for random search
    cv=5,       # Number of folds in cross-validation
    random_state=42,
    n_jobs=-1
)

In [93]:
svm_clf_random_search.fit(df_train_val_X, df_train_val_Y.values.ravel())



0,1,2
,estimator,SVC(class_wei...ndom_state=42)
,param_distributions,"{'C': [0.001, 0.01, ...], 'coef0': [0, 0.01, ...], 'degree': [2, 3, ...], 'kernel': ['linear', 'rbf', ...]}"
,n_iter,100
,scoring,make_scorer(r...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,C,0.1
,kernel,'rbf'
,degree,2
,gamma,'auto'
,coef0,0.01
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [94]:
# Output the best parameters and best score
print("Logistic Regression: Best parameters: ", svm_clf_random_search.best_params_)
print("Logistic Regression: Best Recall: ", svm_clf_random_search.best_score_)

# Evaluate the model on the train set
best_model = svm_clf_random_search.best_estimator_

y_pred = best_model.predict(df_train_val_X)
train_f1_score = metrics.f1_score(df_train_val_Y.values.ravel(), y_pred)
train_auc_score = metrics.roc_auc_score(df_train_val_Y.values.ravel(), y_pred)
print("Logistic Regression: Train Recall score: ", xgb_clf_random_search.best_score_)
print("Logistic Regression: Train F1 score: ", train_f1_score)
print("Logistic Regression: Train GINI score: ", round(2*train_auc_score-1,3))


y_pred = best_model.predict(df_test_X)
test_recall_score = metrics.recall_score(df_test_Y.values.ravel(), y_pred)
test_f1_score = metrics.f1_score(df_test_Y.values.ravel(), y_pred)
test_auc_score = metrics.roc_auc_score(df_test_Y.values.ravel(), y_pred)
print("Logistic Regression: Test Recall score: ", test_recall_score)
print("Logistic Regression: Test F1 score: ", test_f1_score)
print("Logistic Regression: Test GINI score: ", round(2*test_auc_score-1,3))

y_pred = best_model.predict(df_oot_X)
oot_recall_score = metrics.recall_score(df_oot_Y.values.ravel(), y_pred)
oot_f1_score = metrics.f1_score(df_oot_Y.values.ravel(), y_pred)
oot_auc_score = metrics.roc_auc_score(df_oot_Y.values.ravel(), y_pred)
print("Logistic Regression: OOT Recall score: ", oot_recall_score)
print("Logistic Regression: OOT F1 score: ", oot_f1_score)
print("Logistic Regression: OOT GINI score: ", round(2*oot_auc_score-1,3))

Logistic Regression: Best parameters:  {'kernel': 'rbf', 'degree': 2, 'coef0': 0.01, 'C': 0.1}
Logistic Regression: Best Recall:  1.0
Logistic Regression: Train Recall score:  0.5323489010989011
Logistic Regression: Train F1 score:  0.0
Logistic Regression: Train GINI score:  0.0
Logistic Regression: Test Recall score:  0.0
Logistic Regression: Test F1 score:  0.0
Logistic Regression: Test GINI score:  0.0
Logistic Regression: OOT Recall score:  0.0
Logistic Regression: OOT F1 score:  0.0
Logistic Regression: OOT GINI score:  0.0
