In [1]:
from preprocess_income_data import preprocess_data 

# Load the preprocessed train / test / validation splits from the project helper.
# NOTE(review): the feature splits are listed train, test, val while the label
# splits are listed train, val, test -- confirm this matches the return order
# of preprocess_data(). X_val / y_val are not used in the cells visible here.
(
    X_train, X_test, X_val,
    y_train, y_val, y_test,
) = preprocess_data()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from xgboost import XGBClassifier
import numpy as np
from const import N_SPLITS, RANDOM_STATE, XGB, XGB_ID

# Identifiers used to tag every result table produced by this notebook.
ALGORITHM_NAME = XGB
ALGORITHM_ID = XGB_ID

# Shuffled K-fold splitter; a fixed random_state keeps the fold assignment
# reproducible between runs.
kf = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# Hyperparameter grid: 4 learning rates x 9 tree depths x 3 subsample ratios
# = 108 candidate combinations, each cross-validated over the `kf` folds.
learning_rate = [0.01, 0.1, 0.5, 0.9]

max_depth = [2, 3, 4, 5, 7, 8, 9, 10, 15]

subsample = [0.3, 0.5, 0.9]

# Single-step pipeline. The step name doubles as the prefix of the grid keys
# below (`<step>__<param>`).
# `random_state` is the documented seed parameter of XGBClassifier's
# scikit-learn API; the `seed` keyword is the legacy spelling.
steps = [(ALGORITHM_NAME, XGBClassifier(
    random_state=RANDOM_STATE,
    objective='binary:logistic'
))]

pipeline = Pipeline(steps)

parameters = {
    f"{ALGORITHM_NAME}__learning_rate": learning_rate,
    f"{ALGORITHM_NAME}__max_depth": max_depth,
    f"{ALGORITHM_NAME}__subsample": subsample,
}

# Exhaustive search scored by accuracy; verbose=3 prints per-fit progress.
cv = GridSearchCV(pipeline, param_grid=parameters,
                  cv=kf, scoring="accuracy", verbose=3)

# Fit on the training split only; the test split is held out for the
# evaluation cell further down.
cv.fit(X_train, y_train)


In [3]:
# Mean cross-validated accuracy (scoring="accuracy") of the best parameter
# combination found by the grid search.
cv.best_score_

0.871533475538925

In [4]:
from process_cv_results import process_cv_scores, process_best_params, get_classification_metrics, results_to_csv
# Collect accuracy scores from the fitted search into a DataFrame tagged with
# this algorithm's id. The helper lives in process_cv_results; which scores it
# extracts from `cv` is not visible from this notebook.
accuracy_scores_df = process_cv_scores(cv, algorithm_id=ALGORITHM_ID)
accuracy_scores_df.head()


Unnamed: 0,algorithm_id,accuracy_score
0,5,0.844205
1,5,0.844389
2,5,0.844389
3,5,0.844942
4,5,0.844758


In [5]:
# Tabulate the winning hyperparameters of the search, tagged with the
# algorithm id (one row per hyperparameter, per the output shown below).
best_params_df = process_best_params(
    cv, algorithm_id=ALGORITHM_ID, algorithm_name=ALGORITHM_NAME)
best_params_df.head()


Unnamed: 0,algorithm_id,hyperparameter_name,best_value
0,5,learning_rate,0.1
1,5,max_depth,8.0
2,5,subsample,0.9


In [6]:
# Predict hard class labels on the held-out test split with the fitted search
# (GridSearchCV.predict delegates to the refitted best estimator), then let the
# project helper derive the classification metrics.
# NOTE(review): in the recorded output, precision 0.769 = 1232 / (1232 + 370),
# so 1232 is the true-positive count and 5874 the true-negative count -- the
# helper's true_positives / true_negatives column labels look swapped; verify
# in process_cv_results.
# NOTE(review): the reported roc_auc_score equals (recall + specificity) / 2,
# which is what results from scoring hard predictions rather than predicted
# probabilities -- confirm this is intended.
# NOTE(review): best_accuracy matches cv.best_score_ (a cross-validation score),
# not accuracy on this test split.
y_pred = cv.predict(X_test)
classification_metrics_df = get_classification_metrics(
    y_test, y_pred, cv, ALGORITHM_ID, ALGORITHM_NAME)

classification_metrics_df.head()


[[5874  370]
 [ 664 1232]]

---


Unnamed: 0,algorithm_id,algorithm_name,best_accuracy,precision,recall,roc_auc_score,true_positives,false_positives,false_negatives,true_negatives
0,5,xgboost,0.871533,0.769039,0.649789,0.795266,5874,370,664,1232


In [7]:
# Hand the three result tables (CV scores, best params, test metrics) to the
# project helper for persistence, keyed by the algorithm id. Presumably writes
# CSV files; output location and naming are decided inside results_to_csv.
dfs = [accuracy_scores_df, best_params_df, classification_metrics_df]

results_to_csv(dfs, ALGORITHM_ID)
