In [1]:
from preprocess_income_data import preprocess_data

# Unpack the six values returned by preprocess_data() (presumably the
# train/test/validation feature and label splits — TODO confirm).
# NOTE(review): the features unpack in the order (train, test, val) but the
# labels unpack as (train, val, test). Confirm this matches the return order
# of preprocess_data(); if it does not, X_test/y_test and X_val/y_val are
# mispaired and every test-set metric computed from them is invalid.
X_train, X_test, X_val, y_train, y_val, y_test = preprocess_data()

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from const import N_SPLITS, RANDOM_STATE, DECISION_TREE, DECISION_TREE_ID

# Exhaustive grid search over DecisionTreeClassifier hyperparameters, scored
# by accuracy with shuffled 20-fold cross-validation on the training split.
ALGORITHM_NAME = DECISION_TREE
ALGORITHM_ID = DECISION_TREE_ID

# Seeded, shuffled folds so the fold assignment is reproducible across runs.
# NOTE(review): N_SPLITS is imported from const but the fold count is
# hard-coded here — confirm which value is intended.
kf = KFold(n_splits=20, random_state=RANDOM_STATE, shuffle=True)

# Candidate values for each hyperparameter.
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# split criterion (Shannon information gain), so one third of the grid may be
# duplicate work — confirm before dropping either, since it changes the number
# of rows in cv_results_.
criterion = ["gini", "entropy", "log_loss"]
max_depth = [2, 3, 4, 5, 8, 10, 25, None]
# Float grids 0.1, 0.2, ..., 1.0 and 0.1, 0.2, ..., 0.5; scikit-learn treats
# float values for these two parameters as fractions of the sample count.
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)

# Single-step pipeline; the step name doubles as the parameter-grid prefix.
steps = [(ALGORITHM_NAME, DecisionTreeClassifier(random_state=RANDOM_STATE))]
pipeline = Pipeline(steps)

# Pipeline parameters are addressed as "<step name>__<parameter name>".
parameters = {
    f"{ALGORITHM_NAME}__criterion": criterion,
    f"{ALGORITHM_NAME}__max_depth": max_depth,
    f"{ALGORITHM_NAME}__min_samples_split": min_samples_split,
    f"{ALGORITHM_NAME}__min_samples_leaf": min_samples_leaf,
}

# 3 * 8 * 10 * 5 = 1200 candidates x 20 folds = 24,000 fits. n_jobs=-1 spreads
# them over all available cores; the scores are unchanged because both the
# splitter and the estimator are seeded.
cv = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=kf,
    scoring="accuracy",
    n_jobs=-1,
)

cv.fit(X_train, y_train)


In [3]:
# Mean cross-validated score of the best parameter combination found by the
# grid search (the search was configured with scoring="accuracy").
cv.best_score_

0.8193541545060882

In [4]:
from process_cv_results import process_cv_scores, process_best_params, get_classification_metrics, results_to_csv
# Turn the fitted search's cross-validation scores into a DataFrame tagged
# with this algorithm's id, then preview the first rows.
accuracy_scores_df = process_cv_scores(
    cv,
    algorithm_id=ALGORITHM_ID,
)
accuracy_scores_df.head()


Unnamed: 0,algorithm_id,accuracy_score
0,4,0.828729
1,4,0.828729
2,4,0.828729
3,4,0.828729
4,4,0.755064


In [5]:
# Tabulate the winning hyperparameter values from the fitted search, one row
# per hyperparameter, then preview the first rows.
best_params_df = process_best_params(
    cv,
    algorithm_id=ALGORITHM_ID,
    algorithm_name=ALGORITHM_NAME,
)
best_params_df.head()


Unnamed: 0,algorithm_id,hyperparameter_name,best_value
0,4,criterion,gini
1,4,max_depth,2
2,4,min_samples_leaf,0.1
3,4,min_samples_split,0.1


In [6]:
# Predict on the test features with the fitted search object.
y_pred = cv.predict(X_test)

# Summarise test-set performance in a one-row DataFrame.
# NOTE(review): in the recorded output the reported precision (0.707386)
# equals 747 / (747 + 309), so 747 is the true-positive count, yet it is
# listed under `true_negatives` while 5935 is listed under `true_positives`.
# The count columns produced by get_classification_metrics look swapped —
# confirm against the helper's confusion-matrix unpacking.
classification_metrics_df = get_classification_metrics(
    y_test,
    y_pred,
    cv,
    ALGORITHM_ID,
    ALGORITHM_NAME,
)

classification_metrics_df.head()


[[5935  309]
 [1149  747]]

---


Unnamed: 0,algorithm_id,algorithm_name,best_accuracy,precision,recall,roc_auc_score,true_positives,false_positives,false_negatives,true_negatives
0,4,decision_tree,0.819354,0.707386,0.393987,0.67225,5935,309,1149,747


In [7]:
# Hand the three result tables for this algorithm to the CSV export helper.
dfs = [
    accuracy_scores_df,
    best_params_df,
    classification_metrics_df,
]

results_to_csv(dfs, ALGORITHM_ID)
