In [1]:
from preprocess_income_data import preprocess_data

# Load the train / test / validation splits produced by
# preprocess_income_data.preprocess_data().
# NOTE(review): the feature splits are unpacked as train/test/val while the
# label splits are unpacked as train/val/test -- confirm this matches the order
# preprocess_data() actually returns; otherwise X_test/y_test (and
# X_val/y_val) are paired with the wrong labels.
(
    X_train,
    X_test,
    X_val,
    y_train,
    y_val,
    y_test,
) = preprocess_data()

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import numpy as np
from const import N_SPLITS, RANDOM_STATE, RANDOM_FOREST, RANDOM_FOREST_ID

# Identify the algorithm tuned in this notebook. ALGORITHM_NAME doubles as the
# pipeline step name, so it is also the prefix of every key in the grid below.
ALGORITHM_NAME = RANDOM_FOREST
ALGORITHM_ID = RANDOM_FOREST_ID

# Shuffled K-fold with a fixed seed so the folds are reproducible.
# The fold count comes from the shared N_SPLITS constant (it was imported but
# a literal 20 was used instead), keeping this notebook's CV protocol in line
# with the project-wide setting.
# NOTE(review): if N_SPLITS != 20 this changes the number of folds -- confirm.
kf = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# --- Hyper-parameter candidates -------------------------------------------
n_estimators = [10, 50, 100, 150, 200, 300]

# scikit-learn documents "entropy" and "log_loss" as the same criterion
# (Shannon information gain), so listing both only fits every candidate twice.
# "log_loss" is therefore left out of the search.
criterion = ["gini", "entropy"]

max_depth = [2, 3, 4, 5, 8, 10, 25, None]

# Float values are interpreted by scikit-learn as fractions of the number of
# samples, not as absolute counts.
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)

min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)

# --- Pipeline ---------------------------------------------------------------
# Single-step pipeline; the classifier is seeded so repeated runs give the
# same forests.
steps = [(ALGORITHM_NAME, RandomForestClassifier(
    random_state=RANDOM_STATE,
))]

pipeline = Pipeline(steps)

# Grid keys follow the Pipeline convention "<step name>__<parameter>".
parameters = {
    f"{ALGORITHM_NAME}__n_estimators": n_estimators,
    f"{ALGORITHM_NAME}__criterion": criterion,
    f"{ALGORITHM_NAME}__max_depth": max_depth,
    f"{ALGORITHM_NAME}__min_samples_split": min_samples_split,
    f"{ALGORITHM_NAME}__min_samples_leaf": min_samples_leaf,
}

# Exhaustive search: 6 * 2 * 8 * 10 * 5 = 4800 candidates, each fitted once per
# fold. n_jobs=-1 spreads those independent fits over all available cores; the
# selected model is unaffected because both the folds and the forests are
# seeded.
cv = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=kf,
    scoring="accuracy",
    n_jobs=-1,
)

cv.fit(X_train, y_train)


In [None]:
# Display the mean cross-validated accuracy of the best parameter combination
# found by the grid search.
cv.best_score_

In [None]:
from process_cv_results import process_cv_scores, process_best_params, get_classification_metrics, results_to_csv
# Build the scores table for this algorithm from the fitted search object
# (presumably the cross-validation accuracies -- see process_cv_results).
accuracy_scores_df = process_cv_scores(
    cv,
    algorithm_id=ALGORITHM_ID,
)
accuracy_scores_df.head()


In [None]:
# Build the best-parameters table for this algorithm from the fitted search
# object, then preview it.
best_params_df = process_best_params(
    cv,
    algorithm_id=ALGORITHM_ID,
    algorithm_name=ALGORITHM_NAME,
)
best_params_df.head()


In [None]:
# Predict on the held-out test features with the refitted best estimator and
# compute the classification metrics against the test labels.
y_pred = cv.predict(X_test)
classification_metrics_df = get_classification_metrics(
    y_test,
    y_pred,
    cv,
    ALGORITHM_ID,
    ALGORITHM_NAME,
)

classification_metrics_df.head()


In [None]:
# Gather the three result tables and hand them to results_to_csv for export
# under this algorithm's id.
dfs = [
    accuracy_scores_df,
    best_params_df,
    classification_metrics_df,
]

results_to_csv(dfs, ALGORITHM_ID)
