In [134]:
import importlib
import src
import os
import json
import pickle
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from typing import Optional, Union
from src.utils import fetch_data
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import TargetEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
importlib.reload(src.utils)

<module 'src.utils' from 'f:\\data science\\ml projects\\ml project by engineering wala bhaiya\\ml_pipeline_project\\src\\utils.py'>

## Function

### 1. Model Pipeline

In [101]:
def model_pipelines(model_name: str, model, **model_kwargs):
    """Build a target-encoding + estimator pipeline.

    Parameters
    ----------
    model_name : str
        Name given to the estimator step. Hyper-parameter grids address the
        estimator through this name (``"<model_name>__<param>"``).
    model : callable
        Estimator class (or any factory) that is called to create the
        estimator instance.
    **model_kwargs
        Extra keyword arguments forwarded to ``model``. ``n_jobs`` defaults to
        ``-1`` when the caller does not supply it.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'target'`` (the column transformer) and
        ``model_name`` (the estimator).
    """
    # The columns to target-encode are listed in a project config file.
    with open('config/data_config/transform_features.json', 'r') as json_file:
        transform_features = json.load(json_file)

    preprocessor = ColumnTransformer(
        transformers=[
            ('target', TargetEncoder(), transform_features['target_features']),
        ],
        # Columns that are not target-encoded are passed through unchanged.
        remainder='passthrough'
    )

    # Preserve the previous hard-coded behaviour while allowing an override.
    model_kwargs.setdefault('n_jobs', -1)

    model_pipeline = Pipeline(
        steps=[
            ('target', preprocessor),
            (model_name, model(**model_kwargs))
        ]
    )
    return model_pipeline

### 2. Baseline Model

In [106]:
def baseline_model(X_train, y_train, model_name: str, model, scores=None):
    """Fit a default-parameter pipeline for ``model`` with a fixed seed.

    Parameters
    ----------
    X_train, y_train
        Training features and target. ``y_train`` is flattened with
        ``np.ravel`` before fitting.
    model_name : str
        Name of the estimator step inside the pipeline.
    model : callable
        Estimator class; it must accept ``random_state`` and ``n_jobs``.
    scores
        Unused. Kept so existing calls that pass it keep working.

    Returns
    -------
    The fitted pipeline.
    """
    from functools import partial

    # Bind the seed to the estimator factory instead of passing it through
    # model_pipelines as a keyword, which its two-argument signature rejects.
    seeded_model = partial(model, random_state=42)
    model_pipeline = model_pipelines(model_name, seeded_model)
    model_pipeline.fit(X_train, np.ravel(y_train))
    return model_pipeline

### 3. Tuned Model

In [127]:
def train_model(
    X_train,
    y_train,
    model_name: str,
    model,
    scoring: list,
    n_iter: int,
    param_distributions: Union[dict, list],
    **model_kwargs):
    """Run a randomized hyper-parameter search over the model pipeline.

    Parameters
    ----------
    X_train, y_train
        Training features and target. ``y_train`` is flattened with
        ``np.ravel`` before fitting.
    model_name : str
        Name of the estimator step; keys in ``param_distributions`` that
        target the estimator must be prefixed with ``"<model_name>__"``.
    model : callable
        Estimator class used to build the pipeline.
    scoring : list
        Scorer names evaluated for every candidate. Must contain ``'f1'``
        because the best candidate is refit on that metric.
    n_iter : int
        Number of parameter settings sampled.
    param_distributions : dict or list of dict
        Search space handed to ``RandomizedSearchCV``.
    **model_kwargs
        Extra constructor arguments for ``model``. ``model_pipelines``
        supplies ``n_jobs`` itself, so an ``n_jobs`` given here is overridden.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object.
    """
    from functools import partial

    # Bind constructor arguments to the factory so the pipeline builder only
    # ever receives the (name, factory) pair it is declared to accept.
    estimator_factory = partial(model, **model_kwargs) if model_kwargs else model
    model_pipeline = model_pipelines(model_name, estimator_factory)

    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    random_best_search = RandomizedSearchCV(
        estimator=model_pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv,
        refit='f1',
        verbose=1,
        n_jobs=-1,
        return_train_score=True,
        # Seed the candidate sampling too; the folds are already seeded.
        random_state=42
    )
    return random_best_search.fit(X_train, np.ravel(y_train))

### 4. Measuring Scores

In [33]:
def measure_scores(estimator, X_test, y_test):
  """Print and return hold-out classification metrics for a fitted estimator.

  Parameters
  ----------
  estimator
      Fitted object exposing ``predict`` and ``predict_proba``.
  X_test, y_test
      Evaluation features and true labels.

  Returns
  -------
  dict
      Maps ``'accuracy'``, ``'balanced_accuracy'``, ``'precision'``,
      ``'recall'``, ``'f1'`` and ``'roc_auc'`` to their scores. Callers pass
      this dict on to ``save_model`` / ``save_model_params``, so it must not
      be ``None``.
  """
  y_pred = estimator.predict(X_test)
  # ROC AUC is computed from scores, not hard labels: use the probability in
  # column 1 (assumed to be the positive class).
  y_score = estimator.predict_proba(X_test)[:, 1]

  metrics = {
      'accuracy': accuracy_score,
      'balanced_accuracy': balanced_accuracy_score,
      'precision': precision_score,
      'recall': recall_score,
      'f1': f1_score,
      'roc_auc': roc_auc_score
  }

  results = {}
  for metric_name, metric_func in metrics.items():
      prediction = y_score if metric_name == 'roc_auc' else y_pred
      results[metric_name] = metric_func(y_test, prediction)
      print(f"{metric_name}: {results[metric_name]}")

  return results


### 5. Model Cross Validation Scores

In [89]:
def model_cv_scores(
    results: dict,
    scores: list,
    save_json: bool = False,
    save_path: Optional[str] = None,
    name: Optional[str] = None,
    show_test_scores: bool = True,
    show_train_scores: bool = False
    ) -> dict:
    """Average the per-candidate CV statistics of a search result.

    Parameters
    ----------
    results : dict
        Mapping with ``mean_test_<score>`` / ``std_test_<score>`` entries
        (and the ``train`` equivalents when those are requested or saved).
    scores : list
        Score names used to build the keys looked up in ``results``.
    save_json : bool
        When true, write the averaged test *and* train statistics to
        ``<save_path>/<name>``.
    save_path, name : str, optional
        Destination directory and file name; both required with ``save_json``.
    show_test_scores, show_train_scores : bool
        Select which groups are included in the returned dict.

    Returns
    -------
    dict
        Key -> ``np.average`` of the matching ``results`` entry.

    Raises
    ------
    ValueError
        If saving is requested without a destination, or if neither group
        is selected.
    """
    destination_missing = save_path is None or name is None
    if save_json and destination_missing:
        raise ValueError("save_path and name must be provided if save_json is True")

    stats = ('mean', 'std')
    test_keys = [f'{stat}_test_{score}' for score in scores for stat in stats]
    train_keys = [f'{stat}_train_{score}' for score in scores for stat in stats]

    selected = (test_keys if show_test_scores else []) + (train_keys if show_train_scores else [])
    if len(selected) == 0:
        raise ValueError('You must show either test or train scores')

    cv_scores = {}
    for key in selected:
        cv_scores[key] = np.average(results[key])

    if save_json:
        # The file always holds both groups, regardless of the show_* flags.
        everything = {}
        for key in test_keys + train_keys:
            everything[key] = np.average(results[key])
        with open(f"{save_path}/{name}", 'w') as json_file:
            json.dump(everything, json_file, indent=4)

    return cv_scores

### 6. Save Model

In [None]:
def save_model(estimator, scores: dict, path: str, basic_name: str):
  """Dump ``estimator`` to ``<path>/<basic_name>_p<precision>_r<recall>.pkl``.

  The precision and recall values are read from ``scores`` and rounded to
  six decimals before being embedded in the file name.
  """
  rounded = {}
  for metric in scores.keys():
    if metric in ['precision', 'recall']:
      rounded[metric] = round(scores[metric], 6)

  model_name = basic_name + "_p{}_r{}".format(rounded['precision'], rounded['recall'])
  target = f"{path}/{model_name}.pkl"
  with open(target, 'wb') as handle:
    joblib.dump(estimator, handle)

### 7. Save Trained Pipeline

In [107]:
def save_trained_pipeline(trained_pipeline, path: str, name: str):
  """Serialize ``trained_pipeline`` with joblib to ``<path>/<name>.pkl``."""
  target = f"{path}/{name}.pkl"
  with open(target, 'wb') as handle:
    joblib.dump(trained_pipeline, handle)

### 8. Save Model Parameters

In [108]:
def save_model_params(params: dict, scores: dict, path: str, basic_name: str):
  """Write ``params`` as JSON to ``<path>/<basic_name>_p<precision>_r<recall>.json``.

  The precision and recall values are read from ``scores`` and rounded to
  six decimals before being embedded in the file name.
  """
  rounded = {}
  for metric in scores.keys():
    if metric in ['precision', 'recall']:
      rounded[metric] = round(scores[metric], 6)

  name = basic_name + "_p{}_r{}".format(rounded['precision'], rounded['recall'])
  target = f"{path}/{name}.json"
  with open(target, 'w') as json_file:
    json.dump(params, json_file, indent=4)

In [124]:
def save_model_configs(
    estimator,
    trained_pipeline,
    cv_results: dict,
    params:dict,
    scores: dict,
    model_save_path: str,
    model_name: str,
    pipeline_save_path: str,
    pipeline_name: str,
    model_params_save_path: str,
    model_params_name: str,
    cv_scores_save_path: str,
    cv_scores_name: str,
    ):
  """Persist every artifact of a tuning run and print the averaged CV scores.

  In order: the estimator, the trained pipeline object, the chosen
  parameters, and the cross-validation scores (as JSON). The keys of
  ``scores`` are reused as the score names looked up in ``cv_results``.
  """
  save_model(estimator, scores, model_save_path, model_name)
  save_trained_pipeline(trained_pipeline, pipeline_save_path, pipeline_name)
  save_model_params(params, scores, model_params_save_path, model_params_name)

  cv_scores = model_cv_scores(
      cv_results,
      scores,
      save_json=True,
      save_path=cv_scores_save_path,
      name=cv_scores_name
      )

  for metric_key in cv_scores:
    label = metric_key.replace('_', ' ').title()
    print(label, ':', cv_scores[metric_key])

## Fetch Data From the Source

In [128]:
# Two versions of the feature matrices are loaded: the engineered set from the
# 'featured' directory and the simple set from the 'processed' directory.
X_train_transformed = fetch_data(FILE_NAME='X_train_transformed.csv', DIRECTORY_NAME='featured')
X_test_transformed = fetch_data(FILE_NAME='X_test_transformed.csv', DIRECTORY_NAME='featured')
X_train_simple_transformed = fetch_data(FILE_NAME='X_train_simple_transformed.csv', DIRECTORY_NAME='processed')
X_test_simple_transformed = fetch_data(FILE_NAME='X_test_simple_transformed.csv', DIRECTORY_NAME='processed')
# The same targets (loaded from 'featured') are used with both feature sets.
# NOTE(review): this assumes both sets share the same rows in the same order -- confirm.
y_train = fetch_data(FILE_NAME='y_train_transformed.csv', DIRECTORY_NAME='featured')
y_test = fetch_data(FILE_NAME='y_test_transformed.csv', DIRECTORY_NAME='featured')

## Model Training

In [104]:
# Metric names shared by the whole notebook: passed as `scoring` to the
# randomized search and used to build the `mean_test_*` / `std_test_*` keys
# read back from `cv_results_`.
scores = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc']

### 1. Random Forest

#### 1. Baseline Random Forest
This section covers only the `baseline models`. Each one is trained on two versions of the income dataset: the newly extracted `featured set` and the simple `processed set`.

##### 1. Baseline Random Forest on Featured Set

In [None]:
baseline_model_featured = baseline_model(X_train_transformed, y_train, 'random_forest', RandomForestClassifier, scores)
model_scores = measure_scores(baseline_model_featured, X_test_transformed, y_test)

In [None]:
model_save_path = 'models/random_forest/random_forest_featured_set'
save_model(
    estimator=baseline_model_featured,
    scores=model_scores,
    path=model_save_path,
    basic_name='random_forest_baseline'
    )

##### 2. BaseLine Random Forest on Simple Processed Set

In [None]:
# The model is trained on the simple processed set, so it must be evaluated on
# the matching simple test set (not the featured one).
baseline_model_featured = baseline_model(X_train_simple_transformed, y_train, 'random_forest', RandomForestClassifier, scores)
model_scores = measure_scores(baseline_model_featured, X_test_simple_transformed, y_test)
model_scores

In [None]:
model_save_path = 'models/random_forest/random_forest_simple_featured_set'
save_model(
    estimator=baseline_model_featured,
    scores=model_scores,
    path=model_save_path,
    basic_name='random_forest_baseline'
    )

#### 2. Random Forest With Hyper Parameter Tuning
This section covers the models with tuned hyperparameters, trained on both the newly extracted `featured set` and the simple `processed set` of the income dataset.

##### 1. Tuned Random Forest on Featured Set

In [None]:
# NOTE: not meant to be run on this machine -- this search was run on Google Colab.

with open('config/model_config/random_forest_params.json', 'r') as json_file:
    random_forest_param_distributions = json.load(json_file)

# NOTE(review): the keys in this JSON must carry the pipeline step prefix
# ('random_forest__<param>') for the search to accept them -- confirm.
random_search = train_model(
    X_train_transformed,
    y_train,
    'random_forest',
    RandomForestClassifier,
    scores,
    n_iter=30,
    # The search space goes in `param_distributions`; any other keyword would
    # be forwarded to the estimator constructor.
    param_distributions=random_forest_param_distributions
    )

In [118]:
with open('models/random_forest/random_forest_featured_set/random_forest_tuned_p0.791045_r0.617304.pkl', 'rb') as estemator:
    random_forest_tuned = joblib.load(estemator)

model_scores = measure_scores(random_forest_tuned, X_test_transformed, y_test)
model_scores

accuracy: 0.865711361310133
balanced_accuracy: 0.7820435033835814
precision: 0.7910447761194029
recall: 0.6173044925124792
f1: 0.6934579439252336
roc_auc: 0.9210946051991363


In [None]:
# NOTE: not meant to be run on this machine -- the tuned model was trained on Google Colab.

# Persist the refit best pipeline; precision/recall from `model_scores` end up
# in the file name.
model_save_path = 'models/random_forest/random_forest_featured_set'
save_model(
    estimator=random_search.best_estimator_,
    scores=model_scores,
    path=model_save_path,
    basic_name='random_forest_tuned'
    )

In [None]:
# NOTE: not meant to be run on this machine -- the tuned model was trained on Google Colab.

pipeline_save_path = 'models/random_forest/random_forest_featured_set/random_forest_trained_pipeline'
save_trained_pipeline(
    trained_pipeline=random_search.best_estimator_,
    path=pipeline_save_path,
    name='random_forest_trained_pipeline'
)

In [None]:
# NOTE: not meant to be run on this machine -- the tuned model was trained on Google Colab.

# Store the winning hyper-parameters next to the other model configs.
model_params_save_path = 'config/model_config'
save_model_params(
    params=random_search.best_params_,
    scores=model_scores,
    path=model_params_save_path,
    basic_name='random_forest_tuned_params'
)

In [None]:
# NOTE: not meant to be run on this machine -- the tuned model was trained on Google Colab.

# Average the CV statistics over all sampled candidates and write them to JSON.
cv_scores_save_path = 'models/random_forest/random_forest_featured_set/random_forest_trained_pipeline'
cv_scores = model_cv_scores(
    results=random_search.cv_results_,
    scores=scores,
    save_json=True,
    save_path=cv_scores_save_path,
    name='random_forest_tuned_cv_scores.json',
    show_test_scores=True,
)

cv_scores

In [123]:
cv_scores_save_path = 'models/random_forest/random_forest_featured_set/random_forest_trained_tuned_pipeline'
with open(f'{cv_scores_save_path}/random_forest_trained_pipeline.pkl', 'rb') as f:
    trained_pipeline = joblib.load(f)
model_cv_scores(results=trained_pipeline.cv_results_, scores=scores)

{'mean_test_accuracy': 0.8603282099390548,
 'std_test_accuracy': 0.004069708330945389,
 'mean_test_balanced_accuracy': 0.7702609637927857,
 'std_test_balanced_accuracy': 0.00858864047247017,
 'mean_test_precision': 0.7697937247792505,
 'std_test_precision': 0.008452635427149257,
 'mean_test_recall': 0.5971331768570057,
 'std_test_recall': 0.01811029719303307,
 'mean_test_f1': 0.6719312940360602,
 'std_test_f1': 0.01272904679431401,
 'mean_test_roc_auc': 0.9138426382291606,
 'std_test_roc_auc': 0.0049910019846350195}

##### 2. Random Forest With Hyper Parameter Tuning on Simple Processed Set

In [None]:
params_path = 'config/model_config/random_forest_params.json'
# train_model needs the search space itself, not the path to it.
with open(params_path, 'r') as json_file:
    random_forest_param_distributions = json.load(json_file)

random_search = train_model(
    X_train_simple_transformed,
    y_train,
    'random_forest',
    RandomForestClassifier,
    scores,
    n_iter=30,
    param_distributions=random_forest_param_distributions
    )

model_scores = measure_scores(random_search.best_estimator_, X_test_simple_transformed, y_test)
model_scores

### 2. Logistic Regression

#### 1. BaseLine Logistic Regression
This section covers only the `baseline models`. Each one is trained on two versions of the income dataset: the newly extracted `featured set` and the simple `processed set`.

##### 1. BaseLine Logistic Regression on Featured Set

In [None]:
baseline_model_featured = baseline_model(X_train_transformed, y_train, 'logistic_regression', LogisticRegression, scores)
model_scores = measure_scores(baseline_model_featured, X_test_transformed, y_test)
model_scores

In [None]:
model_save_path = 'models/logistic_regression/logistic_regression_featured_set'
save_model(
    estimator=baseline_model_featured,
    scores=model_scores,
    path=model_save_path,
    basic_name='logistic_regression_baseline'
    )

##### 2. BaseLine Logistic Regression on Simple Processed Set

In [None]:
baseline_model_simple_featured = baseline_model(X_train_simple_transformed, y_train, 'logistic_regression', LogisticRegression, scores)
model_scores = measure_scores(baseline_model_simple_featured, X_test_simple_transformed, y_test)
model_scores

In [None]:
model_save_path = 'models/logistic_regression/logistic_regression_simple_featured_set'
save_model(
    # Save the model trained on the simple set; `baseline_model_featured`
    # is the one fitted on the featured set.
    estimator=baseline_model_simple_featured,
    scores=model_scores,
    path=model_save_path,
    basic_name='logistic_regression_baseline'
    )

#### 2. Logistic Regression With Hyper Parameter Tuning
This section covers the models with tuned hyperparameters, trained on both the newly extracted `featured set` and the simple `processed set` of the income dataset.

##### 1. Logistic Regression With Hyper Parameter Tuning on Featured Set

In [None]:
path = 'config/model_config/logistic_regression/logistic_regression_params.json'
with open(path, 'r') as json_file:
    logistic_regression_params = json.load(json_file)

random_search = train_model(
    X_train_transformed,
    y_train,
    'logistic_regression',
    LogisticRegression,
    scores,
    n_iter=30,
    param_distributions=logistic_regression_params
    )

model_scores = measure_scores(random_search.best_estimator_, X_test_transformed, y_test)
model_scores

In [None]:
save_model_configs(
    estimator=random_search.best_estimator_,
    trained_pipeline=random_search,
    cv_results=random_search.cv_results_,
    params=random_search.best_params_,
    scores=model_scores,
    model_save_path='models/logistic_regression/logistic_regression_featured_set',
    model_name='logistic_regression_tuned',
    pipeline_save_path='models/logistic_regression/logistic_regression_featured_set/logistic_regression_trained_tuned_pipeline',
    pipeline_name='logistic_regression_trained_pipeline',
    model_params_save_path='config/model_config/logistic_regression/logistic_regression_featured_set',
    model_params_name='logistic_regression_tuned_params',
    cv_scores_save_path='models/logistic_regression/logistic_regression_featured_set/logistic_regression_trained_tuned_pipeline',
    cv_scores_name='logistic_regression_tuned_cv_scores.json'
)

In [131]:
cv_scores_save_path = 'models/logistic_regression/logistic_regression_featured_set/logistic_regression_trained_tuned_pipeline'
with open(f'{cv_scores_save_path}/logistic_regression_trained_pipeline.pkl', 'rb') as f:
    trained_pipeline = joblib.load(f)
model_cv_scores(results=trained_pipeline.cv_results_, scores=scores)

{'mean_test_accuracy': 0.8234658998808505,
 'std_test_accuracy': 0.003784368245920948,
 'mean_test_balanced_accuracy': 0.7575381298212024,
 'std_test_balanced_accuracy': 0.006519037761809717,
 'mean_test_precision': 0.6632689686292229,
 'std_test_precision': 0.008219455479027577,
 'mean_test_recall': 0.6308118783941556,
 'std_test_recall': 0.012782841501366953,
 'mean_test_f1': 0.6284152180950424,
 'std_test_f1': 0.009463117919747244,
 'mean_test_roc_auc': 0.8899037261600657,
 'std_test_roc_auc': 0.005624433968585372}

##### 2. Logistic Regression With Hyper Parameter Tuning on Simple Processed Set

In [None]:
path = 'config/model_config/logistic_regression/logistic_regression_params.json'
with open(path, 'r') as json_file:
    logistic_regression_params = json.load(json_file)

random_search = train_model(
    X_train_simple_transformed,
    y_train,
    'logistic_regression',
    LogisticRegression,
    scores,
    n_iter=150,
    param_distributions=logistic_regression_params
    )

# Evaluate on the simple test set, matching the data the search was fitted on.
model_scores = measure_scores(random_search.best_estimator_, X_test_simple_transformed, y_test)
model_scores

In [None]:
save_model_configs(
    estimator=random_search.best_estimator_,
    trained_pipeline=random_search,
    cv_results=random_search.cv_results_,
    params=random_search.best_params_,
    scores=model_scores,
    model_save_path='models/logistic_regression/logistic_regression_simple_featured_set',
    model_name='logistic_regression_tuned',
    pipeline_save_path='models/logistic_regression/logistic_regression_simple_featured_set/logistic_regression_trained_tuned_pipeline',
    pipeline_name='logistic_regression_trained_pipeline',
    model_params_save_path='config/model_config/logistic_regression/logistic_regression_simple_featured_set',
    model_params_name='logistic_regression_tuned_params',
    cv_scores_save_path='models/logistic_regression/logistic_regression_simple_featured_set/logistic_regression_trained_tuned_pipeline',
    cv_scores_name='logistic_regression_tuned_cv_scores.json'
)

In [132]:
cv_scores_save_path = 'models/logistic_regression/logistic_regression_simple_featured_set/logistic_regression_trained_tuned_pipeline'
with open(f'{cv_scores_save_path}/logistic_regression_trained_pipeline.pkl', 'rb') as f:
    trained_pipeline = joblib.load(f)
model_cv_scores(results=trained_pipeline.cv_results_, scores=scores)

{'mean_test_accuracy': 0.7995272011078303,
 'std_test_accuracy': 0.003984685473589032,
 'mean_test_balanced_accuracy': 0.7458942876148469,
 'std_test_balanced_accuracy': 0.006630134347271698,
 'mean_test_precision': 0.6158721672186557,
 'std_test_precision': 0.009344181518383178,
 'mean_test_recall': 0.6428008257396293,
 'std_test_recall': 0.013172787657419919,
 'mean_test_f1': 0.5902569457535388,
 'std_test_f1': 0.010061212180905391,
 'mean_test_roc_auc': 0.8703279085617353,
 'std_test_roc_auc': 0.006149963878709868}

### 3. XGBoost Classifier

#### 1. BaseLine XGBoost Classifier
This section covers only the `baseline models`. Each one is trained on two versions of the income dataset: the newly extracted `featured set` and the simple `processed set`.

##### 1. BaseLine XGBoost on Featured Set

In [None]:
# Name the pipeline step after the estimator it holds, consistent with the
# 'xgboost_classifier' step used by the tuned XGBoost runs.
baseline_model_featured = baseline_model(X_train_transformed, y_train, 'xgboost_classifier', xgb.XGBClassifier, scores)
model_scores = measure_scores(baseline_model_featured, X_test_transformed, y_test)
model_scores

In [None]:
model_save_path = 'models/XGBoost/XGBoost_featured_set'
save_model(
    estimator=baseline_model_featured,
    scores=model_scores,
    # Use the relative path directly, as every other save cell does;
    # BASE_PATH is not defined anywhere in this notebook.
    path=model_save_path,
    basic_name='xgboost_baseline'
    )

##### 2. BaseLine XGBoost on Simple Featured Set

In [None]:
# Name the pipeline step after the estimator it holds, consistent with the
# 'xgboost_classifier' step used by the tuned XGBoost runs.
baseline_model_featured = baseline_model(X_train_simple_transformed, y_train, 'xgboost_classifier', xgb.XGBClassifier, scores)
model_scores = measure_scores(baseline_model_featured, X_test_simple_transformed, y_test)
model_scores

In [None]:
model_save_path = 'models/XGBoost/XGBoost_simple_featured_set'
save_model(
    estimator=baseline_model_featured,
    scores=model_scores,
    path=model_save_path,
    basic_name='xgboost_baseline'
    )

#### 2. XGBoost With Hyper Parameter Tuning
This section covers the models with tuned hyperparameters, trained on both the newly extracted `featured set` and the simple `processed set` of the income dataset.

##### 1. XGBoost With Hyper Parameter Tuning on Featured Set

In [None]:
path = 'config/model_config/XGBoost/XGBoost_params.json'
with open(path, 'r') as json_file:
    xgboost_params = json.load(json_file)

random_search = train_model(
    X_train_transformed,
    y_train,
    'xgboost_classifier',
    xgb.XGBClassifier,
    scores,
    n_iter=150,
    param_distributions=xgboost_params
    )

model_scores = measure_scores(random_search.best_estimator_, X_test_transformed, y_test)
model_scores

In [None]:
save_model_configs(
    estimator=random_search.best_estimator_,
    trained_pipeline=random_search,
    cv_results=random_search.cv_results_,
    params=random_search.best_params_,
    scores=model_scores,
    model_save_path='models/XGBoost/XGBoost_featured_set',
    model_name='XGBoost_tuned',
    pipeline_save_path='models/XGBoost/XGBoost_featured_set/XGBoost_trained_tuned_pipeline',
    pipeline_name='XGBoost_trained_pipeline',
    model_params_save_path='config/model_config/XGBoost/XGBoost_featured_set',
    model_params_name='XGBoost_tuned_params',
    cv_scores_save_path='models/XGBoost/XGBoost_featured_set/XGBoost_trained_tuned_pipeline',
    cv_scores_name='XGBoost_tuned_cv_scores.json'
)

In [135]:
cv_scores_save_path = 'models/XGBoost/XGBoost_featured_set/XGBoost_trained_tuned_pipeline'
with open(f'{cv_scores_save_path}/XGBoost_trained_pipeline.pkl', 'rb') as f:
    trained_pipeline = joblib.load(f)
model_cv_scores(results=trained_pipeline.cv_results_, scores=scores)

{'mean_test_accuracy': 0.8531047392632997,
 'std_test_accuracy': 0.005004477909378103,
 'mean_test_balanced_accuracy': 0.8138960687666335,
 'std_test_balanced_accuracy': 0.008925963185869674,
 'mean_test_precision': 0.6825378501441262,
 'std_test_precision': 0.009547344314113942,
 'mean_test_recall': 0.7385285557186527,
 'std_test_recall': 0.018093754620372018,
 'mean_test_f1': 0.7057704611565705,
 'std_test_f1': 0.011623533264370212,
 'mean_test_roc_auc': 0.9187415608657994,
 'std_test_roc_auc': 0.005771131165763514}

##### 2. XGBoost With Hyper Parameter Tuning on Simple Featured Set

In [None]:
path = 'config/model_config/XGBoost/XGBoost_params.json'
with open(path, 'r') as json_file:
    xgboost_params = json.load(json_file)

random_search = train_model(
    X_train_simple_transformed,
    y_train,
    'xgboost_classifier',
    xgb.XGBClassifier,
    scores,
    n_iter=150,
    param_distributions=xgboost_params
    )

# Evaluate on the simple test set, matching the data the search was fitted on.
model_scores = measure_scores(random_search.best_estimator_, X_test_simple_transformed, y_test)
model_scores

In [None]:
save_model_configs(
    estimator=random_search.best_estimator_,
    trained_pipeline=random_search,
    cv_results=random_search.cv_results_,
    params=random_search.best_params_,
    scores=model_scores,
    model_save_path='models/XGBoost/XGBoost_simple_featured_set',
    model_name='XGBoost_tuned',
    pipeline_save_path='models/XGBoost/XGBoost_simple_featured_set/XGBoost_trained_tuned_pipeline',
    pipeline_name='XGBoost_trained_pipeline',
    model_params_save_path='config/model_config/XGBoost/XGBoost_simple_featured_set',
    model_params_name='XGBoost_tuned_params',
    cv_scores_save_path='models/XGBoost/XGBoost_simple_featured_set/XGBoost_trained_tuned_pipeline',
    cv_scores_name='XGBoost_tuned_cv_scores.json'
)

In [136]:
cv_scores_save_path = 'models/XGBoost/XGBoost_simple_featured_set/XGBoost_trained_tuned_pipeline'
with open(f'{cv_scores_save_path}/XGBoost_trained_pipeline.pkl', 'rb') as f:
    trained_pipeline = joblib.load(f)
model_cv_scores(results=trained_pipeline.cv_results_, scores=scores)

{'mean_test_accuracy': 0.8536053089703163,
 'std_test_accuracy': 0.0052195414127829826,
 'mean_test_balanced_accuracy': 0.8173201620281749,
 'std_test_balanced_accuracy': 0.008946568047939214,
 'mean_test_precision': 0.680354148659299,
 'std_test_precision': 0.009617007067388925,
 'mean_test_recall': 0.7475721898450255,
 'std_test_recall': 0.017590401174884657,
 'mean_test_f1': 0.7094018599485755,
 'std_test_f1': 0.011668951498662742,
 'mean_test_roc_auc': 0.9201910891661487,
 'std_test_roc_auc': 0.005782584846172538}

## Deprecated Code

In [None]:
# This code is to making json file for parameter grid of the random forest model.
'''
random_forest_param_grid = {
    'n_estimators' : [100, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'random_state': [42]
}

with open('config/model_config/random_forest_params.json', 'w') as json_file:
    json_file.write(json.dumps(random_forest_param_grid, indent=4))
'''

'''
# os.chdir(os.path.dirname(os.getcwd()))
with open('config/data_config/transform_features.json', 'r') as json_file:
    transform_features = json.load(json_file)

preprocessor = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), transform_features['target_features']),
    ],
    remainder='passthrough'
)

model_pipeline = Pipeline(
    steps=[
        ('target', preprocessor),
        ('random_forest', RandomForestClassifier(verbose=1, n_jobs=-1))
    ]
)

scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# model_pipeline.named_steps['target'].fit_transform(X_train, y_train)[0, :].shape

# cv = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
# scores = cross_validate(model_pipeline, X=X_train, y=y_train, cv=cv, scoring=scoring)

cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
grid_search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=random_forest_params_grid,
    n_iter=20,
    scoring=scoring,
    cv=cv,
    refit='roc_auc',
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)
grid_search.fit(X_train, np.ravel(y_train))

print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)
cv_results = grid_search.cv_results_
print(cv_results)

# Get the best estimator
best_estimator = grid_search.best_estimator_

# Predict on the test set
y_pred = best_estimator.predict(X_test)

# Calculate and print all metrics
metrics = {
    'accuracy': accuracy_score,
    'balanced_accuracy': balanced_accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
    'roc_auc': roc_auc_score
}

for metric_name, metric_func in metrics.items():
    if metric_name == 'roc_auc':
        score = metric_func(y_test, best_estimator.predict_proba(X_test)[:, 1])
    else:
        score = metric_func(y_test, y_pred)
    print(f"{metric_name}: {score}")

for key in list(scores.keys())[2:]:
    print(f"Average {key.replace('_', ' ').title()}: {scores[key].mean()}")
'''

# This code is use to make the parameter distribution of logistic regression
'''
logistic_regression_params_dist = {
    'logistic_regression__penalty': ['l1', 'l2', 'elasticnet'],
    'logistic_regression__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logistic_regression__solver': ['liblinear', 'lbfgs', 'saga'],
    'logistic_regression__max_iter': [100, 200, 500],
    'logistic_regression__class_weight': [None, 'balanced', {0: 1, 1: 2}],
    'logistic_regression__tol': [1e-4, 1e-3, 1e-2],
    'logistic_regression__multi_class': ['ovr'],
    'logistic_regression__l1_ratio':{
        "elasticnet": [0.1, 0.5, 0.9]  # Only used if penalty='elasticnet'
    }
}

path = 'config/model_config/logistic_regression/logistic_regression_params_distribution.json'
with open(os.path.join(BASE_PATH, path), 'w') as json_file:
    json.dump(logistic_regression_params_dist, json_file, indent=4)

logistic_regression_param_grid = []
for penalty, solver in zip(
    logistic_regression_params_dist["logistic_regression__penalty"],
    logistic_regression_params_dist["logistic_regression__solver"]):
    params = {
        # "logistic_regression__penalty": [penalty],
        "logistic_regression__C": logistic_regression_params_dist["logistic_regression__C"],
        # "logistic_regression__solver": logistic_regression_params_dist["logistic_regression__solver"],
        "logistic_regression__max_iter": logistic_regression_params_dist["logistic_regression__max_iter"],
        "logistic_regression__class_weight": logistic_regression_params_dist["logistic_regression__class_weight"][:-1],
        "logistic_regression__tol": logistic_regression_params_dist["logistic_regression__tol"],
        "logistic_regression__multi_class": logistic_regression_params_dist["logistic_regression__multi_class"]
    }

    if penalty == "elasticnet":
        params["logistic_regression__l1_ratio"] = logistic_regression_params_dist["logistic_regression__l1_ratio"]["elasticnet"]
    
    if solver == 'lbfgs':
        params['logistic_regression__penalty'] = ['l2']
        params['logistic_regression__solver'] = ['lbfgs']
    elif solver == 'liblinear':
        params['logistic_regression__penalty'] = ['l1', 'l2']
        params['logistic_regression__solver'] = ['liblinear']
    elif solver == 'saga':
        params['logistic_regression__penalty'] = ['elasticnet', 'l1', 'l2']
        params['logistic_regression__solver'] = ['saga']
    
    logistic_regression_param_grid.append(params)
logistic_regression_param_grid

path = 'config/model_config/logistic_regression/logistic_regression_params.json'
with open(os.path.join(BASE_PATH, path), 'w') as json_file:
    json.dump(logistic_regression_param_grid, json_file, indent=4)
'''

# This code is used to defined a hyperparameters on XGBoost for classification problem and then save them into json file
'''
xgboost_params_grid = {
    'xgboost_classifier__objective': ['binary:logistic'],
    'xgboost_classifier__max_depth': [3, 6, 9],
    'xgboost_classifier__learning_rate': [0.01, 0.1, 0.3],
    'xgboost_classifier__n_estimators': [100, 200, 500],
    'xgboost_classifier__subsample': [0.8, 1.0],
    'xgboost_classifier__colsample_bytree': [0.8, 1.0],
    'xgboost_classifier__reg_alpha': [0, 0.1, 1.0],
    'xgboost_classifier__reg_lambda': [0, 0.1, 1.0],
    'xgboost_classifier__scale_pos_weight': [1.5, 1.86, 2.0, 2.5],
    'xgboost_classifier__eval_metric': ['logloss']
}
path = 'config/model_config/XGBoost/XGBoost_params.json'
with open(os.path.join(BASE_PATH, path), 'w') as json_file:
    json.dump(xgboost_params_grid, json_file, indent=4)
'''