In [9]:
import git
git.refresh(r'C:\Program Files\Git\bin\git.exe')

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Data

In [11]:
data_url = r'D:\Notes\git local repo\Diabetes Health Indicator\data\diabetesHI\diabetes_012_health_indicators_BRFSS2015.csv'

def sampler_pipeline(df:pd.DataFrame,label:str ,sampler:str) -> pd.DataFrame:
    X = df.drop(label, axis=1)
    y = df[[label]]
    
    sampling_list = ['No resampling (Original Data)', 'imblearn: RandomUnderSampler', 'imblearn: SMOTE']
    if sampler == sampling_list[0]:
        return X, y
    elif sampler == sampling_list[1]:
        ran_down = RandomUnderSampler(random_state=42)
        X_dws, y_dws = ran_down.fit_resample(X,y)
        return X_dws, y_dws
    elif sampler == sampling_list[2]:
        smote_ups = SMOTE(random_state=42)
        X_ups, y_ups = smote_ups.fit_resample(X, y)
        return X_ups, y_ups
    else:
        raise IndexError("Sampler is not on the pre-defined Sampling List:\
                          ['No resampling (Original Data)', 'imblearn: RandomUnderSampler', 'imblearn: SMOTE']")
    

In [12]:
df = pd.read_csv(data_url)
X, y = sampler_pipeline(df=df,label='Diabetes_012' ,sampler='imblearn: SMOTE')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Tune

In [13]:
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, f1_score, recall_score, precision_score
from ray.air.integrations.mlflow import setup_mlflow, MLflowLoggerCallback

import mlflow.sklearn


In [14]:
search_space = {
            'learning_rate': tune.uniform(0.001, 0.1),
            'max_depth': tune.randint(1, 15),
            'max_features': tune.uniform(0, 1),
            'min_samples_leaf': tune.randint(10, 15),
            'min_samples_split': tune.randint(15, 18),
            'n_estimators':  tune.randint(300, 500),
            'subsample':  tune.uniform(0.1, 0.9)
        }

search_algo = OptunaSearch()
scheduler = ASHAScheduler(
    max_t=100,
    grace_period=10,
    reduction_factor=2
)

artifact_path='model'
exp_name = 'DHI_Classifiers_RayOpt'
uri = "http://localhost:5000"


def train_gbc_sm(config: dict, X_train: pd.DataFrame, y_train: pd.DataFrame):

    X_train, y_train = X_train, y_train
    gbc_clf = GradientBoostingClassifier(
        learning_rate=config['learning_rate'],
        max_depth=config['max_depth'],
        max_features=config['max_features'],
        min_samples_leaf=config['min_samples_leaf'],
        min_samples_split=config['min_samples_split'],
        n_estimators=config['n_estimators'],
        subsample=config['subsample']
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True)
    y_pred_proba = cross_val_predict(estimator=gbc_clf, 
                                   X= X_train, 
                                   y = y_train.values.ravel(), 
                                   cv=skf, 
                                   method='predict_proba')
    
    log_loss_score = log_loss(y_true=y_train, y_pred=y_pred_proba)
    y_pred_class = y_pred_proba.argmax(axis=1)

    metrics = {
            'log_loss' : log_loss_score,
            'accuracy': accuracy_score(y_train, y_pred_class),
            'f1_score': f1_score(y_train, y_pred_class, average='macro'),
            'recall_score': recall_score(y_train, y_pred_class, average='macro'),
            'precision': precision_score(y_train, y_pred_class, average='macro')
        }

    train.report(
        metrics=metrics
    )

    with mlflow.start_run(run_name=exp_name):
            # Log the model
        mlflow.sklearn.log_model(gbc_clf, artifact_path=artifact_path)

train_gbc_gpu_sm = tune.with_resources(train_gbc_sm, {'cpu':4, 'gpu':1})
mlflow_cb = MLflowLoggerCallback(
                    tracking_uri=uri,
                    experiment_name=exp_name,
                    save_artifact=True,
                )

def tune_to_mlflow_sm():

    tuner = tune.Tuner(
        tune.with_parameters(trainable=train_gbc_gpu_sm, X_train=X_train, y_train=y_train),
        param_space=search_space,
        tune_config=tune.TuneConfig(
            num_samples=2, #10
            metric='log_loss', 
            mode='min',
            search_alg=search_algo,
            scheduler=scheduler),
        run_config=train.RunConfig(
            name="gradient_boosting_tuning",
            callbacks=[mlflow_cb]
        )
    )

    results = tuner.fit()

result = tune_to_mlflow_sm()

0,1
Current time:,2024-11-27 20:39:42
Running for:,00:19:15.75
Memory:,12.9/15.9 GiB

Trial name,status,loc,learning_rate,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,subsample
train_gbc_sm_b11afb12,RUNNING,127.0.0.1:36432,0.00651972,7,0.829963,13,16,469,0.365664
train_gbc_sm_b80f7613,PENDING,,0.0299361,4,0.336513,11,15,395,0.711782


2024-11-27 20:39:42,045	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/JCA/ray_results/gradient_boosting_tuning' in 0.0175s.
2024-11-27 20:39:52,200	INFO tune.py:1041 -- Total run time: 1168.11 seconds (1155.73 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="C:/Users/JCA/ray_results/gradient_boosting_tuning", trainable=...)
- train_gbc_sm_b80f7613: FileNotFoundError('Could not fetch metrics for train_gbc_sm_b80f7613: both result.json and progress.csv were not found at C:/Users/JCA/ray_results/gradient_boosting_tuning/train_gbc_sm_b80f7613_2_learning_rate=0.0299,max_depth=4,max_features=0.3365,min_samples_leaf=11,min_samples_split=15,n_estimators_2024-11-27_20-20-33')
