# Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Load the raw BRFSS 2015 diabetes health-indicator table from CSV.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run on another machine until this is pointed at a local copy of the file.
df = pd.read_csv(r'D:\Notes\git local repo\Diabetes Health Indicator\data\diabetesHI\diabetes_012_health_indicators_BRFSS2015.csv')

def sampler_pipeline(df: pd.DataFrame, sampler: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split `df` into features/target and optionally rebalance the classes.

    Args:
        df: Frame containing the target column 'Diabetes_012' plus feature columns.
        sampler: One of
            'No resampling (Original Data)',
            'imblearn: RandomUnderSampler',
            'imblearn: SMOTE'.

    Returns:
        `(X, y)` where `X` holds every column except 'Diabetes_012' and `y` is a
        single-column DataFrame with 'Diabetes_012'. For the two imblearn
        options the pair is the output of `fit_resample`.

    Raises:
        IndexError: If `sampler` is not one of the supported names.
    """
    sampling_list = ['No resampling (Original Data)', 'imblearn: RandomUnderSampler', 'imblearn: SMOTE']

    # Validate first so a typo fails fast, before any data is touched.
    if sampler not in sampling_list:
        raise IndexError(
            f"Sampler {sampler!r} is not on the pre-defined Sampling List: {sampling_list}"
        )

    X = df.drop('Diabetes_012', axis=1)
    y = df[['Diabetes_012']]

    if sampler == sampling_list[0]:
        return X, y

    if sampler == sampling_list[1]:
        resampler = RandomUnderSampler(random_state=42)
    else:
        resampler = SMOTE(random_state=42)

    X_res, y_res = resampler.fit_resample(X, y)
    return X_res, y_res
    

# Rebalance the classes with random under-sampling, then carve out the hold-out set.
# NOTE(review): resampling happens before the split, so the test set is drawn
# from the resampled data rather than the original class distribution — confirm
# this is intended.
X, y = sampler_pipeline(df=df, sampler='imblearn: RandomUnderSampler')

# 80/20 split, stratified on the target so both parts keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Load Model From MLflow UI

In [2]:
import mlflow.sklearn

# Tracking-server address; also reused later in the notebook as the default
# `mlflow_uri` argument of the tuning helpers.
uri = "http://localhost:5000"
mlflow.set_tracking_uri(uri)

In [3]:
def get_or_create_experiment(experiment_name):
    """Return the MLflow experiment id for `experiment_name`, creating it if absent.

    Args:
        experiment_name: Name looked up with `mlflow.get_experiment_by_name`.

    Returns:
        The `experiment_id` of the existing experiment, or the value returned
        by `mlflow.create_experiment` when no experiment was found.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id

In [4]:
# `runs:/<run_id>/<artifact_path>` URI of a previously logged model.
model_uri = 'runs:/1e3489dcb6224eba98e3cee11f078a0a/model_gbc_optuna_2'
# Alternative loader (native sklearn flavor), currently disabled:
#gbc_optuna_model = mlflow.sklearn.load_model(model_uri=model_uri)

#pyfunc_path = "/tmp/gbc_dynamic"
# Load through the generic pyfunc flavor; the result is used below via `.predict`.
gbc_dynamic = mlflow.pyfunc.load_model(model_uri)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

 - cloudpickle (current: 2.2.1, required: cloudpickle==3.0.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [5]:
from sklearn.metrics import accuracy_score

# Score the loaded model on the hold-out split.
y_pred = gbc_dynamic.predict(X_test)
# y_test is a single-column DataFrame, so flatten it to 1-D before scoring.
asc = accuracy_score(y_true=y_test.values.ravel(), y_pred=y_pred)
print(f'Accuracy Score: {asc:.2%}')

Accuracy Score: 53.33%


# Tune Another Model Using Ray

In [6]:
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, f1_score, recall_score, precision_score
from ray.air.integrations.mlflow import MLflowLoggerCallback, setup_mlflow

### `MLflowLoggerCallback`

In [7]:
# Ranges Ray Tune samples from; the keys match the GradientBoostingClassifier
# keyword arguments read out of `config` by the trainable.
search_space = dict(
    learning_rate=tune.uniform(0.001, 0.1),
    max_depth=tune.randint(1, 15),
    max_features=tune.uniform(0, 1),
    min_samples_leaf=tune.randint(10, 15),
    min_samples_split=tune.randint(15, 18),
    n_estimators=tune.randint(300, 500),
    subsample=tune.uniform(0.1, 0.9),
)

# Search algorithm and trial scheduler handed to `tune.TuneConfig`.
search_algo = OptunaSearch()
scheduler = ASHAScheduler(max_t=100, grace_period=10, reduction_factor=2)

# Artifact sub-directory used when the model is logged to MLflow.
artifact_path = 'model'

def train_gbc_cb(config: dict, X_train: pd.DataFrame, y_train: pd.DataFrame):
    """Ray Tune trainable: cross-validate a GradientBoostingClassifier and report metrics.

    Intended to be run with `MLflowLoggerCallback`, which receives the values
    passed to `train.report`.

    Args:
        config: Sampled hyperparameters with the keys 'learning_rate',
            'max_depth', 'max_features', 'min_samples_leaf',
            'min_samples_split', 'n_estimators' and 'subsample'.
        X_train: Feature frame.
        y_train: Single-column target frame; flattened to 1-D internally.

    Reports:
        'log_loss', 'accuracy', 'f1_score', 'recall_score' and 'precision'
        (macro-averaged where applicable) computed from 5-fold out-of-fold
        predictions.
    """
    y_true = y_train.values.ravel()

    gbc_clf = GradientBoostingClassifier(
        learning_rate=config['learning_rate'],
        max_depth=config['max_depth'],
        max_features=config['max_features'],
        min_samples_leaf=config['min_samples_leaf'],
        min_samples_split=config['min_samples_split'],
        n_estimators=config['n_estimators'],
        subsample=config['subsample'],
        random_state=42,  # same seed as the rest of the notebook
    )

    # Fixed seed: every trial is scored on the same folds, so trials are comparable.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_pred_proba = cross_val_predict(
        estimator=gbc_clf,
        X=X_train,
        y=y_true,
        cv=skf,
        method='predict_proba',
    )

    log_loss_score = log_loss(y_true=y_true, y_pred=y_pred_proba)

    # predict_proba columns follow the sorted class labels; map the argmax
    # column index back to the label instead of assuming labels are 0..k-1.
    class_labels = pd.Index(y_true).unique().sort_values().to_numpy()
    y_pred_class = class_labels[y_pred_proba.argmax(axis=1)]

    metrics = {
        'log_loss': log_loss_score,
        'accuracy': accuracy_score(y_true, y_pred_class),
        'f1_score': f1_score(y_true, y_pred_class, average='macro'),
        'recall_score': recall_score(y_true, y_pred_class, average='macro'),
        'precision': precision_score(y_true, y_pred_class, average='macro'),
    }

    # cross_val_predict only fits clones, so `gbc_clf` itself is still unfitted
    # here; fit it on the full training data so the logged model can predict.
    gbc_clf.fit(X_train, y_true)

    # Does not work on callback
    # NOTE(review): presumably because no MLflow run / tracking URI is set up
    # inside the Ray worker when only MLflowLoggerCallback is used — confirm.
    mlflow.sklearn.log_model(
        gbc_clf, artifact_path=artifact_path
    )

    train.report(
        metrics=metrics
    )


# Reserve 4 CPUs and 1 GPU per trial.
# NOTE(review): confirm the GPU reservation is needed for this trainable.
train_gbc_gpu_cb = tune.with_resources(train_gbc_cb, {'cpu':4, 'gpu':1})


def tune_to_mlflow_cb(mlflow_uri=uri):
    """
    Run the tuning job and forward trial results to MLflow via `MLflowLoggerCallback`.

    Args:
        mlflow_uri: Tracking URI handed to `MLflowLoggerCallback`.

    Returns:
        The object returned by `tuner.fit()`, so callers can inspect the trials.

    Notes:
    `MLflowLoggerCallback()`:
        - cannot create "run groups" inside an experiment
        - cannot log model
    """
    tuner = tune.Tuner(
        tune.with_parameters(trainable=train_gbc_gpu_cb, X_train=X_train, y_train=y_train),
        param_space=search_space,
        tune_config=tune.TuneConfig(
            num_samples=1, #10
            metric='log_loss', 
            mode='min',
            search_alg=search_algo,
            scheduler=scheduler),
        run_config=train.RunConfig(
            name="gradient_boosting_tuning",
            callbacks=[
                MLflowLoggerCallback(
                    tracking_uri=mlflow_uri,
                    experiment_name='DHI_Classifiers_Ray',
                    save_artifact=True,
                    #tags={"mlflow.runName":"optuna_asha"}
                )
            ]
        )
    )

    # Previously the fit result was bound to a local and dropped.
    return tuner.fit()

tune_to_mlflow_cb(mlflow_uri=uri)

0,1
Current time:,2024-11-27 20:06:45
Running for:,00:02:34.16
Memory:,12.4/15.9 GiB

Trial name,status,loc,learning_rate,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,subsample,iter,total time (s),log_loss,accuracy,f1_score
train_gbc_cb_74d9ee8f,TERMINATED,127.0.0.1:37184,0.0160963,13,0.82092,11,17,494,0.348102,1,145.537,1.03304,0.489203,0.487969


[36m(train_gbc_cb pid=37184)[0m The git executable must be specified in one of the following ways:
[36m(train_gbc_cb pid=37184)[0m     - be included in your $PATH
[36m(train_gbc_cb pid=37184)[0m     - be set via $GIT_PYTHON_GIT_EXECUTABLE
[36m(train_gbc_cb pid=37184)[0m     - explicitly set via git.refresh(<full-path-to-git-executable>)
[36m(train_gbc_cb pid=37184)[0m 
[36m(train_gbc_cb pid=37184)[0m All git commands will error until this is rectified.
[36m(train_gbc_cb pid=37184)[0m 
[36m(train_gbc_cb pid=37184)[0m This initial message can be silenced or aggravated in the future by setting the
[36m(train_gbc_cb pid=37184)[0m $GIT_PYTHON_REFRESH environment variable. Use one of the following values:
[36m(train_gbc_cb pid=37184)[0m     - quiet|q|silence|s|silent|none|n|0: for no message or exception
[36m(train_gbc_cb pid=37184)[0m     - error|e|exception|raise|r|2: for a raised exception
[36m(train_gbc_cb pid=37184)[0m 
[36m(train_gbc_cb pid=37184)[0m Example:


### `setup_mlflow()`

In [8]:
# Ranges Ray Tune samples from; the keys match the GradientBoostingClassifier
# keyword arguments read out of `config` by the trainable.
search_space = dict(
    learning_rate=tune.uniform(0.001, 0.1),
    max_depth=tune.randint(1, 15),
    max_features=tune.uniform(0, 1),
    min_samples_leaf=tune.randint(10, 15),
    min_samples_split=tune.randint(15, 18),
    n_estimators=tune.randint(300, 500),
    subsample=tune.uniform(0.1, 0.9),
)

# Search algorithm and trial scheduler handed to `tune.TuneConfig`.
search_algo = OptunaSearch()
scheduler = ASHAScheduler(max_t=100, grace_period=10, reduction_factor=2)

# Artifact sub-directory for the logged model. The variable name is misspelled,
# but `train_gbc_sm` reads it under exactly this spelling.
aritifact_path = "model"
def train_gbc_sm(config: dict, X_train: pd.DataFrame, y_train: pd.DataFrame):
    """
    Ray Tune trainable that logs params, metrics and the model through `setup_mlflow()`.

    Args:
        config: Sampled hyperparameters with the keys 'learning_rate',
            'max_depth', 'max_features', 'min_samples_leaf',
            'min_samples_split', 'n_estimators' and 'subsample'.
        X_train: Feature frame.
        y_train: Single-column target frame; flattened to 1-D internally.

    Reports:
        'log_loss', 'accuracy', 'f1_score', 'recall_score' and 'precision'
        computed from 5-fold out-of-fold predictions.

    Notes:
    `setup_mlflow()`:
        - uploading to an existing experiment works if initially created with `setup_mlflow`
        - creating "run groups" inside an experiment does not work
    """
    # Point the worker at the tracking server and experiment up front, so the
    # run that setup_mlflow opens lives on the server rather than in whatever
    # default store the worker process would otherwise use. `config` is passed
    # so that setup_mlflow records it as the run's parameters.
    setup_mlflow(
        config,
        tracking_uri=uri,
        experiment_name='DHI_Classifiers_Ray_sm1',
    )

    y_true = y_train.values.ravel()

    gbc_clf = GradientBoostingClassifier(
        learning_rate=config['learning_rate'],
        max_depth=config['max_depth'],
        max_features=config['max_features'],
        min_samples_leaf=config['min_samples_leaf'],
        min_samples_split=config['min_samples_split'],
        n_estimators=config['n_estimators'],
        subsample=config['subsample'],
        random_state=42,  # same seed as the rest of the notebook
    )

    # Fixed seed: every trial is scored on the same folds, so trials are comparable.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_pred_proba = cross_val_predict(
        estimator=gbc_clf,
        X=X_train,
        y=y_true,
        cv=skf,
        method='predict_proba',
    )

    log_loss_score = log_loss(y_true=y_true, y_pred=y_pred_proba)

    # predict_proba columns follow the sorted class labels; map the argmax
    # column index back to the label instead of assuming labels are 0..k-1.
    class_labels = pd.Index(y_true).unique().sort_values().to_numpy()
    y_pred_class = class_labels[y_pred_proba.argmax(axis=1)]

    metrics = {
        'log_loss': log_loss_score,
        'accuracy': accuracy_score(y_true, y_pred_class),
        'f1_score': f1_score(y_true, y_pred_class, average='macro'),
        'recall_score': recall_score(y_true, y_pred_class, average='macro'),
        'precision': precision_score(y_true, y_pred_class, average='macro'),
    }

    mlflow.log_metrics(metrics)

    # cross_val_predict only fits clones, so `gbc_clf` itself is still unfitted
    # here; fit it on the full training data so the logged model can predict.
    gbc_clf.fit(X_train, y_true)
    mlflow.sklearn.log_model(gbc_clf, artifact_path=aritifact_path)

    train.report(
        metrics=metrics
    )


# Reserve 4 CPUs and 1 GPU per trial.
# NOTE(review): confirm the GPU reservation is needed for this trainable.
train_gbc_gpu_sm = tune.with_resources(train_gbc_sm, {'cpu':4, 'gpu':1})


def tune_to_mlflow_sm(mlflow_uri=uri):
    """Run the tuning job whose trainable logs to MLflow via `setup_mlflow()`.

    Selects the 'DHI_Classifiers_Ray_sm1' experiment on the driver before the
    trials start.

    Args:
        mlflow_uri: MLflow tracking URI set on the driver process.

    Returns:
        The object returned by `tuner.fit()`. Previously nothing was returned,
        so `res2 = tune_to_mlflow_sm(...)` was always `None`.
    """
    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name='DHI_Classifiers_Ray_sm1')

    tuner = tune.Tuner(
        tune.with_parameters(trainable=train_gbc_gpu_sm, X_train=X_train, y_train=y_train),
        param_space=search_space,
        tune_config=tune.TuneConfig(
            num_samples=1, #10
            metric='log_loss', 
            mode='min',
            search_alg=search_algo,
            scheduler=scheduler),
        run_config=train.RunConfig(
            name="gradient_boosting_tuning",
        )
    )

    return tuner.fit()

res2 = tune_to_mlflow_sm(mlflow_uri=uri)

2024-11-27 20:08:01,357	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/JCA/ray_results/gradient_boosting_tuning' in 0.0150s.
2024-11-27 20:08:01,375	INFO tune.py:1041 -- Total run time: 75.49 seconds (75.44 seconds for the tuning loop).


[36m(train_gbc_sm pid=34368)[0m The git executable must be specified in one of the following ways:
[36m(train_gbc_sm pid=34368)[0m     - be included in your $PATH
[36m(train_gbc_sm pid=34368)[0m     - be set via $GIT_PYTHON_GIT_EXECUTABLE
[36m(train_gbc_sm pid=34368)[0m     - explicitly set via git.refresh(<full-path-to-git-executable>)
[36m(train_gbc_sm pid=34368)[0m 
[36m(train_gbc_sm pid=34368)[0m All git commands will error until this is rectified.
[36m(train_gbc_sm pid=34368)[0m 
[36m(train_gbc_sm pid=34368)[0m This initial message can be silenced or aggravated in the future by setting the
[36m(train_gbc_sm pid=34368)[0m $GIT_PYTHON_REFRESH environment variable. Use one of the following values:
[36m(train_gbc_sm pid=34368)[0m     - quiet|q|silence|s|silent|none|n|0: for no message or exception
[36m(train_gbc_sm pid=34368)[0m     - error|e|exception|raise|r|2: for a raised exception
[36m(train_gbc_sm pid=34368)[0m 
[36m(train_gbc_sm pid=34368)[0m Example:


### `mlflow.start_run()`

In [9]:
# Resolve (or create) the experiment once on the driver; `experiment_id` is
# read again inside `train_gbc_mlflow` when it opens its run.
experiment_id = get_or_create_experiment("DHI_CLF_ray_opt")
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='mlflow-artifacts:/328477487984719548', creation_time=1732709281478, experiment_id='328477487984719548', last_update_time=1732709281478, lifecycle_stage='active', name='DHI_CLF_ray_opt', tags={}>

In [10]:
# Ranges Ray Tune samples from; the keys match the GradientBoostingClassifier
# keyword arguments read out of `config` by the trainable.
search_space = dict(
    learning_rate=tune.uniform(0.001, 0.1),
    max_depth=tune.randint(1, 15),
    max_features=tune.uniform(0, 1),
    min_samples_leaf=tune.randint(10, 15),
    min_samples_split=tune.randint(15, 18),
    n_estimators=tune.randint(300, 500),
    subsample=tune.uniform(0.1, 0.9),
)

# Search algorithm and trial scheduler handed to `tune.TuneConfig`.
search_algo = OptunaSearch()
scheduler = ASHAScheduler(max_t=100, grace_period=10, reduction_factor=2)

# Artifact sub-directory for the logged model. The variable name is misspelled,
# but `train_gbc_mlflow` reads it under exactly this spelling.
aritifact_path = "model"

def train_gbc_mlflow(config: dict, X_train: pd.DataFrame, y_train: pd.DataFrame):
    """
    Ray Tune trainable that logs to MLflow inside an explicit `mlflow.start_run`.

    Args:
        config: Sampled hyperparameters with the keys 'learning_rate',
            'max_depth', 'max_features', 'min_samples_leaf',
            'min_samples_split', 'n_estimators' and 'subsample'.
        X_train: Feature frame; its first row is logged as the model's input example.
        y_train: Single-column target frame; flattened to 1-D internally.

    Returns:
        The cross-validated log loss.

    Notes:
    `mlflow.start_run`
    """
    # Set the tracking URI here as well (same pattern as `train_gbc_sm`).
    # NOTE(review): assumes the URI set on the driver is not visible in the
    # Ray worker process — confirm.
    mlflow.set_tracking_uri(uri)

    with mlflow.start_run(experiment_id=experiment_id, run_name='test_01'):
        y_true = y_train.values.ravel()

        gbc_clf = GradientBoostingClassifier(
            learning_rate=config['learning_rate'],
            max_depth=config['max_depth'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            min_samples_split=config['min_samples_split'],
            n_estimators=config['n_estimators'],
            subsample=config['subsample'],
            random_state=42,  # same seed as the rest of the notebook
        )

        # Fixed seed: every trial is scored on the same folds, so trials are comparable.
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        y_pred_proba = cross_val_predict(
            estimator=gbc_clf,
            X=X_train,
            y=y_true,
            cv=skf,
            method='predict_proba',
        )

        log_loss_score = log_loss(y_true=y_true, y_pred=y_pred_proba)

        # predict_proba columns follow the sorted class labels; map the argmax
        # column index back to the label instead of assuming labels are 0..k-1.
        class_labels = pd.Index(y_true).unique().sort_values().to_numpy()
        y_pred_class = class_labels[y_pred_proba.argmax(axis=1)]

        metrics = {
            'log_loss': log_loss_score,
            'accuracy': accuracy_score(y_true, y_pred_class),
            'f1_score': f1_score(y_true, y_pred_class, average='macro'),
            'recall_score': recall_score(y_true, y_pred_class, average='macro'),
            'precision': precision_score(y_true, y_pred_class, average='macro'),
        }

        mlflow.log_params(config)
        mlflow.log_metrics(metrics)

        # cross_val_predict only fits clones, so `gbc_clf` itself is still
        # unfitted here; fit it on the full training data before logging.
        gbc_clf.fit(X_train, y_true)
        mlflow.sklearn.log_model(gbc_clf,
                                 artifact_path=aritifact_path,
                                 input_example=X_train.iloc[[0]])

        train.report(
            metrics=metrics
        )

        # NOTE(review): Ray Tune may treat a value returned by a function
        # trainable as one more reported result; confirm this return is wanted
        # in addition to `train.report` above.
        return log_loss_score
    
# Reserve 4 CPUs and 1 GPU per trial.
# NOTE(review): confirm the GPU reservation is needed for this trainable.
train_gbc_gpu_mlflow = tune.with_resources(train_gbc_mlflow, {'cpu':4, 'gpu':1})


def tune_to_mlflow_mlflow(mlflow_uri=uri):
    """Run the tuning job whose trainable logs to MLflow via `mlflow.start_run`.

    Args:
        mlflow_uri: MLflow tracking URI set on the driver process. It was
            previously accepted but never used.

    Returns:
        The object returned by `tuner.fit()`, so callers can inspect the trials.
    """
    mlflow.set_tracking_uri(mlflow_uri)

    tuner = tune.Tuner(
        tune.with_parameters(trainable=train_gbc_gpu_mlflow, X_train=X_train, y_train=y_train),
        param_space=search_space,
        tune_config=tune.TuneConfig(
            num_samples=1, #10
            metric='log_loss', 
            mode='min',
            search_alg=search_algo,
            scheduler=scheduler),
        run_config=train.RunConfig(
            name="gradient_boosting_tuning",
        )
    )

    return tuner.fit()

#tune_to_mlflow_mlflow(uri)