# Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Load the raw dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact file exists.
df = pd.read_csv(
    filepath_or_buffer=r'D:\Notes\git local repo\Diabetes Health Indicator\data\diabetesHI\diabetes_012_health_indicators_BRFSS2015.csv',
)

def sampler_pipeline(df:pd.DataFrame, sampler:str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split ``df`` into features/target and optionally rebalance the classes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the target column ``'Diabetes_012'``; every other
        column is treated as a feature.
    sampler : str
        One of ``'No resampling (Original Data)'``,
        ``'imblearn: RandomUnderSampler'`` or ``'imblearn: SMOTE'``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(X, y)`` where ``y`` is a single-column frame holding
        ``'Diabetes_012'``. For the two imblearn options these are the
        outputs of ``fit_resample``.

    Raises
    ------
    IndexError
        If ``sampler`` is not one of the supported names. (The exception type
        is kept for backward compatibility with existing callers.)
    """
    sampling_list = ['No resampling (Original Data)', 'imblearn: RandomUnderSampler', 'imblearn: SMOTE']

    # Validate first so an unsupported name fails fast, before any data is touched.
    # The message is built from the list itself so the two cannot drift apart.
    if sampler not in sampling_list:
        raise IndexError(f"Sampler is not on the pre-defined Sampling List: {sampling_list}")

    X = df.drop('Diabetes_012', axis=1)
    y = df[['Diabetes_012']]

    if sampler == sampling_list[0]:
        return X, y

    # Both resamplers share the same fit_resample interface and fixed seed.
    if sampler == sampling_list[1]:
        resampler = RandomUnderSampler(random_state=42)
    else:
        resampler = SMOTE(random_state=42)

    X_res, y_res = resampler.fit_resample(X, y)
    return X_res, y_res
    

# Rebalance the full dataset by random under-sampling, then hold out 20% for testing.
# NOTE(review): resampling happens BEFORE the split, so the test set is drawn from
# the resampled data rather than the original class distribution — confirm this is intended.
X, y = sampler_pipeline(df, 'imblearn: RandomUnderSampler')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Load Model From MLflow UI

In [2]:
import mlflow.sklearn

# Address of the MLflow tracking server; also reused as the default for tune_to_mlflow.
uri = "http://localhost:5000"
mlflow.set_tracking_uri(uri=uri)

In [3]:
# Fetch the logged model from the tracking server through the generic pyfunc flavor.
model_uri = 'runs:/1e3489dcb6224eba98e3cee11f078a0a/model_gbc_optuna_2'
gbc_dynamic = mlflow.pyfunc.load_model(model_uri=model_uri)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

 - cloudpickle (current: 2.2.1, required: cloudpickle==3.0.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [4]:
from sklearn.metrics import accuracy_score

# Score the loaded model on the held-out split; y_test is a single-column frame,
# so it is flattened to 1-D before comparison.
y_pred = gbc_dynamic.predict(X_test)
asc = accuracy_score(y_test.values.ravel(), y_pred)
print(f'Accuracy Score: {asc:.2%}')

Accuracy Score: 53.33%


# Tune Another Model Using Ray

In [5]:
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, f1_score, recall_score, precision_score
from ray.air.integrations.mlflow import MLflowLoggerCallback, setup_mlflow

In [6]:
def train_gbc(config: dict, X_train: pd.DataFrame, y_train: pd.DataFrame):
    """Ray Tune trainable: cross-validate one GradientBoostingClassifier configuration.

    Builds a classifier from the hyperparameters in ``config``, obtains
    out-of-fold class probabilities with 5-fold stratified cross-validation,
    and reports log loss, accuracy and macro-averaged F1 / recall / precision
    through ``train.report``.

    Parameters
    ----------
    config : dict
        Must contain ``learning_rate``, ``max_depth``, ``max_features``,
        ``min_samples_leaf``, ``min_samples_split``, ``n_estimators`` and
        ``subsample``. An optional ``random_state`` (default 42) seeds both
        the classifier and the fold shuffling.
    X_train : pd.DataFrame
        Training features.
    y_train : pd.DataFrame
        Training target; flattened to 1-D with ``.values.ravel()``.

    Returns
    -------
    None
        Metrics are delivered via ``train.report`` only.
    """
    # Flatten once and use the same 1-D target for fitting and for every metric.
    y_true = y_train.values.ravel()

    # A fixed seed makes each trial reproducible and scores every trial on the
    # same folds, so differences in metrics come from the hyperparameters.
    seed = config.get('random_state', 42)

    gbc_clf = GradientBoostingClassifier(
        learning_rate=config['learning_rate'],
        max_depth=config['max_depth'],
        max_features=config['max_features'],
        min_samples_leaf=config['min_samples_leaf'],
        min_samples_split=config['min_samples_split'],
        n_estimators=config['n_estimators'],
        subsample=config['subsample'],
        random_state=seed,
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    y_pred_proba = cross_val_predict(estimator=gbc_clf,
                                     X=X_train,
                                     y=y_true,
                                     cv=skf,
                                     method='predict_proba')

    # predict_proba columns follow the sorted class labels, so map the argmax
    # column index back to the actual label instead of assuming labels are 0..k-1.
    classes = pd.Series(y_true).drop_duplicates().sort_values().to_numpy()
    y_pred_class = classes[y_pred_proba.argmax(axis=1)]

    log_loss_score = log_loss(y_true=y_true, y_pred=y_pred_proba, labels=classes)

    metrics = {
            'log_loss' : log_loss_score,
            'accuracy': accuracy_score(y_true, y_pred_class),
            'f1_score': f1_score(y_true, y_pred_class, average='macro'),
            'recall_score': recall_score(y_true, y_pred_class, average='macro'),
            'precision': precision_score(y_true, y_pred_class, average='macro')
        }

    train.report(
        metrics=metrics
    )

In [16]:
# Hyperparameter ranges sampled by Ray Tune; the keys are exactly the ones
# train_gbc reads from its `config` argument.
# NOTE(review): max_features is sampled from a range starting at 0 — confirm the
# estimator accepts values at/near that bound.
search_space = dict(
    learning_rate=tune.uniform(0.001, 0.1),
    max_depth=tune.randint(1, 15),
    max_features=tune.uniform(0, 1),
    min_samples_leaf=tune.randint(10, 15),
    min_samples_split=tune.randint(15, 18),
    n_estimators=tune.randint(300, 500),
    subsample=tune.uniform(0.1, 0.9),
)

# Optuna-backed suggestion of the next configuration to try.
search_algo = OptunaSearch()

# NOTE(review): train_gbc calls train.report once per trial, so this scheduler
# may never see intermediate results to stop trials early on — confirm.
scheduler = ASHAScheduler(max_t=100, grace_period=10, reduction_factor=2)

In [17]:
# Variant of train_gbc that requests 4 CPUs and 1 GPU per trial.
# NOTE(review): tune_to_mlflow in this notebook passes train_gbc, not this wrapper,
# so the resource request is not applied there — confirm whether that is intended.
train_gbc_gpu = tune.with_resources(
    train_gbc,
    {'cpu': 4, 'gpu': 1},
)

def tune_to_mlflow(mlflow_uri=uri):
    """Run the Ray Tune search for ``train_gbc`` and log every trial to MLflow.

    Uses the notebook-level ``X_train``, ``y_train``, ``search_space``,
    ``search_algo`` and ``scheduler``. Runs 20 sampled configurations and
    minimises the ``log_loss`` metric reported by ``train_gbc``.

    Parameters
    ----------
    mlflow_uri : str, optional
        Tracking URI handed to ``MLflowLoggerCallback``. Defaults to the
        notebook-level ``uri`` as it was when this function was defined.

    Returns
    -------
    The object returned by ``tuner.fit()``, so callers can inspect the trials
    (the original discarded it and returned ``None``).
    """
    tuner = tune.Tuner(
        # with_parameters attaches the training data to the trainable.
        tune.with_parameters(trainable=train_gbc, X_train=X_train, y_train=y_train),
        param_space=search_space,
        tune_config=tune.TuneConfig(
            num_samples=20, #10
            metric='log_loss',
            mode='min',
            search_alg=search_algo,
            scheduler=scheduler),
        run_config=train.RunConfig(
            name="gradient_boosting_tuning",
            callbacks=[
                MLflowLoggerCallback(
                    tracking_uri=mlflow_uri,
                    experiment_name='DHI_Classifiers_Ray',
                    save_artifact=True,
                    #tags={"mlflow.runName":"optuna_asha"}
                )
            ]
        )
    )

    results = tuner.fit()
    return results

In [None]:
#tune_to_mlflow(mlflow_uri=uri)

2024/11/21 12:06:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run train_gbc_d3f56636 at: http://localhost:5000/#/experiments/820879946420798051/runs/012eacb8235c446f80e119e176e8aa5a.
2024/11/21 12:06:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/820879946420798051.
2024/11/21 12:08:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run train_gbc_bb82701d at: http://localhost:5000/#/experiments/820879946420798051/runs/db8667ff69404fda96c7ce8f811445e4.
2024/11/21 12:08:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/820879946420798051.
2024/11/21 12:08:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run train_gbc_dd2fd91d at: http://localhost:5000/#/experiments/820879946420798051/runs/38788704f4c043be87c2f51dfbbe7bdf.
2024/11/21 12:08:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/

In [None]:
# NOTE(review): this run/artifact URI is identical to `model_uri` used earlier in
# the notebook, so it loads the same model again — confirm the intended Ray run ID.
ray_model_uri = 'runs:/1e3489dcb6224eba98e3cee11f078a0a/model_gbc_optuna_2'
gbc_rt = mlflow.pyfunc.load_model(model_uri=ray_model_uri)
