In [None]:
!python --version

In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import mlflow
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

In [3]:
EXPERIMENT_NAME = os.getenv("EXPERIMENT_NAME", "maternal-health-risk")
DATA_PATH = "../data/data.csv"
MIN_AGE = int(os.getenv("MIN_AGE", 13))
MAX_AGE = int(os.getenv("MAX_AGE", 50))

In [4]:
os.environ['AWS_REGION']='eu-west-1'
os.environ['AWS_DEFAULT_REGION']='eu-west-1'
os.environ['AWS_ACCESS_KEY_ID']='admin'
os.environ['AWS_SECRET_ACCESS_KEY']='adminadmin'
os.environ['MLFLOW_S3_ENDPOINT_URL']='http://localhost:9000'

In [5]:
def load_data(filename):
    df = pd.read_csv(filename)
    return df

In [7]:
def prepare_data(df):
    # Age range limitation
    df = df[(df.Age >= MIN_AGE) & (df.Age <= MAX_AGE)]

    # Body temperature conversion from F to C
    df.BodyTemp = df.BodyTemp.apply(lambda temp: (temp - 32)*5 / 9)

    # Sort by risk
    df.sort_values(by='RiskLevel', ascending=True, inplace=True)
    
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]

    # Target variable encoding
    le = LabelEncoder()
    y = le.fit_transform(y)
    integer_mapping = {l: i for i, l in enumerate(le.classes_)}
    print(integer_mapping)
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
        
    return X_train, X_val, y_train, y_val

In [8]:
def train_model_xgboost_search(X_train, X_val, y_train, y_val):
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    mlflow.xgboost.autolog()
    
    def objective(params):
        with mlflow.start_run():
            # mlflow.set_tag("model", "xgboost")
            # mlflow.log_params(params)
            booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=100,
                evals=[(valid, 'validation')],
                early_stopping_rounds=50
            )
            y_pred = [round(x) for x in booster.predict(valid)]
            #rmse = mean_squared_error(y_val, y_pred, squared=False)
            #mlflow.log_metric("rmse", rmse)
            accuracy = accuracy_score(y_val, y_pred)
            mlflow.log_metric("accuracy", accuracy)

        return {'loss': -accuracy, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.uniform('max_depth', 1, 20)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }

    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=32,
        trials=Trials()
    )

    return

In [9]:
def train_model_sklearn_search(X_train, X_val, y_train, y_val):
    mlflow.sklearn.autolog()

    def objective(params):
        with mlflow.start_run():
            classifier_type = params['type']
            del params['type']
            if classifier_type == 'svm':
                clf = make_pipeline(
                    StandardScaler(),
                    SVC(**params)
                )
            elif classifier_type == 'rf':
                clf = make_pipeline(
                    StandardScaler(),
                    RandomForestClassifier(**params)
                )
            
            clf.fit(X_train, y_train)
            #y_pred = clf.predict(X_val)
            #rmse = mean_squared_error(y_val, y_pred, squared=False)
            #mlflow.log_metric("rmse", rmse)

            #accuracy = cross_val_score(clf, X_train, y_train).mean()
            accuracy = clf.score(X_val, y_val)
            mlflow.log_metric("accuracy", accuracy)

            # Because fmin() tries to minimize the objective, this function must return the negative accuracy. 
            return {'loss': -accuracy, 'status': STATUS_OK}

        
    search_space = hp.choice('classifier_type', [
        {
            'type': 'svm',
            'C': hp.uniform('SVM_C', 0.5, 15),
            'gamma': hp.uniform('SVM_gamma', 0.05, 15),
            'kernel': hp.choice('kernel', ['linear', 'rbf'])
        },
        {
            'type': 'rf',
            'max_depth': scope.int(hp.uniform('max_depth', 2, 5)),
            'criterion': hp.choice('criterion', ['gini', 'entropy'])
        },
    ])

    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=100,
        trials=Trials()
    )

    return

In [10]:
def register_best_model():
    client = MlflowClient()
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.accuracy DESC"]
    )[0]
    # register the best model
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    model_accuracy = round(best_run.data.metrics['accuracy']*100)
    model_details = mlflow.register_model(model_uri=model_uri, name=EXPERIMENT_NAME)
    client.update_registered_model(
      name=model_details.name,
      description=f"Current accuracy: {model_accuracy}%"
    )

In [14]:
#mlflow.set_tracking_uri("mysql+pymysql://user:password@localhost:3306/db")
mlflow.set_tracking_uri("http://localhost:5000")

mlflow.set_experiment(EXPERIMENT_NAME)

data = load_data(DATA_PATH)
X_train, X_val, y_train, y_val = prepare_data(data)

{'high risk': 0, 'low risk': 1, 'mid risk': 2}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.BodyTemp = df.BodyTemp.apply(lambda temp: (temp - 32)*5 / 9)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(by='RiskLevel', ascending=True, inplace=True)


In [26]:
train_model_xgboost_search(X_train, X_val, y_train, y_val)
train_model_sklearn_search(X_train, X_val, y_train, y_val)

  0%|          | 0/32 [00:00<?, ?trial/s, best loss=?]

In [11]:
register_best_model()

Successfully registered model 'maternal-health-risk'.
2022/08/17 12:49:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: maternal-health-risk, version 1
Created version '1' of model 'maternal-health-risk'.


In [10]:
client = MlflowClient()
client.list_registered_models()

[<RegisteredModel: creation_timestamp=1660758751200, description='', last_updated_timestamp=1660758751237, latest_versions=[<ModelVersion: creation_timestamp=1660758751237, current_stage='None', description='', last_updated_timestamp=1660758751237, name='maternal-health-risk', run_id='c83b35c98b1841bfad61f1d2da7fd57b', run_link='', source='s3://maternal-health-risk/1/c83b35c98b1841bfad61f1d2da7fd57b/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>], name='maternal-health-risk', tags={}>]

In [11]:
logged_model = f"models:/{EXPERIMENT_NAME}/latest"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
test_data = [{
    'Age': 35, 
    'SystolicBP': 140, 
    'DiastolicBP': 70, 
    'BS': 5.0, 
    'BodyTemp': 36.8, 
    'HeartRate': 60
}]
predicted = [round(x) for x in loaded_model.predict(pd.DataFrame(test_data))]
predicted 

[0]

In [12]:
import pickle
loaded_model.predict(pd.DataFrame(test_data))
pickle.dump(loaded_model, open('./model.bin', "wb"))