In [0]:
%pip install --upgrade --quiet mlflow xgboost
%restart_python

## Config

In [0]:
import mlflow
from mlflow.models import ModelConfig
# Load the notebook's settings through MLflow's ModelConfig, pointing it at
# the local config.yaml file.
config = ModelConfig(development_config='config.yaml')

# A little Python syntactic sugar: read config keys as attributes (cfg.key)
class DotDict(dict):
    """Dictionary whose keys can also be read as attributes.

    ``d.key`` returns ``d["key"]``. Nested plain dictionaries are wrapped in
    ``DotDict`` on access, so lookups can be chained (``d.outer.inner``).

    Attribute access only falls back to key lookup when normal attribute
    resolution fails, so keys that collide with ``dict`` method names
    (``items``, ``keys``, ...) must still be read with ``d["items"]``.
    """

    def __getattr__(self, name):
        """Return the value stored under ``name``.

        Args:
            name: Key to look up.

        Returns:
            The stored value; a ``dict`` value is returned wrapped in a new
            ``DotDict``.

        Raises:
            AttributeError: If ``name`` is not a key. Raising AttributeError
                (rather than letting KeyError escape) keeps ``hasattr()`` and
                ``getattr(obj, name, default)`` working as expected.
        """
        try:
            value = self[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        if isinstance(value, dict):
            return DotDict(value)
        return value
      
# Wrap the config dictionary so settings can be read as attributes
# (e.g. dconf.catalog, dconf.schema, dconf.table).
dconf = DotDict(config.to_dict())

## XGBOOST
We run a manual XGBoost experiment: train a regressor on a sample of one well's data and log the run to MLflow.

In [0]:
# Point MLflow tracking at the Databricks workspace and the model registry at
# Unity Catalog ("databricks-uc").
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

# NOTE(review): the experiment path lives under one user's workspace folder;
# anyone else running this notebook will likely need to change it.
exp_name='/Users/scott.mckean@databricks.com/experiments/hydrate_prediction_xgboost'

# Make this the active experiment for the runs started below.
mlflow.set_experiment(exp_name)

In [0]:
%sql
-- Per-well summary: how many rows have a strictly positive QGL value, and how
-- many rows exist in total.
-- NOTE(review): the table name is hard-coded here, while the Python load cell
-- builds it from the notebook config; confirm both refer to the same table.
SELECT 
    well_number, 
    -- Count only rows where QGL > 0 (other rows contribute 0 to the sum)
    SUM(CASE WHEN QGL > 0 THEN 1 ELSE 0 END) AS positive_qgl_count,
    COUNT(*) AS total_row_count
FROM shm.3w.well_data 
GROUP BY well_number
ORDER BY well_number

In [0]:
# Which well to model, and the sample cap expressed in thousands of rows
well = 15
ksamples = 10

# Resolve the fully qualified table name from the notebook config, keep only
# the chosen well's rows, then pull at most ksamples * 1000 of them to pandas.
source_table = f"{dconf.catalog}.{dconf.schema}.{dconf.table}"
well_rows = spark.table(source_table).filter(f'well_number = {well}')
well_df = well_rows.limit(ksamples * 1000).toPandas()

In [0]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from mlflow.entities import Dataset

train_cols = ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP']
target_col = 'QGL'

# Train and evaluate an XGBoost regressor for the selected well inside a
# single MLflow run. The run is named after the well so runs for different
# wells are easy to tell apart in the experiment.
with mlflow.start_run(run_name=f"well_{well}") as run:

    # Autologging records the training parameters/metrics from model.fit()
    # and registers the fitted model under the configured catalog and schema.
    mlflow.xgboost.autolog(
        registered_model_name=f"{dconf.catalog}.{dconf.schema}.xgboost",
        model_format="xgb",
    )

    # Prep Data
    mlflow.log_param('train_cols', train_cols)
    mlflow.log_param('target_col', target_col)

    # MLflow 3 dataset tracking: wrap the pandas frame as a named MLflow dataset.
    # NOTE(review): the dataset is only used as a holder for `.df` here; if it
    # should appear on the run explicitly, consider mlflow.log_input(data).
    data = mlflow.data.from_pandas(well_df, name=f"well_{well}_{ksamples}k")

    # Hold out 20% of the rows for evaluation; fixed seed keeps the split stable.
    X_train, X_test, y_train, y_test = train_test_split(
        data.df[train_cols], data.df[target_col], test_size=0.2, random_state=42
    )

    # Train
    model = xgb.XGBRegressor(objective='reg:squarederror')
    model.fit(X_train, y_train)

    # Evaluate
    preds = model.predict(X_test)
    # Take the square root of the MSE explicitly instead of passing
    # `squared=False`, which newer scikit-learn releases no longer accept.
    rmse = float(mean_squared_error(y_test, preds) ** 0.5)
    mlflow.log_metric("test_rmse", rmse)

In [0]:
# Create a deployment
from mlflow.deployments import get_deploy_client

client = get_deploy_client("databricks")

# The endpoint serves one entity: a fixed version of the registered XGBoost
# model, on a small workload that is allowed to scale down to zero.
served_entity = {
    "entity_name": f"{dconf.catalog}.{dconf.schema}.xgboost",
    "entity_version": 4,
    "workload_size": "Small",
    "scale_to_zero_enabled": True,
}

client.create_endpoint(
    name='hydrate_xgboost_2',
    config={"served_entities": [served_entity]},
)

In [0]:
# Load version 4 of the registered XGBoost model back from the model registry.
# NOTE(review): the model URI is hard-coded to shm.3w, while other cells build
# the model name from the notebook config; confirm they match. This also
# rebinds `model`, replacing the estimator trained in the cell above.
model = mlflow.xgboost.load_model('models:/shm.3w.xgboost/4')
