In [0]:
import mlflow

## Config

In [0]:
from mlflow.models import ModelConfig
# Build the model configuration, passing 'config.yaml' as the development config file.
config = ModelConfig(development_config='config.yaml')

# A little python sugar syntax
class DotDict(dict):
    """Dictionary whose keys can also be read as attributes (``d.key``).

    Nested dictionaries are wrapped in a new ``DotDict`` on access, so lookups
    can be chained (``d.section.option``). Only reads are supported through
    attribute syntax; real ``dict`` attributes such as ``items`` or ``keys``
    always take precedence over keys with the same name, because
    ``__getattr__`` is consulted only after normal attribute lookup fails.
    """

    def __getattr__(self, name):
        """Return ``self[name]``, wrapping a nested dict in ``DotDict``.

        Raises:
            AttributeError: if ``name`` is not a key. Raising AttributeError
                (rather than letting KeyError escape) is what Python's
                attribute protocol expects, so ``hasattr()`` and
                ``getattr(obj, name, default)`` work correctly.
        """
        try:
            value = self[name]
        except KeyError:
            raise AttributeError(name) from None
        if isinstance(value, dict):
            return DotDict(value)
        return value
      
# Attribute-style view of the configuration dictionary (read below as dconf.catalog, dconf.schema, dconf.table).
dconf = DotDict(config.to_dict())

## Data


We will use the QGL column as our continuous target, and the state as our classification target. 50% of the QGL observations are non-zero, so it is a decent, if not hard-to-predict, target.

In [0]:
%sql
-- Count the rows whose QGL value is positive.
-- The schema name `3w` starts with a digit, so it is backtick-quoted.
SELECT count(*) FROM shm.`3w`.well_data WHERE QGL > 0

In [0]:
# Collect the distinct well numbers on the driver as a plain Python list.
# Rows are gathered with DataFrame.collect() rather than through the RDD API,
# and the query is sorted so that positional slices of this list
# (e.g. well_numbers[5:6]) pick the same wells on every run.
well_numbers = [
    row[0]
    for row in spark.sql(
        "SELECT DISTINCT well_number FROM shm.`3w`.well_data ORDER BY well_number"
    ).collect()
]
well_numbers[0:3]

In [0]:
# Show up to ten rows for well 3 from the table named in the configuration.
source_table = f"{dconf.catalog}.{dconf.schema}.{dconf.table}"
well_preview = spark.table(source_table).filter('well_number = 3').limit(10)
well_preview.display()

## AutoML

In [0]:
from databricks import automl

In [0]:
# Point MLflow at the "databricks-uc" model registry (models are registered
# below under catalog.schema.name) and at the "databricks" tracking server.
mlflow.set_registry_uri("databricks-uc")
mlflow.set_tracking_uri("databricks")

In [0]:
# Workspace path of the MLflow experiment used by this notebook.
# NOTE(review): the path is tied to one user's home folder; consider moving it
# into config.yaml so the notebook runs for other users.
exp_name='/Users/scott.mckean@databricks.com/hydrate_automl'

# set_experiment() activates the experiment, creating it first when it does not
# exist, and returns the Experiment object, so no separate lookup or
# create_experiment() call is needed.
experiment = mlflow.set_experiment(exp_name)
experiment_id = experiment.experiment_id

In [0]:
# Close the currently active MLflow run, if any, before the AutoML cell below runs.
mlflow.end_run()

In [0]:
import os
import pickle
import tempfile

# The source table is the same for every well, so resolve it once.
source_df = spark.table(f"{dconf.catalog}.{dconf.schema}.{dconf.table}")

# Train one AutoML regression per selected well and register its best model.
for well_id in well_numbers[5:6]:
    # Select the rows of the well being processed, so the registered model
    # name (well_<id>) matches the data the model was trained on.
    well_df = (
      source_df
      .filter(source_df["well_number"] == well_id)
      .limit(10000)
    )

    summary = automl.regress(
      dataset=well_df,
      target_col='QGL',
      primary_metric='mse',
      exclude_cols=['state'],  # a list of column names, not a bare string
      experiment_dir=exp_name+f"_well_{well_id}",
      exclude_frameworks=['lightgbm','xgboost'],
      time_col='timestamp',
      timeout_minutes=5
    )

    best_model_uri = summary.best_trial.model_path

    # Log inside an explicit run that is closed when the block exits. Without
    # it the first log call implicitly opens a run that stays active, and the
    # next well's "best_model_uri" param would collide with the previous one.
    with mlflow.start_run(run_name=f"well_{well_id}"):
        # Write the pickle to a temporary directory so the working directory
        # is not littered; the artifact is still logged as 'summary.pkl'.
        with tempfile.TemporaryDirectory() as tmp_dir:
            summary_path = os.path.join(tmp_dir, 'summary.pkl')
            with open(summary_path, 'wb') as f:
                pickle.dump(summary, f)
            mlflow.log_artifact(summary_path)

        mlflow.log_param("best_model_uri", best_model_uri)

    # Register the best trial's model under catalog.schema.well_<id>.
    unity_model_name = f"{dconf.catalog}.{dconf.schema}.well_{well_id}"

    mlflow.register_model(best_model_uri, unity_model_name)

In [0]:
from databricks import automl