In [0]:
#%run ../utils/nginx_commons

In [0]:
#%run ../data/nginx_data_transform

In [0]:
#%run ../EDE/nginx_write_to_ede

In [0]:
import os.path
import h2o

In [0]:
class NGINXTrainingPipeline():
  """
  Training pipeline for the NGINX purchase model (H2O XGBoost tracked in MLflow).

  What `train_and_tune` does:
    * Hyper parameter tune with hyperopt (one nested MLflow child run per evaluation)
    * Refit a model using the best parameters found
    * Save the trained model in MLflow
    * Optionally (fs_update=True) persist modeling data, SHAP contributions,
      permutation importance and variable importance to the feature store / EDE

  Environment behaviour:
    In dev: model is saved in experiment
    In qa: model is escalated to Staging, which will be evaluated in evaluation_pipeline
    In prod: TBD

  NOTE(review): names such as spark, F, date, time, mlflow, pysh, H2OContext,
  H2OXGBoostEstimator, FeatureStoreClient, EDEPipeline, type_to_type, unify_col_type,
  hyperopt helpers (hp, fmin, tpe, Trials, space_eval, STATUS_OK), infer_signature,
  MlflowClient and ModelVersionStatus are not imported in this cell; presumably they
  are brought into scope by the notebooks %run at the top of this file - confirm.
  """

  VALID_ENVS = ('dev', 'qa', 'prod')

  def __init__(self
               , model_name
               , experiment_id
               , env="dev"
               , dv="next_qtr_nginx_purchase"
               , primary_keys=["calendar_date","account_id"]
               , model_path="/dbfs/mlops_nginx"
               , fs_update=True
               , database_prefix="mlops_nginx_example_"
               , training_shap_individual_contributions_table_name="nginx_training_shap_individual_contributions_data"
               , validation_shap_individual_contributions_table_name="nginx_validation_shap_individual_contributions_data"
               , training_permutation_importance_table_name="nginx_training_permutation_importance_data"
               , validation_permutation_importance_table_name="nginx_validation_permutation_importance_data"
               , training_variable_importance_table_name="nginx_training_variable_importance_data"
               , train_data_table_name="nginx_train_modeling_data"
               , validation_data_table_name="nginx_validation_modeling_data"
              ):
    """
    Store the pipeline configuration and build the EDE writer.

    Input:
      model_name: name used when registering the model in the MLflow model registry
      experiment_id: MLflow experiment the runs are logged to
      env: one of 'dev', 'qa', 'prod'
      dv: name of the dependent variable (target) column
      primary_keys: key columns excluded from the predictors and used as feature store keys
      model_path: directory h2o.save_model writes the fitted model to
      fs_update: when True, train_and_tune also updates the feature store / EDE tables
      database_prefix: database name prefix; the database used is database_prefix + env
      *_table_name: unqualified names of the feature store tables
    Raises:
      ValueError: when env is not one of 'dev', 'qa', 'prod'
    """
    # Fail fast: an unknown env used to leave `ede_schema` unset and blow up with an
    # AttributeError on the EDEPipeline line below.
    if env not in self.VALID_ENVS:
      raise ValueError(f"env must be one of {self.VALID_ENVS}, got {env!r}")

    self.model_name = model_name
    self.experiment_id = experiment_id
    self.env = env    # dev, qa, and prod
    self.dv = dv
    # copy so the shared default list (or a caller's list) is never aliased by the instance
    self.primary_keys = list(primary_keys)
    self.model_path = model_path

    # vars related to feature store
    self.database_prefix = database_prefix
    self.fs_update = fs_update
    self.training_shap_individual_contributions_table_name = training_shap_individual_contributions_table_name
    self.validation_shap_individual_contributions_table_name = validation_shap_individual_contributions_table_name
    self.training_permutation_importance_table_name = training_permutation_importance_table_name
    self.validation_permutation_importance_table_name = validation_permutation_importance_table_name
    self.training_variable_importance_table_name = training_variable_importance_table_name
    self.train_data_table_name = train_data_table_name
    self.validation_data_table_name = validation_data_table_name

    # vars related to ede
    if env == 'prod':
      self.ede_schema = "DATA_SCIENCE"
    else:
      self.ede_schema = f"{self.env}_DATA_SCIENCE"
    self.ede_pipeline = EDEPipeline(ede_schema = self.ede_schema)

    # Today acts as an additional primary key for every feature store write. It is set
    # unconditionally so the update_* methods also work when they are called directly
    # on an instance created with fs_update=False.
    self.effective_date = str(date.today())

  # ------------------------------------------------------------------ helpers
  def _qualified_table_name(self, table_name):
    """Return '<database_prefix><env>.<table_name>'."""
    return f"{self.database_prefix + self.env}.{table_name}"

  def _population_table_name(self, population_type, training_table_name, validation_table_name):
    """Pick the qualified table for 'training' or 'validation'; raise ValueError otherwise."""
    if population_type == 'training':
      return self._qualified_table_name(training_table_name)
    if population_type == 'validation':
      return self._qualified_table_name(validation_table_name)
    raise ValueError(f"population_type must be 'training' or 'validation', got {population_type!r}")

  def _read_latest_model_records(self, table_name, model_id):
    """Read the rows of `table_name` for `model_id` on that model's latest effective_date."""
    fs = FeatureStoreClient()
    latest_effective_date = (spark.sql(f"select max(effective_date) as latest_effective_date from {table_name} where model_id = '{model_id}'")
                             .collect()[0]["latest_effective_date"])
    return (fs
            .read_table(table_name)
            .filter(F.col("effective_date") == latest_effective_date)
            .filter(F.col("model_id") == model_id)
           )

  def _get_predictors(self, columns):
    """Predictors are all columns except the dependent variable and the primary keys."""
    return [x for x in columns if x not in self.primary_keys and x != self.dv]

  def _prefixed_metrics(self, model, df, prefix):
    """Return get_metrics(model, df) with every key renamed to '<prefix>_<key>'."""
    return {f"{prefix}_{k}": v for k, v in self.get_metrics(model=model, df=df).items()}

  # ------------------------------------------------------- feature store writes
  def update_feature_store(self, model, training_df, validation_df):
    """
    Update feature store for tables related to modeling data, individual shap contributions,
    permutation variable importance, and variable importance for training and validation populations
    Input: model object, h2o dataframes for the training and validation populations
    """
    populations = (("training", training_df), ("validation", validation_df))

    # update training / validation modeling data
    data_tables = {"training": self.train_data_table_name
                   , "validation": self.validation_data_table_name}
    for population_type, df in populations:
      self.update_train_and_validation_fs_table(model=model
                                                , table_name=data_tables[population_type]
                                                , df=df
                                                , population_type=population_type
                                               )

    # update shap for training and validation populations
    shap_tables = {"training": self.training_shap_individual_contributions_table_name
                   , "validation": self.validation_shap_individual_contributions_table_name}
    for population_type, df in populations:
      self.update_individual_contributions_fs_table(df=df
                                                    , model=model
                                                    , table_name=shap_tables[population_type]
                                                    , population_type=population_type
                                                   )

    # update permutation variable importance for training and validation populations
    permutation_tables = {"training": self.training_permutation_importance_table_name
                          , "validation": self.validation_permutation_importance_table_name}
    for population_type, df in populations:
      self.update_permutation_importance_fs_table(df=df
                                                  , model=model
                                                  , table_name=permutation_tables[population_type]
                                                  , population_type=population_type
                                                 )

    # update variable importance for training population
    self.update_training_variable_importance_fs_table(model=model
                                                      , table_name=self.training_variable_importance_table_name
                                                     )

  def update_train_and_validation_fs_table(self, model, table_name, df, population_type):
    """
    Update train and validation tables and tie to the model they're related with
    Input: model object, unqualified table name, h2o df, population type label
    Only non-validation populations are also written to EDE.
    """
    # Set h2o context
    hc = H2OContext.getOrCreate()

    # get model_id from model object
    model_id = model.actual_params['model_id']

    # turn h2o dfs to spark and tag with population / model / effective date
    df_pys = (hc.asSparkFrame(h2oFrame=df)
              .withColumn("population_type",F.lit(population_type))
              .withColumn("model_id",F.lit(model_id))
              .withColumn("effective_date",F.lit(self.effective_date))
            )
    # presumably casts tinyint columns to double - confirm against type_to_type
    df_pys = type_to_type(df_pys, 'tinyint', 'double')

    full_table_name = self._qualified_table_name(table_name)

    # write data to fs
    self.write_data_to_fs(df=df_pys
                          , is_var_imp_fs=False
                          , table_name=full_table_name
                         )

    # write data to EDE (today's records, read back from the feature store)
    if (population_type != "validation"):
      fs = FeatureStoreClient()
      self.ede_pipeline.write_to_ede(df=fs.read_table(full_table_name).filter(F.col("effective_date")==self.effective_date)
                                     , mode="overwrite"
                                     , ede_table_name=table_name
                                     , fixed_cols=['calendar_date', 'account_id','effective_date', 'population_type', 'model_id']
                                    )

  def update_training_variable_importance_fs_table(self, model, table_name):
    """
    Update variable importance contributions fs
    https://docs.h2o.ai/h2o/latest-stable/h2o-docs/variable-importance.html
    Input: model, text name of table
    """
    # Set h2o context (return value is not needed here)
    H2OContext.getOrCreate()

    # get model_id from model object
    model_id = model.actual_params['model_id']

    # extract the model's variable importance
    var_imp_pys = (spark
                   .createDataFrame(model.varimp(use_pandas=True).reset_index())
                   .withColumn("effective_date", F.lit(self.effective_date))
                   .withColumn("model_id", F.lit(model_id))
                   .withColumn("population_type", F.lit("training"))
                  )

    # write variable importance to fs
    self.write_data_to_fs(df=var_imp_pys
                          , is_var_imp_fs=True
                          , table_name=self._qualified_table_name(table_name)
                         )

    # write variable importance to EDE
    self.ede_pipeline.write_to_ede(df=var_imp_pys
                                   , mode="overwrite"
                                   , ede_table_name=table_name
                                  )

  def update_permutation_importance_fs_table(self, df, model, population_type, table_name):
    """
    Update permutation importance contributions fs
    Input: h2o df, model, population type to add label, text name of table
    Only non-validation populations are also written to EDE.
    """
    # Set h2o context (return value is not needed here)
    H2OContext.getOrCreate()

    # get model_id from model object
    model_id = model.actual_params['model_id']

    # calculate permutation importance
    perm_imp_pys = (pysh.safe_name(spark
                                   .createDataFrame(model.permutation_importance(df, use_pandas=True).reset_index())
                                   .withColumn("effective_date", F.lit(self.effective_date))
                                   .withColumn("model_id", F.lit(model_id))
                                   .withColumn("population_type", F.lit(population_type))
                                  )
                   )

    # write permutation feature importance to fs
    self.write_data_to_fs(df=perm_imp_pys
                          , is_var_imp_fs=True
                          , table_name=self._qualified_table_name(table_name)
                         )

    # write permutation feature importance to EDE
    if (population_type != "validation"):
      self.ede_pipeline.write_to_ede(df=perm_imp_pys
                                     , mode="overwrite"
                                     , ede_table_name=table_name
                                    )

  def update_individual_contributions_fs_table(self, df, model, population_type, table_name):
    """
    Update individual shap contributions + bias term fs
    Input: h2o df and h2o model, text name of table, text defining population type (i.e. training, validation, etc.)
    Individual contributions are written to the feature store only, not to EDE.
    """
    # Set h2o context
    hc = H2OContext.getOrCreate()

    # get model_id from model object
    model_id = model.actual_params['model_id']

    # calculate shap contributions and relate back to the customer
    individual_contributions_h2o = df[['account_id','calendar_date']].cbind(model.predict_contributions(df))

    # convert to pyspark frame , handle datatypes , and add necessary primary key columns
    individual_contributions_pys = hc.asSparkFrame(h2oFrame=individual_contributions_h2o)
    individual_contributions_pys = (type_to_type(pysh.safe_name(individual_contributions_pys), 'tinyint', 'double')
                                    .withColumn("effective_date", F.lit(self.effective_date))
                                    .withColumn("model_id", F.lit(model_id))
                                    .withColumn("population_type", F.lit(population_type))
                                   )
    # materialise once so the feature store write does not recompute the contributions
    individual_contributions_pys.cache().count()

    try:
      # write individual contribution predictions to fs
      self.write_data_to_fs(df=individual_contributions_pys
                            , is_var_imp_fs=False
                            , table_name=self._qualified_table_name(table_name)
                           )
    finally:
      # release the cached frame even when the write fails
      individual_contributions_pys.unpersist()

  def write_data_to_fs(self, df, is_var_imp_fs, table_name="nginx_", description=''):
    """
    write df to feature store

    If the table can be read it is treated as existing: today's records are deleted
    and df is merged in. Otherwise the table is created from df.

    Input:
      df: spark dataframe to write
      is_var_imp_fs: True for (permutation) variable importance tables, which are keyed by
                     effective_date / population_type / variable instead of the primary keys
      table_name: fully qualified feature store table name
      description: table description used on creation; defaults to 'nginx data' when empty
    """
    # create database if not existing
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {self.database_prefix+self.env}")

    print("write data to table " + table_name)
    fs = FeatureStoreClient()

    # Only the read decides between "merge" and "create". Errors raised by the delete or
    # the merge itself now propagate instead of being swallowed and turned into a
    # create_table call on a table that already exists.
    try:
      print("Reading in table " + table_name)
      df_fs = fs.read_table(table_name)
    except Exception as read_error:
      print(f"Could not read {table_name}: {read_error}")
      df_fs = None

    if df_fs is not None:
      df_merge = unify_col_type(df_fs, df)  # unify col type before merge
      spark.sql(f"delete from {table_name} where effective_date = '{self.effective_date}'")
      print("Remove existing records for date:", self.effective_date)
      # when the table exists, merge it
      fs.write_table(
        name=table_name,
        df=df_merge,
        mode="merge")
    else:
      print("creating table")
      # keep 1 set of records for each effective_date, "model_id" is deliberately not a key
      if is_var_imp_fs:
        table_keys = ['effective_date', 'population_type','variable']
      else:
        table_keys = self.primary_keys + ['effective_date', 'population_type']
      fs.create_table(
        name=table_name,
        primary_keys=table_keys,
        df = df,
        description=description or 'nginx data')
    print(table_name + " updated successfully")

  # -------------------------------------------------------- feature store reads
  def get_shap_individual_contribution_data(self, model_id, population_type="training", primary_keys=['calendar_date','account_id']):
    """
    read shap individual contribution data from fs for the latest effective_date of model_id
    inputs: population_type should be 'validation' or 'training'; primary_keys is currently unused
    raises: ValueError for any other population_type
    """
    table_name = self._population_table_name(population_type
                                             , self.training_shap_individual_contributions_table_name
                                             , self.validation_shap_individual_contributions_table_name
                                            )
    return self._read_latest_model_records(table_name, model_id)

  def get_training_var_importance_data(self, model_id):
    """
    read var importance data from fs for the latest effective_date of model_id
    inputs: model_id relates to the model path
    """
    table_name = self._qualified_table_name(self.training_variable_importance_table_name)
    return self._read_latest_model_records(table_name, model_id)

  def get_permutation_var_importance_data(self, model_id, population_type="training"):
    """
    read permutation importance data from fs for the latest effective_date of model_id
    inputs: population_type should be 'validation' or 'training', and model_id relates to the model path
    raises: ValueError for any other population_type
    """
    table_name = self._population_table_name(population_type
                                             , self.training_permutation_importance_table_name
                                             , self.validation_permutation_importance_table_name
                                            )
    return self._read_latest_model_records(table_name, model_id)

  # ------------------------------------------------------------------ training
  def train(self, training_df, validation_df, params):
    """
    Train using train validation split using params provided
    Input: h2o training and validation dfs, dictionary params
    Output: fitted H2OXGBoostEstimator
    """
    # isolate predictors
    predictors = self._get_predictors(training_df.columns)

    # set h2o context (return value is not needed here)
    H2OContext.getOrCreate()

    # train
    model = H2OXGBoostEstimator(seed=123
                                , **params
                               )
    model.train(x=predictors
                , y=self.dv
                , training_frame = training_df
                , validation_frame = validation_df
                , verbose=True
               )

    return model

  def get_metrics(self, model, df):
    """
    Get aucpr, auc, F1, F2, and MCC evaluation based on model and dataset
    Output: dict with keys 'aucpr', 'auc', 'F1', 'F2', 'mcc'
    """
    performance = model.model_performance(df)

    # H2O Performance Documentation: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html
    ### Metrics are chosen to be robust to imbalanced data, aka they do not reward the relative size of True Negatives
    ##### MCC - evaluates based on predicted class, good for imbalance data
    ##### F2 - evaluates based on predicted class, good for giving more weight on recall
    ##### AUCPR - good for imbalance data
    # F1 / F2 / mcc return a list of [threshold, value] pairs; [0][1] takes the value of the first pair
    performance_dict = {'aucpr':performance.aucpr()
                        , 'auc':performance.auc()
                        , 'F1':performance.F1()[0][1] # weighted harmonic mean of precision & recall -- equal weight
                        , 'F2':performance.F2()[0][1] # weighted harmonic mean of precision & recall -- more weight to recall , penalize false negative higher than false positive
                        , 'mcc':performance.mcc()[0][1]
                       }
    return performance_dict

  def train_and_tune(self, training_X_df, training_y_df, validation_X_df, validation_y_df, max_evals=7):
    """
    Train with parameter tuning, log the best model to MLflow and (in qa) register it

    Input:
      training_X_df / training_y_df: spark dfs with training features / target, joined on calendar_date and account_id
      validation_X_df / validation_y_df: spark dfs with validation features / target, joined the same way
      max_evals: number of hyperopt evaluations (kept small by default for testing purposes)
    """
    # Set h2o context
    hc = H2OContext.getOrCreate()

    # train & validation split
    join_keys = ['calendar_date','account_id']
    training_df = training_X_df.join(other=training_y_df, on=join_keys, how='inner')
    validation_df = validation_X_df.join(other=validation_y_df, on=join_keys, how='inner')

    # define predictors as all columns in dataframe except for DV & defined primary keys
    predictors = self._get_predictors(training_df.columns)
    training_df_h2o = hc.asH2OFrame(sparkFrame=training_df)
    validation_df_h2o = hc.asH2OFrame(sparkFrame=validation_df)

    def objective(params):
      """Train H2O XGBoost model using validation dataframe and parameters returning metric that should be minimized"""
      # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/xgboost.html

      # use the pipeline's experiment, not whatever global `experiment_id` happens to exist
      with mlflow.start_run(experiment_id=self.experiment_id, run_name="CHILD_RUN", nested=True) as child_run:
        params['max_depth'] = int(params['max_depth'])
        params['ntrees'] = int(params['ntrees'])
        model = H2OXGBoostEstimator(seed=123
                                    , **params
                                   )

        model.train(x=predictors
                    , y=self.dv
                    , training_frame = training_df_h2o
                    , validation_frame = validation_df_h2o
                    , verbose=True
                    )

        train_performance_dict = self._prefixed_metrics(model, training_df_h2o, "train")
        val_performance_dict = self._prefixed_metrics(model, validation_df_h2o, "validation")

        mlflow.log_metrics({**val_performance_dict, **train_performance_dict})
        mlflow.set_tags({'run_id': child_run.info.run_id})

      # hyperopt minimizes, so negate aucpr
      return {'loss': -val_performance_dict['validation_aucpr'], 'status': STATUS_OK}

    # define search space
    search_space = {'max_depth': hp.choice('max_depth', range(2,20,2)),
                    'ntrees': hp.choice('ntrees',range(30,100,5)),
                    'reg_lambda': hp.loguniform('reg_lambda', 0,5),
                    'reg_alpha': hp.quniform('alpha',0,10,1),
                    'learn_rate': hp.quniform('learn_rate', 0.05, 0.5 , 0.05),
                    'scale_pos_weight': hp.quniform('scale_pos_wight',5,13,.5), #useful for imbalanced data - count(neg obs) / count(pos obs) or if very imbalance then conservatively, sqrt(count(neg obs) / count(pos obs))
                    'col_sample_rate_per_tree':hp.uniform('col_sample_rate_per_tree',0.5,1)
                   }

    with mlflow.start_run(experiment_id=self.experiment_id, run_name="test", nested=True) as run:
      best = fmin(fn=objective
                  , space=search_space
                  , algo=tpe.suggest
                  , max_evals=max_evals
                  , trials=Trials()
                 )

      best_param = space_eval(search_space, best)  # best may include index, use space_eval to map it to values
      mlflow.log_params(best_param)

      best_model = self.train(training_df=training_df_h2o
                              , validation_df=validation_df_h2o
                              , params=best_param)

      # calculate and log metrics for best model for training and validation data
      train_performance_dict = self._prefixed_metrics(best_model, training_df_h2o, "train")
      val_performance_dict = self._prefixed_metrics(best_model, validation_df_h2o, "validation")
      mlflow.log_metrics({**val_performance_dict, **train_performance_dict})

      # create confusion matrices and log one artifact each for the train/validation sets
      # (the validation file used to be written from the training matrix)
      confusion_matrices = (("/tmp/nginx_training_confusion_matrix.csv", best_model.confusion_matrix(train=True))
                            , ("/tmp/nginx_validation_confusion_matrix.csv", best_model.confusion_matrix(valid=True))
                           )
      for confusion_matrix_path, confusion_matrix in confusion_matrices:
        confusion_matrix.table.as_data_frame().to_csv(confusion_matrix_path, index=False)
        mlflow.log_artifact(confusion_matrix_path)

      # create model signature and log it with model
      model_signature = infer_signature(training_df.select(predictors), training_df.select(self.dv))

      # get model path
      model_abs_path = h2o.save_model(model = best_model, path=self.model_path)
      model_rel_path = os.path.basename(model_abs_path)

      # load the model
      saved_model = h2o.load_model(model_abs_path)

      mlflow.h2o.log_model(h2o_model=saved_model, artifact_path=model_rel_path, signature=model_signature)
      model_tags = {"model_path": model_rel_path,
                    "run_id":run.info.run_id
                   }
      mlflow.set_tags(model_tags)

      # if TRUE save modeling data, shap feature contributions, permutation variable imp, and variable imp to feature store
      if self.fs_update:
        self.update_feature_store(model=best_model, training_df=training_df_h2o, validation_df=validation_df_h2o)

    # model registry
    if(self.env == "qa"):
      print("Register model")
      model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run.info.run_id, artifact_path=model_rel_path)
      model_details = mlflow.register_model(model_uri=model_uri, name=self.model_name)   # create a model version
      self.wait_until_ready(self.model_name, model_details.version)

      print("transition from None to Staging")
      client = MlflowClient()
      client.transition_model_version_stage(name=self.model_name, version=model_details.version,
                                            stage="Staging")

  def wait_until_ready(self, model_name, model_version):
    """
    Wait until the model version is ready
    Polls the registry once per second, at most 60 times; returns either way and
    prints a warning when the version never reached READY.
    """
    client = MlflowClient()
    for _ in range(60):
      model_version_details = client.get_model_version(
        name=model_name,
        version=model_version)

      status = ModelVersionStatus.from_string(model_version_details.status)
      print("Model status: %s" % ModelVersionStatus.to_string(status))
      if status == ModelVersionStatus.READY:
        break
      time.sleep(1)
    else:
      print("Model %s version %s was not READY after 60 checks" % (model_name, model_version))

In [0]:
# tables = ['mlops_nginx_example_dev.nginx_training_permutation_importance_data'
#           , 'mlops_nginx_example_dev.nginx_training_shap_individual_contributions_data'
#           , 'mlops_nginx_example_dev.nginx_training_variable_importance_data'
#           , 'mlops_nginx_example_dev.nginx_validation_permutation_importance_data'
#           , 'mlops_nginx_example_dev.nginx_validation_shap_individual_contributions_data'
#          ]


# for table in tables:
#   print(table)
#   try:
#     spark.sql(f"DELETE FROM {table}") 
#   except:
#     print("Failed delete")
#   try:
#     spark.sql(f"VACUUM {table}")
#   except:
#     print("Failed vacuum")
#   try:
#     spark.sql(f"DROP TABLE IF EXISTS {table}")
#   except:
#     print("Failed drop")
#   print()
  