In [0]:
%run ./feature_engineering

Out[31]: {'EdLevel': {'Primary/elementary school': 1.0,
  'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 2.0,
  'Associate degree (A.A., A.S., etc.)': 3.0,
  'Some college/university study without earning a degree': 4.0,
  'Something else, Professional degree (JD, MD, etc.)': 5.0,
  'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 6.0,
  'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 7.0,
  'Other doctoral degree (Ph.D., Ed.D., etc.)': 8.0},
 'Age1stCode': {'Younger than 5 years': 1.0,
  '5 - 10 years': 2.0,
  '11 - 17 years': 3.0,
  '18 - 24 years': 4.0,
  '25 - 34 years': 5.0,
  '35 - 44 years': 6.0,
  '45 - 54 years': 7.0,
  '55 - 64 years': 8.0,
  'Older than 64 years': 9.0},
 'OrgSize': {'Just me - I am a freelancer, sole proprietor, etc.': 1.0,
  '2 to 9 employees': 2.0,
  '10 to 19 employees': 3.0,
  '20 to 99 employees': 4.0,
  '100 to 499 employees': 5.0,
  'I don’t know': 6.0,
  '500 to 999 employees': 7.0,
  '1,000 to 4,999 employees

  hook(module)


In [0]:
from dataclasses import dataclass, field
import mlflow
import mlflow.spark

In [0]:
# Default random seed; Experiment uses it for the train/test split.
SEED: int = 42
# Name of the transformed target column: TARGET_COL prefixed with "transformed_".
# calc_metrics uses it as the default label column.
TRANSFORMED_TARGET = f"transformed_{TARGET_COL}"

In [0]:
# https://statisticsbyjim.com/regression/r-squared-invalid-nonlinear-regression/
def calc_metrics(
    *,
    df: DataFrame,
    model_name: str,
    ds_name: str = "test",
    target_col: str = TRANSFORMED_TARGET,
    pred_col: str = "prediction",
) -> Tuple[float, float, float]:
    """Print and return the RMSE-based evaluation metrics of a prediction dataframe.

    The RMSE is computed between ``target_col`` and ``pred_col``; the mean of
    ``target_col`` and the RMSE / mean ratio are reported next to it.

    :param df: A pyspark.sql.dataframe.DataFrame object holding the target and prediction columns.
    :param model_name: The model name as identifier (used in the printed header).
    :param ds_name: The dataset name as identifier: train or test.
    :param target_col: The target column's name.
    :param pred_col: The prediction column's name.
    :return: Tuple[float, float, float] - (rmse, mean of the target column, rmse / mean).
        The mean is NaN when the target column has no non-null values, and the
        ratio is NaN when the mean is zero or NaN.
    """
    # Name the metric explicitly instead of relying on the evaluator's default.
    evaluator = RegressionEvaluator(
        labelCol=target_col, predictionCol=pred_col, metricName="rmse"
    )
    _rmse = evaluator.evaluate(df)
    print(f"{model_name} - {ds_name} - evaluation metrics:\n", "=" * 100)
    print(f"RMSE on {ds_name} set: {_rmse: .1f}")

    # The aggregate comes back as None when there is nothing to average;
    # normalise it to a float so formatting and division below cannot fail.
    raw_mean = df.select(f.mean(target_col)).collect()[0][0]
    _mean = float("nan") if raw_mean is None else float(raw_mean)
    print(f"The mean value on the {ds_name} set: {_mean: .0f}")

    # A zero mean makes the ratio undefined: report NaN rather than raising ZeroDivisionError.
    rmse_mean_ratio = _rmse / _mean if _mean != 0 else float("nan")
    print(f"RMSE mean ratio on {ds_name} set: {rmse_mean_ratio: .4f}\n", "=" * 100)
    return _rmse, _mean, rmse_mean_ratio

In [0]:
@dataclass
class Experiment:
    """A class for running ML experiments.

    On construction the data is split into train and test sets and both are passed
    through ``yeojohnson_transform_on_train_test``; the lambda obtained from the
    train set is reused when transforming the test set.

    Attributes:
    df: A pyspark.sql.dataframe.DataFrame object. The dataset of the total data.
    model_name: str. The name as identifier of the model.
    predictor: ModelRegressor. The predictor model.
    target_col: str. The target variable to predict. Fixed to TRANSFORMED_TARGET (not an __init__ argument).
    test_fraction: float. The fraction of the test dataset from total data. Optional, defaults to 0.2;
        must be strictly between 0 and 1.
    seed: int. A seed number for consistency. Optional, defaults to SEED.
    train: A pyspark.sql.dataframe.DataFrame object. The train dataset of the data.
    test: A pyspark.sql.dataframe.DataFrame object. The test dataset of the data.
    transformed_train: The train dataset as returned by yeojohnson_transform_on_train_test.
    transformed_test: The test dataset as returned by yeojohnson_transform_on_train_test, using the train lambda.
    lmda: The lambda value returned by yeojohnson_transform_on_train_test for the train set.
    """

    df: DataFrame
    model_name: str
    predictor: ModelRegressor
    target_col: str = field(init=False, default=TRANSFORMED_TARGET)
    # Optional init arguments: existing three-argument calls keep working unchanged.
    test_fraction: float = 0.2
    seed: int = SEED
    train: DataFrame = field(init=False)
    test: DataFrame = field(init=False)
    transformed_train: DataFrame = field(init=False)
    transformed_test: DataFrame = field(init=False)
    lmda: float = field(init=False)

    def __post_init__(self):
        """Split the data and transform both splits.

        :raises ValueError: if test_fraction is not strictly between 0 and 1.
        """
        # Fail early with a clear message instead of handing invalid weights to randomSplit.
        if not 0.0 < self.test_fraction < 1.0:
            raise ValueError(
                f"test_fraction must be strictly between 0 and 1, got {self.test_fraction!r}"
            )
        self.train, self.test = self.df.randomSplit(
            [1.0 - self.test_fraction, self.test_fraction], seed=self.seed
        )
        # The lambda is taken from the train split only and then reused for the
        # test split, so the test data does not influence the transformation.
        self.transformed_train, self.lmda = yeojohnson_transform_on_train_test(df=self.train)
        self.transformed_test, _ = yeojohnson_transform_on_train_test(df=self.test, lmda=self.lmda)

In [0]:
%%add_to Experiment
def get_pipeline(self) -> "Pipeline":
  """Build the pipeline: the stages from get_pipe_stages followed by the predictor.

  :return: A Pipeline whose last stage is ``self.predictor``.
  """
  # Build a new list rather than extending the returned one in place, so a list
  # that get_pipe_stages may share between calls never accumulates predictors.
  stages = [*get_pipe_stages(df=self.df), self.predictor]
  return Pipeline(stages=stages)

In [0]:
%%add_to Experiment
def log_mlflow_info(self, train_df: DataFrame, test_df: DataFrame) -> None:
  """Log the split sizes, the Yeo-Johnson lambda and the train/test metrics to MLflow, printing them as well.

  :param train_df: A pyspark.sql.dataframe.DataFrame object. Predictions on the train set, passed to calc_metrics.
  :param test_df: A pyspark.sql.dataframe.DataFrame object. Predictions on the test set, passed to calc_metrics.
  :return: None - No returned value.
  """
  import logging
  # Side effect: raises the verbosity of the "mlflow" logger for the whole process.
  logging.getLogger("mlflow").setLevel(logging.DEBUG)

  mlflow.autolog(log_models=False)
  # Sizes are taken from the stored splits, not from the prediction frames.
  train_count, test_count = self.train.count(), self.test.count()
  mlflow.log_param('train_count', train_count)
  mlflow.log_param('test_count', test_count)
  print('\033[36m' + f'Train set size: {train_count}')
  print('\033[36m' + f'Test set size: {test_count}')
  # Key follows the snake_case of the other params; the former key ended with a
  # colon, which some MLflow releases reject as an invalid name.
  mlflow.log_param('yeojohnson_lambda', self.lmda)
  print(f"\nyeojohnson lambda: {self.lmda}\n", "=" * 100)
  # Calculate metrics
  train_rmse, train_mean, train_rmse_mean_ratio = calc_metrics(df=train_df, model_name=self.model_name, ds_name='train')
  test_rmse, test_mean, test_rmse_mean_ratio = calc_metrics(df=test_df, model_name=self.model_name)
  # Log metrics in a single call
  mlflow.log_metrics({
    "train_rmse": train_rmse,
    "train_mean": train_mean,
    "train_rmse_mean_ratio": train_rmse_mean_ratio,
    "test_rmse": test_rmse,
    "test_mean": test_mean,
    "test_rmse_mean_ratio": test_rmse_mean_ratio,
  })

In [0]:
%%add_to Experiment
def run_mlflow(self) -> None:
  """Fit the pipeline inside an MLflow run, log the model and metrics, and save the predictions.

  The pipeline is fitted on ``self.transformed_train``; predictions are produced for
  both the transformed train and test sets, logged through ``log_mlflow_info`` and
  written with ``save_table`` under ``S3_GOLD_PATH``.

  :return: None - No returned value.
  """
  pipeline = self.get_pipeline()
  # Autologging of models is disabled; the fitted model is logged explicitly below.
  mlflow.autolog(log_models=False)
  with mlflow.start_run():
    model = pipeline.fit(self.transformed_train)
    # predict on train set for comparing evaluation metrics between train and test sets.
    train_pred = model.transform(self.transformed_train)
    test_pred = model.transform(self.transformed_test)
    # Log the fitted model.
    mlflow.spark.log_model(spark_model=model, artifact_path=self.model_name)
    # log mlflow run info
    self.log_mlflow_info(train_df=train_pred, test_df=test_pred)
    # NOTE(review): a comment here used to announce an inverse yeo-johnson transformation
    # on the prediction column, but no such step is implemented; the predictions are
    # saved exactly as returned by model.transform.
    # Save the predictions data
    save_table(df=train_pred, file_path=f's3a://{S3_GOLD_PATH}train_{self.model_name}.parquet')
    save_table(df=test_pred, file_path=f's3a://{S3_GOLD_PATH}test_{self.model_name}.parquet')