In [0]:
%run ./plots

In [0]:
%run ./ml

Out[45]: {'EdLevel': {'Primary/elementary school': 1.0,
  'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 2.0,
  'Associate degree (A.A., A.S., etc.)': 3.0,
  'Some college/university study without earning a degree': 4.0,
  'Something else, Professional degree (JD, MD, etc.)': 5.0,
  'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 6.0,
  'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 7.0,
  'Other doctoral degree (Ph.D., Ed.D., etc.)': 8.0},
 'Age1stCode': {'Younger than 5 years': 1.0,
  '5 - 10 years': 2.0,
  '11 - 17 years': 3.0,
  '18 - 24 years': 4.0,
  '25 - 34 years': 5.0,
  '35 - 44 years': 6.0,
  '45 - 54 years': 7.0,
  '55 - 64 years': 8.0,
  'Older than 64 years': 9.0},
 'OrgSize': {'Just me - I am a freelancer, sole proprietor, etc.': 1.0,
  '2 to 9 employees': 2.0,
  '10 to 19 employees': 3.0,
  '20 to 99 employees': 4.0,
  '100 to 499 employees': 5.0,
  'I don’t know': 6.0,
  '500 to 999 employees': 7.0,
  '1,000 to 4,999 employees

  hook(module)


In [0]:
import pandas as pd



In [0]:
# Import PipelineModel only for static type checkers; at runtime fall back to
# `object` so the name still resolves in the function annotations further down.
# NOTE(review): TYPE_CHECKING is not imported in the visible cells - presumably
# it is brought in by one of the `%run` notebooks; confirm.
if TYPE_CHECKING:
    from pyspark.ml.pipeline import PipelineModel
else:
    PipelineModel = object

In [0]:
# Base names (without the ".parquet" extension) of the stored prediction sets.
# The XGB names are used below to build the parquet paths and as the
# `model_name` label passed to calc_metrics.
# NOTE(review): the RF names are not referenced in the visible cells - confirm
# they are still needed.
TRAIN_RF_FILE_NAME = "train_rf_reg"
TEST_RF_FILE_NAME = "test_rf_reg"
TRAIN_XGB_FILE_NAME = "train_xgb_reg"
TEST_XGB_FILE_NAME = "test_xgb_reg"

In [0]:
# Read the XGB train and test parquet sets from S3_GOLD_PATH and mark each
# DataFrame for caching, since both are reused by the cells below.
xgb_train_df, xgb_test_df = (
    spark.read.parquet(f"s3a://{S3_GOLD_PATH}{file_name}.parquet").cache()
    for file_name in (TRAIN_XGB_FILE_NAME, TEST_XGB_FILE_NAME)
)

In [0]:
# Keep only the raw target, the transformed target and the model prediction
# for each split; these narrow frames feed the plot and the metrics below.
train_xgb_actualAndPred = xgb_train_df.select(
    "ConvertedCompYearly",
    "transformed_ConvertedCompYearly",
    "prediction",
)
test_xgb_actualAndPred = xgb_test_df.select(
    "ConvertedCompYearly",
    "transformed_ConvertedCompYearly",
    "prediction",
)

In [0]:
# Configure the plot helper for the XGB test split: the "prediction" column is
# compared against the transformed target column.
# NOTE(review): Plot and TARGET_COL are not defined in the visible cells -
# presumably they come from the `%run` notebooks; confirm.
xgb_pred_plot = Plot(
    df=test_xgb_actualAndPred,
    target_col = f"transformed_{TARGET_COL}",
    col_name="prediction",
    title="XGBRegressor - Predictions vs Actual values - Test set",
)

In [0]:
# Draw the plot configured in the previous cell (XGB test split).
xgb_pred_plot.plot_pred_vs_actual()

In [0]:
# Evaluate the XGB predictions on the train split. Per the recorded output this
# prints RMSE, the mean and the RMSE/mean ratio, and returns them as a tuple.
calc_metrics(df=train_xgb_actualAndPred, ds_name="train", model_name=TRAIN_XGB_FILE_NAME)

train_xgb_reg - train - evaluation metrics:
RMSE on train set:  7.4
The mean value on the train set:  59
RMSE mean ratio on train set:  0.1252
Out[78]: (7.444889225587139, 59.4777996811407, 0.12517089175287321)

In [0]:
# Evaluate the XGB predictions on the test split. `ds_name` is not passed here;
# the recorded output is labelled "test", so the default is presumably "test"
# - confirm against calc_metrics.
calc_metrics(df=test_xgb_actualAndPred, model_name=TEST_XGB_FILE_NAME)

test_xgb_reg - test - evaluation metrics:
RMSE on test set:  7.5
The mean value on the test set:  60
RMSE mean ratio on test set:  0.1263
Out[77]: (7.549982160261912, 59.75562071585597, 0.12634764846913468)

In [0]:
# MLflow URI of the logged XGB pipeline artifact ("xgb_reg") for one specific run.
# NOTE(review): the run id is hard-coded, so this cell only works against the
# tracking server that holds this run; `mlflow` is not imported in the visible
# cells - presumably it comes from a `%run` notebook; confirm.
xgb_logged_pipeline = 'runs:/114d2824bb63443299cda7e86c26f346/xgb_reg'
# Load the logged Spark pipeline so its last stage can be inspected below.
xgb_loaded_pipeline = mlflow.spark.load_model(xgb_logged_pipeline)

2023/05/20 05:12:24 INFO mlflow.spark: 'runs:/114d2824bb63443299cda7e86c26f346/xgb_reg' resolved as 'dbfs:/databricks/mlflow-tracking/982942755585532/114d2824bb63443299cda7e86c26f346/artifacts/xgb_reg'
The xgboost training will use single worker and set nthread=1 (equal to `spark.task.cpus` config), If you need to increase threads number used in training, you can set `nthread` param.


In [0]:
def map_idx_feature(*, df: DataFrame) -> Dict[str, str]:
    """Map feature keys of the form "f<idx>" to the assembled feature names.

    The names are read from the ML attribute metadata (``ml_attr`` -> ``attrs``)
    attached to the ``features`` column of ``df``.

    :param df: A pyspark.sql.dataframe.DataFrame object with a ``features``
        column that carries ``ml_attr`` metadata.
    :return: Dict[str, str]. Keys are ``"f" + str(idx)``; values are the
        attribute names found in the metadata.
    :raises KeyError: if the ``features`` column has no ``ml_attr``/``attrs``
        metadata, or an attribute entry lacks ``idx`` or ``name``.
    """
    attrs = df.select("features").schema[0].metadata["ml_attr"]["attrs"]
    idx_dict = {}
    # Attribute groups are keyed by type and a group with no members may be
    # absent, so each one is looked up defensively. "nominal" is included so
    # that every slot of the vector gets a name, not only numeric/binary ones.
    for group in ("numeric", "binary", "nominal"):
        for attr in attrs.get(group, []):
            idx_dict["f" + str(attr["idx"])] = attr["name"]
    return idx_dict

In [0]:
def map_features_imp(
    *, idx_dict: Dict[str, str], pipeline: PipelineModel
) -> Dict[str, float]:
    """Translate the model's per-index importance scores into per-feature-name scores.

    :param idx_dict: Dict[str, str]. Maps the feature key (e.g. ``"f12"``) to
        the feature name.
    :param pipeline: PipelineModel. The loaded pipeline; its last stage must
        expose ``bestModel`` (presumably a tuning model such as
        CrossValidatorModel - confirm) whose ``get_feature_importances()``
        returns a mapping of feature key to score.
    :return features_imp: Dict[str, float]. Feature name as the key and its
        importance score as the value.
    :raises KeyError: if the model reports a feature key missing from ``idx_dict``.
    """
    # Read the model from the `pipeline` argument rather than from a global,
    # so the function works on a fresh kernel and for any pipeline passed in.
    xgb = pipeline.stages[-1].bestModel
    idx_imp_dict = xgb.get_feature_importances()
    return {idx_dict[idx]: score for idx, score in idx_imp_dict.items()}

In [0]:
def get_features_imp(*, df: DataFrame, pipeline: PipelineModel) -> pd.Series:
    """Return the importance of each original feature, highest first.

    The per-column importances are looked up by name, everything from
    "IndexEncoded" onward is stripped from the column name so that encoded
    columns collapse onto their source feature, and the maximum score per
    source feature is kept.

    :param df: A pyspark.sql.dataframe.DataFrame object whose ``features``
        column carries the ML attribute metadata.
    :param pipeline: PipelineModel. The loaded pipeline holding the fitted model.
    :return grouped_df: A pandas Series indexed by ``feature_name`` with the
        importance as values, sorted in descending order.
    """
    idx_feature_dict = map_idx_feature(df=df)
    # Pass the mapping built above and the `pipeline` argument; the helper's
    # keyword-only parameters are `idx_dict` and `pipeline`.
    feature_imp_dict = map_features_imp(idx_dict=idx_feature_dict, pipeline=pipeline)
    feature_imp_df = pd.DataFrame(
        data=list(feature_imp_dict.items()), columns=["feature", "importance"]
    )
    # Collapse encoded columns ("<feature>IndexEncoded...") onto "<feature>".
    feature_imp_df["feature_name"] = (
        feature_imp_df["feature"].str.split("IndexEncoded").str[0]
    )
    grouped_df = (
        feature_imp_df[["feature_name", "importance"]]
        .groupby("feature_name")["importance"]
        .max()
        .sort_values(ascending=False)
    )
    return grouped_df

In [0]:
# Importance per source feature for the loaded XGB pipeline, using the train
# set's `features` metadata for the names. The recorded output shows a pandas
# Series indexed by feature_name.
feature_imp_df = get_features_imp(df=xgb_train_df, pipeline=xgb_loaded_pipeline)
feature_imp_df

<class 'pyspark.ml.pipeline.PipelineModel'>
Out[126]: feature_name
YearsCodePro_rank                                    48.0
OrgSize_rank                                         47.0
Ethnicity                                            37.0
Employment                                           24.0
Country                                              18.0
DevType_Student                                      17.0
Total_WebframeWantToWorkWith                         13.0
Total_ToolsTechHaveWorkedWith                        13.0
OpSys                                                13.0
LanguageHaveWorkedWith_PHP                           12.0
Total_NEWCollabToolsHaveWorkedWith                   11.0
DevType_Engineering manager                          10.0
PlatformHaveWorkedWith_AWS                            9.0
Total_DatabaseWantToWorkWith                          8.0
NEWSOSites                                            6.0
ToolsTechHaveWorkedWith_Kubernetes                    6.0
Tools

In [0]:
# Number of features with an importance score; the colour list for the plot
# needs one entry per feature.
len(feature_imp_df)

Out[136]: 41

In [0]:
# One colour per bar: three highlighted tiers of 7 for the top 21 features and
# a muted grey for the rest. The list is sized from the data so it always has
# exactly one entry per feature (a fixed 7 + 7 + 7 + 21 list is 42 long, one
# more than the 41 features reported by len(feature_imp_df)).
n_imp_features = len(feature_imp_df)
top_tier_colors = ["#DC143C"] * 7 + ["#EC3257"] * 7 + ["#F05C79"] * 7
n_muted_bars = max(n_imp_features - len(top_tier_colors), 0)
colors = (top_tier_colors + ["lightslategray"] * n_muted_bars)[:n_imp_features]

In [0]:
# Plot the feature importances, colouring the bars with the `colors` list.
# NOTE(review): plot_feature_importance is not defined in the visible cells -
# presumably it comes from the `./plots` notebook; confirm.
plot_feature_importance(df=feature_imp_df, colors=colors)

#### The five most important features are: YearsCodePro, OrgSize, Ethnicity, Employment, and Country