In [0]:
import mlflow
import databricks.automl_runtime
from pyspark.sql.functions import when, col, log
from sklearn.model_selection import train_test_split
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PowerTransformer
from scipy import stats
import numpy as np
from scipy.special import inv_boxcox


## Read/Clean/Transform Data

In [0]:
# %run "./2_imacs_engineering_hours_data_cleaning_&_data_tranformation"

In [0]:
# Read the engineering-hours CSV extract from DBFS, drop the two columns that
# are not used for modelling, and keep only rows inside the accepted ranges.
filepath = ["dbfs:/FileStore/Imacs/df_data_21.csv"]
raw_df = (
    spark.read.csv(filepath, sep=",", header=True, inferSchema=True)
    .drop('ProjectNo', 'industry_codes')
    .filter(
        (col('total_sell_price') <= 1E8)
        & (col('hours') > 0)
        & (col('hours') < 1500)
        & (col('stock_count') < 500)
    )
)
target_col = "hours"

raw_pd = raw_df.toPandas()

# Box-Cox transform the target. scipy fits the lambda itself and returns it as
# `lmda`; the same value is needed later to map predictions back to hours.
# Box-Cox requires strictly positive input, which the `hours > 0` filter ensures.
# NOTE(review): lambda is fitted on the full dataset, i.e. before the
# train/test split further down -- confirm this is acceptable.
boxcox_hours, lmda = stats.boxcox(raw_pd['hours'])
df_transformed = raw_pd.assign(transformed_hours=boxcox_hours)

# Optional round-trip check (kept disabled):
# df_transformed['hours_check'] = inv_boxcox(df_transformed['transformed_hours'], lmda)
# display(df_transformed)

# From here on the "hours" column holds the Box-Cox-transformed target.
raw_pd['hours'] = df_transformed['transformed_hours']

raw_df = spark.createDataFrame(raw_pd)

raw_df.count()


## Split Data

In [0]:
# 80/20 random split into training and test rows, with the seed fixed at 42.
train_df, test_df = raw_df.randomSplit(weights=[0.8, 0.2], seed=42)

## Preprocessing Data

In [0]:
%run "./3_imacs_engineering_hours_preprocessing"

## Train regression model
- Log relevant metrics to MLflow to track runs
- All the runs are logged under [this MLflow experiment](#mlflow/experiments/3959399840601507)
- Change the model parameters and re-run the training cell to log a different trial to the MLflow experiment
- To view the full list of tunable hyperparameters, check the output of the cell below

In [0]:
# XGBRegressor is also imported in the notebook's first cell; repeated here.
from xgboost import XGBRegressor

# Print the estimator's built-in help text (its documented parameters).
help(XGBRegressor)

### Define the objective function
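The objective function is used to find optimal hyperparameters. By default, this notebook only runs
this function once (`max_evals=1` in the `hyperopt.fmin` invocation) with fixed hyperparameters, but
hyperparameters can be tuned by modifying `space`, defined below. `hyperopt.fmin` will then use this
function's return value to search the space to minimize the loss.

**Note:** the cells that follow train a single model from a fixed `params` dictionary and keep an optional `GridSearchCV` sweep commented out; they do not call `hyperopt.fmin` or define `space`.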

In [0]:
# param_grid = {
#     "regressor__max_depth": [3],  # around 3
#     "regressor__learning_rate": [0.3, 0.34, 0.36],  # around 0.336
#     "regressor__n_estimators": [300, 328],  # around 328
#     "regressor__min_child_weight": [6, 7, 8],  # around 7
#     "regressor__subsample": [0.6, 0.61],  # around 0.606
#     "regressor__colsample_bytree": [0.58, 0.592],  # around 0.592
# }

# from sklearn.model_selection import GridSearchCV

# # Initialize XGBRegressor with minimal params, just random_state for reproducibility
# xgboost = XGBRegressor(random_state=98223983, verbosity=0, n_jobs=100)

# # Update pipeline with new regressor instance
# pipeline = Pipeline([
#     ("column_selector", col_selector),
#     ("preprocessor", preprocessor),
#     ("regressor", xgboost),
# ])


# grid_search = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid,
#     cv=3,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,
#     verbose=2,
# )

# grid_search.fit(X_train, y_train)

# best_model = grid_search.best_estimator_


In [0]:
from sklearn.pipeline import Pipeline

# Fixed hyperparameters for the XGBoost regressor. The same dictionary is
# logged to MLflow further down, so edit values here rather than inline.
params = dict(
    colsample_bytree=0.592281337798936,
    learning_rate=0.336056980325921,
    max_depth=3,
    min_child_weight=7,
    n_estimators=328,
    n_jobs=100,
    subsample=0.606463396485774,
    verbosity=0,
    random_state=98223983,
)

xgboost = XGBRegressor(**params)

# Column selection -> preprocessing -> regressor, chained as one estimator.
# `col_selector` and `preprocessor` are not defined in this notebook's own
# cells; presumably they come from the %run preprocessing notebook -- verify.
pipeline = Pipeline(
    steps=[
        ("column_selector", col_selector),
        ("preprocessor", preprocessor),
        ("regressor", xgboost),
    ]
)

# Fit on the training split collected to pandas; the "hours" column is the target.
train_pd = train_df.toPandas()
y_train = train_pd["hours"]
X_train = train_pd.drop(columns=["hours"])
pipeline_model = pipeline.fit(X_train, y_train)


In [0]:
from pyspark.sql.functions import exp, col

# Collect the held-out split to pandas and separate features from the target.
test_pd = test_df.toPandas()
y_test = test_pd["hours"]
X_test = test_pd.drop(columns=["hours"])

# Score the test rows with the fitted pipeline.
y_pred = pipeline_model.predict(X_test)

# Apply the inverse Box-Cox transform (same lambda that was used on the target)
# to both the actual "hours" column and the new "prediction" column.
test_pd = test_pd.assign(
    hours=inv_boxcox(test_pd["hours"], lmda),
    prediction=inv_boxcox(y_pred, lmda),
)

# Back to Spark so the Spark evaluator can be used on the result.
inv_pred_df = spark.createDataFrame(test_pd)

# Uncomment to inspect actual vs predicted hours:
# display(inv_pred_df)


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator, re-pointed at each metric in turn (rmse, then r2, then mae),
# comparing the "prediction" column against the "hours" column of inv_pred_df.
regression_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="hours", metricName="rmse")
rmse, r2, mae = (
    regression_evaluator.setMetricName(metric_name).evaluate(inv_pred_df)
    for metric_name in ("rmse", "r2", "mae")
)

print(f"RMSE is {rmse}")
print(f"MAE is {mae}")
print(f"R2 is {r2}")

In [0]:
# MLflow logging and model registration.
# Logs the hyperparameters, the evaluation metrics computed in the cells above,
# and the fitted pipeline, then registers the pipeline in the model registry.
with mlflow.start_run(run_name="xgboost_pipeline_model") as run:
    # Log hyperparameters
    mlflow.log_params(params)

    # The pipeline is fitted on Box-Cox-transformed hours, so its raw output is
    # on the transformed scale. Record the fitted lambda with the run so that
    # anyone loading the registered model can apply inv_boxcox(prediction, lambda)
    # to recover hours.
    mlflow.log_param("boxcox_lambda", float(lmda))

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Log model
    mlflow.sklearn.log_model(pipeline_model, "model")

    # Register model in Databricks Model Registry
    model_uri = f"runs:/{run.info.run_id}/model"
    mlflow.register_model(model_uri, "XGBoost_EngineeringHoursModel")

In [0]:
from mlflow.tracking import MlflowClient

# Registry client for the "XGBoost_EngineeringHoursModel" registered model.
client = MlflowClient()
# Fetch the registry's "latest versions" entries for the model (per the MLflow
# docs this is the latest version for each stage) and print each one.
latest_versions = client.get_latest_versions("XGBoost_EngineeringHoursModel")
for version in latest_versions:
    # NOTE: `version` stays bound to the last list entry once the loop finishes.
    print(f"Version: {version.version}, Stage: {version.current_stage}")


In [0]:
# Promote the newest registered version to Staging.
# The version is selected explicitly by highest version number:
# get_latest_versions returns one entry per stage in no documented order, so
# the last element of that list is not guaranteed to be the newest version.
newest_version = max(
    client.get_latest_versions("XGBoost_EngineeringHoursModel"),
    key=lambda model_version: int(model_version.version),
)
client.transition_model_version_stage(
    name="XGBoost_EngineeringHoursModel",
    version=newest_version.version,
    stage="Staging",
    archive_existing_versions=True  # archives versions currently in Staging
)


In [0]:
# Promote the newest registered version to Production.
# The version is selected explicitly by highest version number:
# get_latest_versions returns one entry per stage in no documented order, so
# the last element of that list is not guaranteed to be the newest version.
newest_version = max(
    client.get_latest_versions("XGBoost_EngineeringHoursModel"),
    key=lambda model_version: int(model_version.version),
)
client.transition_model_version_stage(
    name="XGBoost_EngineeringHoursModel",
    version=newest_version.version,
    stage="Production",
    archive_existing_versions=True  # archives versions currently in Production
)


In [0]:
# RMSE, R² and MAE of "prediction" against "hours", evaluated in that order
# with a single evaluator whose metric name is switched between calls.
regression_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="hours", metricName="rmse")
rmse, r2, mae = (
    regression_evaluator.setMetricName(metric_name).evaluate(inv_pred_df)
    for metric_name in ("rmse", "r2", "mae")
)

# Interquartile range of the actual hours, from approximate quantiles
# (relative error 0.01).
q1, q3 = inv_pred_df.approxQuantile("hours", [0.25, 0.75], 0.01)
iqr = q3 - q1

# RMSE normalised by the IQR; reported as infinity when the IQR is zero.
if iqr != 0:
    rmse_iqr = rmse / iqr
else:
    rmse_iqr = float("inf")

# Report
print(f"RMSE is {rmse}")
print(f"MAE is {mae}")
print(f"R2 is {r2}")
print(f"IQR is {iqr}")
print(f"RMSE / IQR is {rmse_iqr}")


In [0]:
# Number of evaluated rows (n).
n = inv_pred_df.count()

# Number of predictors (k), estimated as every column other than the label and
# the prediction. NOTE(review): this counts raw input columns; the fitted
# pipeline may select or expand columns, so k is an approximation -- confirm.
k = len([
    column_name
    for column_name in inv_pred_df.columns
    if column_name not in ('hours', 'prediction')
])

# Adjusted R² = 1 - (1 - R²)(n - 1) / (n - k - 1). It is only meaningful when
# the residual degrees of freedom (n - k - 1) are positive; a zero or negative
# value would divide by zero or produce a nonsensical result, so report NaN.
degrees_of_freedom = n - k - 1
if degrees_of_freedom > 0:
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / degrees_of_freedom)
else:
    adjusted_r2 = float("nan")

print(f"Adjusted R2 is {adjusted_r2}")