In [0]:
%pip install catboost shap scikit-learn mlflow --upgrade
%restart_python

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
import numpy as np

# added catboost
from catboost import CatBoostRegressor

plt.style.use('default')

## Scikit Learn Pseudocode Pipeline

In [0]:
# 0. Load the SKU dataset from a local Parquet file into a pandas DataFrame
sku_data_path = './fake_sku_data.parquet'
df = pd.read_parquet(sku_data_path)

## PSEUDOCODE: PUSH THE FILTERING / SAMPLING INTO SPARK, THEN COLLECT TO PANDAS ##
# (illustrative only -- table name, region value and sku_list are placeholders)
# df = (
#     spark.table('cat.sch.table')
#     .filter("region = 'X'")
#     .orderBy(F.rand())
#     .limit(1000000)
#     .toPandas()
# )
#
# df = spark.sql(f"""
#   SELECT * FROM data_table
#   WHERE sku IN ({sku_list})
# """).orderBy(F.rand()).limit(1000000).toPandas()

In [0]:
# 1. Declare the feature columns.
#    Categorical columns are one-hot encoded by the preprocessing step; every
#    other non-target column is passed through unchanged.
#    NOTE(review): this assumes all remaining columns are numeric -- confirm.
categorical_cols = [
    'store_id', 'product_id', 'store_size', 'province', 'region_name',
    'area_name', 'zone_id', 'zone_desc', 'category_id',
    'category_name', 'product_desc', 'province_simple', 'region_simple', 'location_type', 'urban_rural'
]

# The prediction target must never be an input feature. A plain
# "everything that is not categorical" rule would sweep `units_sold_period`
# into numerical_cols and leak the label into X.
target_col = 'units_sold_period'
numerical_cols = [
    col for col in df.columns
    if col not in categorical_cols and col != target_col
]
# NOTE(review): 'sku_sales_ratio_rank' (used as a row filter when building X) is
# still a feature; confirm it is not itself derived from the target.

In [0]:
# 2. Feature assembly: one-hot encode the categorical columns (categories unseen
#    at fit time are ignored, output is dense) and pass the remaining columns through.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot, categorical_cols),
        ('num', 'passthrough', numerical_cols),
    ]
)

In [0]:
# 3. Base regressor: a random forest with a fixed seed so repeated runs match
rf = RandomForestRegressor(
    random_state=42,
)

In [0]:
# 4. End-to-end pipeline: the preprocessing step followed by the regressor
pipeline_steps = [
    ('preproc', preprocessor),
    ('regressor', rf),
]
pipeline = Pipeline(steps=pipeline_steps)

In [0]:
# Keep only rows whose `sku_sales_ratio_rank` is 120 or better
in_top_120 = df['sku_sales_ratio_rank'] <= 120
filtered_df = df[in_top_120]

# Features are the categorical columns followed by the numerical ones;
# the label is the units sold in the period.
feature_cols = categorical_cols + numerical_cols
X = filtered_df[feature_cols]
y = filtered_df['units_sold_period']

# 5. Hold out 20% of the rows as a test set (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [0]:
## Turn on MLflow autologging for scikit-learn before any model is fit ##
import mlflow
mlflow.sklearn.autolog()

# 6. Hyperparameter tuning: 3-fold grid search over forest size and tree depth,
#    scored by negative mean squared error, using all available cores (n_jobs=-1)
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[3, 5],
)
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [0]:
# 7. Score the tuned model on the data it was trained on
train_preds = grid.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
train_r2 = r2_score(y_train, train_preds)
print('Train RMSE:', train_rmse)
print('Train R2:', train_r2)

In [0]:
# Score the tuned model on the held-out test split.
test_preds = grid.predict(X_test)
# These are test-set metrics, so they are labelled "Test" (not "Train") to keep
# them distinguishable from the training-set numbers.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, test_preds)))
print('Test R2:', r2_score(y_test, test_preds))

In [0]:
# Example: load a registered model back from the MLflow model registry by alias
# and score the held-out rows with it.
# NOTE(review): nothing in this notebook registers `shm.default.sku_model` or sets
# its `current` alias -- presumably done elsewhere; confirm before running.
model_uri = 'models:/shm.default.sku_model@current'
reload_model = mlflow.sklearn.load_model(model_uri)
reload_model.predict(X_test)

In [0]:
# 8. Peek at the predictions for the held-out rows
test_preds = grid.predict(X_test)
preview = test_preds[:10]  # first 10 predictions only
print(preview)

In [0]:
# 9. Take the best pipeline found by the grid search and fit it on the training split.
# NOTE(review): with GridSearchCV's default `refit` setting, `best_estimator_` is
# presumably already fitted on X_train, which would make this second fit redundant
# (and, with autologging on, an extra logged run) -- confirm before removing.
best_pipeline = grid.best_estimator_
best_pipeline.fit(X_train, y_train)

In [0]:
# Alternative to the grid search: set hand-picked hyperparameters on the
# (unfitted) pipeline and train it directly.
pipeline.set_params(
    regressor__n_estimators=100,
    regressor__max_depth=10
)
# Fit on the training split. Fitting on X_test / y_test would train the model on
# the held-out data and invalidate any evaluation made against that split.
pipeline.fit(X_train, y_train)

In [0]:
# 10. The preprocessing fitted inside the best pipeline is reused as-is (nothing to redo here)
# 11. Pull the per-feature importance scores from the fitted random forest
fitted_steps = best_pipeline.named_steps
rf_model = fitted_steps['regressor']
importances = rf_model.feature_importances_

In [0]:
# 12. Pair every importance score with its post-encoding feature name.
#     One-hot names come from the fitted encoder (the first transformer in the
#     ColumnTransformer); the passthrough numerical columns follow them.
fitted_encoder = best_pipeline.named_steps['preproc'].transformers_[0][1]
encoded_names = fitted_encoder.get_feature_names_out(categorical_cols)
feature_names = encoded_names.tolist() + numerical_cols

indices = np.argsort(importances)[::-1][:10]  # positions of the 10 largest importances, descending

top_features = [feature_names[idx] for idx in indices]
top_importances = [importances[idx] for idx in indices]

# Horizontal bar chart of the top 10, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(top_features, top_importances, color='skyblue')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
ax.set_title('Top 10 Feature Importances from Random Forest Model')
ax.invert_yaxis()  # largest bar at the top
fig.tight_layout()
plt.show()

## SHAP Feature Importance

In [0]:
# Explain a fixed random sample of 100 held-out rows (seeded so the plot is repeatable)
X_sample = X_test.sample(n=100, random_state=42)

# Run the sample through the fitted preprocessing step, then force a float32 matrix
fitted_preproc = best_pipeline.named_steps['preproc']
X_transformed = fitted_preproc.transform(X_sample).astype(np.float32, casting='unsafe')

# Tree SHAP on the fitted forest; the summary plot shows at most 10 features
fitted_forest = best_pipeline.named_steps['regressor']
explainer = shap.TreeExplainer(fitted_forest)
shap_values = explainer.shap_values(X_transformed)
shap.summary_plot(
    shap_values,
    X_transformed,
    feature_names=feature_names,
    max_display=10,
)

In [0]:
# SHAP dependence plot for `total_ppl`.
# `shap_values` and `X_transformed` live in the post-encoding feature space, so
# both the column position and the labels must come from `feature_names`
# (one-hot columns followed by the passthrough numerical columns). The raw
# `X.columns` has a different length and ordering and would select and label
# the wrong column.
index_of_total_ppl = feature_names.index('total_ppl')

shap.dependence_plot(
    index_of_total_ppl,            # position of total_ppl in the transformed matrix
    shap_values,
    X_transformed,
    feature_names=feature_names    # labels aligned with X_transformed's columns
)

## CatBoost
We can also use a CatBoost model and bring the whole workflow together. The hardest part of using CatBoost is keeping it from overfitting too much! But it is going to outperform LightGBM or Random Forest about 95% of the time.

In [0]:
mlflow.autolog()

with mlflow.start_run():
    # CatBoost takes the raw categorical columns directly via `cat_features`,
    # so no one-hot preprocessing step is applied to X_train / X_test here.
    catboost_params = {
        'iterations': 100,
        'learning_rate': 0.1,
        'depth': 3,
    }
    model = CatBoostRegressor(
        **catboost_params,
        verbose=0,
        cat_features=categorical_cols)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation on the held-out test split
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)
    print('Test RMSE:', test_rmse)
    print('Test R2:', test_r2)

    # Record the run explicitly: MLflow autologging has no CatBoost integration
    # (see the mlflow.autolog supported-library list), so without these calls the
    # run would contain neither the hyperparameters nor the test metrics.
    mlflow.log_params(catboost_params)
    mlflow.log_metrics({
        'test_rmse': float(test_rmse),
        'test_r2': float(test_r2),
    })