In [None]:
%matplotlib inline

In [None]:
import gc
import pathlib
import sys

In [None]:
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance

In [None]:
# Relative path used as the base for the data, models and src directories
# derived in the next cell.
root_dir = "../.."

In [None]:
# Derive every location used by this notebook from the single root directory.
root_dir_path = pathlib.Path(root_dir)

# Input: the processed training table.
data_dir_path = root_dir_path.joinpath("data")
processed_dir_path = data_dir_path.joinpath("processed")
train_path = processed_dir_path.joinpath("train.parquet")

# Output: where the list of removed features is persisted.
models_dir_path = root_dir_path.joinpath("models")
removed_features_path = models_dir_path.joinpath("removed_features.joblib")

# Project sources (appended to sys.path in a later cell).
src_dir_path = root_dir_path.joinpath("src")

In [None]:
# Make the project sources importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if str(src_dir_path) not in sys.path:
    sys.path.append(str(src_dir_path))

In [None]:
# Import only the names this notebook uses, so it is clear where each one
# comes from and nothing else leaks into the notebook namespace.
from package.constants import (
    categorical_features,
    features,
    random_state,
    target,
    train_start_date,
    validation_start_date,
)

In [None]:
train = pd.read_parquet(train_path)

In [None]:
# Time-based split: rows dated on or after validation_start_date are held out
# for validation; rows from train_start_date up to (but excluding) that date
# are used for fitting.
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/135896
is_valid = train["date"].ge(validation_start_date)
is_train = train["date"].ge(train_start_date) & train["date"].lt(validation_start_date)

In [None]:
# LightGBM training parameters, grouped by purpose.
params = {
    # Objective and evaluation metric.
    # See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/150614
    "objective": "tweedie",
    # See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/151145
    "tweedie_variance_power": 1.2,
    "metric": "rmse",
    # Boosting rate and tree shape.
    "learning_rate": 0.075,
    "num_leaves": 128,
    "min_data_in_leaf": 104,
    # Regularisation and row/column subsampling.
    "lambda_l2": 0.1,
    "bagging_fraction": 0.75,
    "bagging_freq": 1,
    "feature_fraction": 0.8,
    # Runtime settings.
    "n_jobs": -1,
    "seed": random_state,
}

In [None]:
# Training split wrapped as a LightGBM Dataset.
dtrain = lgb.Dataset(
    train.loc[is_train, features],
    train.loc[is_train, target],
    categorical_feature=categorical_features,
)
# Validation split. `reference=dtrain` states explicitly that this set must be
# aligned with the training set (same feature binning), as LightGBM recommends
# for validation data, instead of relying on `lgb.train` to link them later.
dvalid = lgb.Dataset(
    train.loc[is_valid, features],
    train.loc[is_valid, target],
    categorical_feature=categorical_features,
    reference=dtrain,
)

In [None]:
%%time
# Fit the booster. Both splits are passed as validation sets so their metric
# is logged side by side (every 10 rounds); at most 1,250 boosting rounds are
# run, with early stopping after 30 rounds.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are arguments of the
# pre-4.0 `lgb.train` API; newer LightGBM releases expect callbacks instead.
model = lgb.train(
    params=params,
    train_set=dtrain,
    valid_sets=[dtrain, dvalid],
    num_boost_round=1_250,
    early_stopping_rounds=30,
    verbose_eval=10,
)

In [None]:
# Split-based importance. The explicit title distinguishes this figure from
# the gain-based one, which would otherwise share the same default title.
ax = lgb.plot_importance(
    model,
    importance_type="split",
    figsize=(16, 9),
    title="Feature importance (split)",
)

In [None]:
# Gain-based importance. The explicit title distinguishes this figure from
# the split-based one, which would otherwise share the same default title.
ax = lgb.plot_importance(
    model,
    importance_type="gain",
    figsize=(16, 9),
    title="Feature importance (gain)",
)

In [None]:
# `permutation_importance` is given a scikit-learn style estimator, but `model`
# is a raw Booster. Wrap it in an LGBMRegressor by populating the private
# attributes directly instead of calling `fit`.
# NOTE(review): this relies on LightGBM internals and may need revisiting when
# the library is upgraded.
estimator = lgb.LGBMRegressor()
estimator._Booster = model
estimator._n_features = model.num_feature()
# Newer LightGBM releases decide whether the wrapper is fitted from `fitted_`
# rather than `_n_features`; set it as well so `predict` does not raise
# LGBMNotFittedError there. It is an ordinary attribute on older releases.
estimator.fitted_ = True

In [None]:
%%time
# Permutation importance on the validation split, scored with negated RMSE
# and seeded for reproducibility.
result = permutation_importance(
    estimator,
    X=train.loc[is_valid, features],
    y=train.loc[is_valid, target],
    scoring="neg_root_mean_squared_error",
    random_state=random_state,
)

In [None]:
# Mean permutation importance per feature, labelled by feature name.
feature_importances = pd.Series(data=result.importances_mean, index=features)

In [None]:
# Horizontal bar chart of the permutation importances, one bar per feature.
feature_importances.plot.barh(figsize=(16, 9))

In [None]:
# Features whose mean permutation importance is zero or negative are marked
# for removal.
is_removed = feature_importances <= 0.0
# Select into a new object instead of overwriting `feature_importances`, so the
# full series stays available and re-running the plot cell still shows every
# feature.
removed_features = feature_importances[is_removed].index.tolist()

In [None]:
# Display the names of the features selected for removal.
removed_features

In [None]:
# Make sure the output directory exists; joblib.dump does not create it and
# would otherwise fail with FileNotFoundError on a fresh checkout.
removed_features_path.parent.mkdir(parents=True, exist_ok=True)
# Persist the list of removed feature names. Kept as the last expression so
# the cell still displays joblib.dump's return value.
joblib.dump(removed_features, removed_features_path)