In [214]:
import polars as pl
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance

In [215]:
# Single place to repoint the notebook at another copy of the data set.
DATA_DIR = "/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables"


def _load_split(name: str) -> pl.DataFrame:
    """Read ``<DATA_DIR>/<name>.parquet`` and drop its ``datetime_utc`` column.

    Parameters
    ----------
    name
        File stem of the split to load, e.g. ``"train_X"``.

    Returns
    -------
    pl.DataFrame
        Every column of the parquet file except ``datetime_utc``.
    """
    return pl.read_parquet(f"{DATA_DIR}/{name}.parquet").select(
        pl.exclude("datetime_utc")
    )


X_train_raw = _load_split("train_X")
X_test_raw = _load_split("test_X")
# `.to_series()` takes the first remaining column of the frame as the target.
y_train_raw = _load_split("train_y").to_series()
y_test_raw = _load_split("test_y").to_series()

# Fail early (with a readable message) if features and targets are misaligned.
assert X_train_raw.height == len(y_train_raw), "train X/y row counts differ"
assert X_test_raw.height == len(y_test_raw), "test X/y row counts differ"



In [216]:
# Gradient-boosted trees fitted on the raw features with an absolute-error loss.
# `random_state` is pinned because the estimator draws random numbers for its
# internal train/validation split (when early stopping is active) and for bin
# subsampling; without a seed, re-running the notebook can yield different
# predictions and scores.
# NOTE(review): the model optimises absolute error, while the notebook scores it
# with RMSE and stores the result under an "rmse_loss_..." file name — confirm
# which loss is intended.
model = HistGradientBoostingRegressor(loss="absolute_error", random_state=42)
model.fit(X_train_raw, y_train_raw)

# Point predictions for the test features; shown as the cell output.
prediction_raw = model.predict(X_test_raw)
prediction_raw

array([  85.96099928,   26.6655113 ,   21.98719817, ...,  -61.73716086,
       -112.73627638,  -20.02143533])

In [217]:
# Put the model output next to the observed target so both can be compared
# (and saved) from a single frame.
gradient_boosting_column = pl.Series(
    name="prediction_gradient_boosting",
    values=prediction_raw.squeeze(),
)
test_prediction_df = y_test_raw.to_frame().with_columns(gradient_boosting_column)
test_prediction_df

target,prediction_gradient_boosting
f32,f64
-50.097,85.960999
41.205002,26.665511
10.788,21.987198
28.538,47.518062
11.015,10.061042
…,…
-140.723007,-48.218186
-105.958,-72.953968
-7.133,-61.737161
2.132,-112.736276


In [218]:
# Test-set RMSE of the gradient-boosting predictions against the observed target.
observed = test_prediction_df.select("target")
predicted = test_prediction_df.select("prediction_gradient_boosting")
root_mean_squared_error(y_true=observed, y_pred=predicted)

115.57830736758078

In [219]:
# Persist target + prediction columns for later comparison.
# NOTE(review): the file name says "rmse_loss", but the model in this notebook
# is built with loss="absolute_error" — confirm the name is intended.
results_path = "/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables/results/rmse_loss_gradient_boosting.parquet"
test_prediction_df.write_parquet(results_path)

In [220]:
# Permutation importance: how much the (negated) RMSE degrades when a single
# feature column is shuffled, averaged over `n_repeats` shuffles per feature.
# The shuffles are random, so `random_state` is fixed to make the importance
# table reproducible across kernel restarts.
# NOTE(review): importances are measured on the training split, i.e. they
# describe what the fitted model relies on; pass X_test_raw / y_test_raw instead
# if the goal is to rank features by their contribution to generalisation.
importances = permutation_importance(
    model,
    X=X_train_raw,
    y=y_train_raw,
    scoring="neg_root_mean_squared_error",
    n_repeats=5,
    random_state=42,
)



In [221]:
# One row per feature with its mean permutation importance, most important first.
importance_df = pl.DataFrame(
    {
        "var": X_train_raw.columns,
        "importance": importances.importances_mean,
    }
).sort("importance", descending=True)
importance_df

var,importance
str,f64
"""lag_25_si_cumulative_minute""",23.734138
"""lag_60_si_qh""",3.178572
"""lag_60_si_cumulative_minute""",2.819361
"""lag_30_nrv_cumulative_minute""",2.057134
"""lag_1440_si_qh""",2.001792
…,…
"""lag_62_nrv_cumulative_minute""",0.032036
"""lag_61_nrv_cumulative_minute""",0.031498
"""lag_33_nrv_cumulative_minute""",0.027573
"""lag_63_nrv_cumulative_minute""",0.027473
