In [3]:
import altair as alt
import pandas as pd
from palmerpenguins import load_penguins
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

In [4]:
penguins = (
    load_penguins()
    .dropna()
    .drop('year', axis=1)
)

In [5]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male


In [6]:
X, y = (penguins[[c for c in penguins.columns if c != 'bill_length_mm']],
        penguins['bill_length_mm'])

X.head()

Unnamed: 0,species,island,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,18.7,181.0,3750.0,male
1,Adelie,Torgersen,17.4,186.0,3800.0,female
2,Adelie,Torgersen,18.0,195.0,3250.0,female
4,Adelie,Torgersen,19.3,193.0,3450.0,female
5,Adelie,Torgersen,20.6,190.0,3650.0,male


In [8]:
y.head()

0    39.1
1    39.5
2    40.3
4    36.7
5    39.3
Name: bill_length_mm, dtype: float64

In [9]:
X.dtypes

species               object
island                object
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [10]:
# categorical_cols = ['island', 'species', 'sex']

dummy_encode = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    remainder='passthrough'
)

pipeline = make_pipeline(dummy_encode, RandomForestRegressor()) 

In [11]:
pipeline

In [12]:
pipeline.fit(X, y)

In [13]:
pipeline.steps

[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('onehotencoder', OneHotEncoder(),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7f47360a5450>)])),
 ('randomforestregressor', RandomForestRegressor())]

In [14]:
feature_names = pipeline.named_steps['columntransformer'].get_feature_names_out()
feature_names

array(['onehotencoder__species_Adelie',
       'onehotencoder__species_Chinstrap',
       'onehotencoder__species_Gentoo', 'onehotencoder__island_Biscoe',
       'onehotencoder__island_Dream', 'onehotencoder__island_Torgersen',
       'onehotencoder__sex_female', 'onehotencoder__sex_male',
       'remainder__bill_depth_mm', 'remainder__flipper_length_mm',
       'remainder__body_mass_g'], dtype=object)

In [16]:
mod = pipeline.named_steps['randomforestregressor']
mod

In [17]:
importances = permutation_importance(
    mod, 
    dummy_encode.fit_transform(X), 
    y
)['importances_mean']

In [18]:
importances

array([1.55060615e+00, 1.12973652e-04, 1.91025057e-04, 1.61741635e-03,
       4.31858753e-03, 3.52698553e-03, 6.65044136e-02, 5.13065534e-02,
       1.21813087e-01, 1.00406710e-01, 7.34817464e-02])

In [19]:
df_importance = (
    pd.DataFrame(
        [dict(name=name, importance=importance) 
         for name, importance in zip(feature_names, importances)]
    )
    .sort_values(['importance'], ascending=False)
)

axis = alt.Axis(labelFontSize=14, titleFontSize=16, titlePadding=20, labelLimit=230)

chart = alt.Chart(df_importance, height=alt.Step(40)).mark_bar().encode(
    x=alt.X('importance', axis=axis),
    y=alt.Y('name', sort='-x', title='Feature', axis=axis)
)

chart