In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [29]:
from data_helper import *

# Assemble one training frame holding every feature column plus the target
# column "y". `assign` returns a new frame, so the data stored in
# `dataset_in` is not modified.
train = dataset_in["train"]["full"].assign(y = dataset_in["train"]["y"])

# Feature names are every column of the training frame except the target.
features = [column for column in train.columns if column != 'y']

In [30]:
def normalise_importance(x):
    """Scale importance scores so that their absolute values sum to one.

    Parameters
    ----------
    x : array-like
        Raw importance scores (for example regression coefficients). Only
        the magnitudes are used; signs are discarded.

    Returns
    -------
    numpy.ndarray
        ``|x| / sum(|x|)``. When every score is zero there is nothing to
        apportion, so an all-zero float array of the same shape is returned
        instead of the NaNs a plain division would produce.
    """
    absolute = np.absolute(x)
    total = np.sum(absolute)
    if total == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning), which would
        # otherwise propagate into every downstream aggregate.
        return np.zeros_like(absolute, dtype=float)
    return absolute / total

In [34]:
from sklearn.linear_model import LinearRegression

# Gauge how stable the linear-model importances are: refit a standardised
# linear regression on ten random half-size subsamples of the training frame
# and record the normalised absolute coefficients from each fit.
linreg_importances = []

for i in range(10):
    # Seed each draw with the iteration index: the ten subsamples still
    # differ from one another, but every Restart & Run All reproduces
    # exactly the same ones.
    sample = train.sample(frac = 0.5, random_state = i)

    X_train = sample.drop('y', axis = 1)
    y_train = np.ravel(sample[['y']])

    model = Pipeline([
        ("scale", StandardScaler()),
        ("ml_model", LinearRegression())
    ])
    model.fit(X_train, y_train)
    # The regression sees standardised inputs, so its coefficients are on a
    # common scale before being normalised to sum to one.
    linreg_importances.append(normalise_importance(model.named_steps["ml_model"].coef_))

# One row per subsample, one column per feature.
linreg = pd.DataFrame(linreg_importances, columns = features)

In [36]:
# Raw list of normalised coefficient vectors, one array per subsample fit;
# the `linreg` frame holds the same values with feature names as columns.
linreg_importances

[array([8.60537603e-02, 8.59975704e-03, 2.60247588e-03, 1.17923729e-01,
        6.02925561e-03, 2.23241771e-03, 6.09475460e-15, 3.59044639e-15,
        1.03232318e-01, 8.88867565e-02, 4.77889222e-04, 6.69685181e-03,
        4.90309783e-15, 8.70554139e-02, 1.92164725e-03, 1.26191301e-02,
        2.22659717e-03, 1.96835375e-03, 2.49617117e-03, 1.42561202e-04,
        7.48897850e-02, 4.84245980e-14, 8.56459341e-15, 1.20864996e-01,
        6.79820041e-03, 1.61021391e-14, 2.01242663e-03, 9.25678478e-03,
        8.67684748e-02, 1.77134969e-03, 1.67634422e-02, 2.03774818e-02,
        6.06152723e-03, 1.80480446e-04, 1.96805967e-03, 4.62939687e-04,
        9.97359731e-03, 4.81068269e-04, 4.20991094e-14, 4.87606126e-04,
        1.62806677e-02, 1.01331823e-02, 2.21014567e-03, 1.37645549e-04,
        1.73584180e-03, 6.00277514e-04, 7.51288390e-02, 2.89385444e-03,
        5.96241065e-04]),
 array([1.24722741e-03, 7.91501232e-03, 4.49391112e-02, 1.69740257e-03,
        5.55118104e-03, 1.16575435e-02

In [42]:
from sklearn.ensemble import RandomForestRegressor

# Repeat the subsampling experiment with a random forest: fit on ten random
# half-size subsamples and record the forest's impurity-based feature
# importances from each fit.
rfr_importances = []

for i in range(10):
    # Seed the subsample with the iteration index so reruns of the notebook
    # reproduce the same ten draws.
    sample = train.sample(frac = 0.5, random_state = i)

    X_train = sample.drop('y', axis = 1)
    y_train = np.ravel(sample[['y']])

    model = Pipeline([
        # Scaling step kept so this pipeline mirrors the linear-regression one.
        ("scale", StandardScaler()),
        # The forest's own bootstrap / feature sampling is seeded as well;
        # otherwise the importances would still vary between reruns even
        # with identical subsamples.
        ("ml_model", RandomForestRegressor(max_samples = 0.7, random_state = i))
    ])
    model.fit(X_train, y_train)
    rfr_importances.append(model.named_steps["ml_model"].feature_importances_)

    # Lightweight progress indicator: overwrite the same output line.
    print(i, end = '\r')

# One row per subsample, one column per feature.
rfr = pd.DataFrame(rfr_importances, columns = features)

9

In [44]:
# Long-format view of the forest importances: one row per (feature, subsample)
# pair, with the feature name in `variable` and its importance in `value`.
rfr.melt()

Unnamed: 0,variable,value
0,nocentralheating_rate.origin,0.001364
1,nocentralheating_rate.origin,0.001409
2,nocentralheating_rate.origin,0.001548
3,nocentralheating_rate.origin,0.001313
4,nocentralheating_rate.origin,0.001639
...,...,...
485,difference_drive_retail,0.000582
486,difference_drive_retail,0.000590
487,difference_drive_retail,0.000573
488,difference_drive_retail,0.000667


In [46]:
from sklearn.inspection import permutation_importance

# Inputs for the permutation-importance analysis, taken from the training
# split: the feature frame is copied, the target is used as stored.
X_train, y_train = (
    dataset_in["train"]["full"].copy(),
    dataset_in["train"]["y"],
)

In [54]:
# Permutation importance of a plain (unscaled) linear regression, scored on
# the same data it was fitted on.
# NOTE(review): the saved output of this notebook shows importances on the
# order of 1e13-1e14 for the two nocentralheating_rate features, which may
# indicate collinear columns destabilising the fit — confirm before relying
# on this ranking.
model = LinearRegression().fit(X_train, y_train)

# Fix the shuffling seed so the table is identical on every rerun.
result = permutation_importance(model, X_train, y_train, random_state = 0)

# Keep only the per-feature summary statistics. The raw per-repeat
# "importances" entry is left in `result` untouched rather than deleted, and
# is simply not copied into the one-row-per-feature table.
df = pd.DataFrame({
    "importances_mean": result["importances_mean"],
    "importances_std": result["importances_std"],
})

# Label rows from the frame that was actually scored, so names cannot drift
# out of step with the importances.
df["feature"] = list(X_train.columns)

In [55]:
# Permutation-importance summary: mean and standard deviation per feature.
df

Unnamed: 0,importances_mean,importances_std,feature
0,69234270000000.0,379459600000.0,nocentralheating_rate.origin
1,3.831506e-06,2.515421e-06,difference_crime_rate
2,7.34796e-05,1.005529e-05,difference_PT_post
3,128099000000000.0,802345100000.0,difference_nocentralheating_rate
4,2.225756e-05,7.775502e-06,crime_rate.origin
5,6.108339e-05,1.674707e-05,difference_drive_petrol
6,0.01016198,0.0001996536,Dist_from_Ed.origin
7,0.003079608,0.0001379333,Dist_from_Gls.destination
8,8.031955e-06,4.022589e-06,difference_overcrowded_rate
9,0.005885773,0.0001208112,employment_rate.origin
