# Permutation Importance on Model M036

In [43]:
import pandas as pd
import catboost
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import numpy as np

In [47]:
def permutation_importance(model, X_val, y_val, metric, threshold=0.005,
                           minimize=True, verbose=True):
    """Measure how much the validation score degrades when each feature is shuffled.

    The model is first scored on the untouched validation data. Then, one
    column at a time, the column is replaced by a random permutation of its
    own values, the model is re-scored, and the column is restored.

    Args:
        model: Any object with a ``predict(X)`` method.
        X_val: Validation features as a pandas DataFrame. Each column is
            temporarily overwritten in place and restored before moving on
            (also when ``predict`` or ``metric`` raises), so pass a frame
            you own rather than a view of another frame.
        y_val: Ground-truth targets aligned with ``X_val``.
        metric: Callable ``metric(y_true, y_pred)`` returning a number.
        threshold: Minimum degradation of the score that a shuffled feature
            must cause to be considered useful.
        minimize: ``True`` if lower metric values are better (e.g. MAE),
            ``False`` if higher values are better (e.g. R^2).
        verbose: Print the base score and every per-column score.

    Returns:
        A ``(results, bad_features)`` tuple. ``results`` maps
        ``'base_score'`` to the unshuffled score and every column name to
        the score obtained with that column shuffled (a feature literally
        named ``'base_score'`` would overwrite the baseline entry).
        ``bad_features`` lists, in column order, the features whose shuffling
        did not worsen the score by at least ``threshold``.
    """
    try:
        from tqdm import tqdm as progress
    except ImportError:
        # The progress bar is purely cosmetic; iterate plainly without it.
        def progress(iterable):
            return iterable

    results = {}

    base_score = metric(y_val, model.predict(X_val))
    results['base_score'] = base_score
    if verbose:
        print(f'Base score {results["base_score"]:.5}')

    for col in progress(X_val.columns):
        saved_col = X_val[col].copy()
        try:
            X_val[col] = np.random.permutation(X_val[col])
            results[col] = metric(y_val, model.predict(X_val))
        finally:
            # Always put the real values back so a failure cannot leave the
            # caller's frame with a shuffled column.
            X_val[col] = saved_col

        if verbose:
            print(f'column: {col} - {results[col]:.5}')

    if minimize:
        # Lower is better: a useful feature pushes the score UP when shuffled.
        def is_unimportant(score):
            return score < base_score + threshold
    else:
        # Higher is better: a useful feature pulls the score DOWN when shuffled.
        def is_unimportant(score):
            return score > base_score - threshold

    # Select from the columns rather than from `results`, so the baseline
    # entry never has to be removed afterwards (the removal used to raise
    # ValueError whenever 'base_score' itself did not pass the filter).
    bad_features = [col for col in X_val.columns if is_unimportant(results[col])]

    return results, bad_features

In [4]:
# Empty CatBoost regressor; it only becomes usable once a saved model is
# loaded into it with load_model().
model = catboost.CatBoostRegressor()

In [13]:
# Load the saved M036 1JHC model file with suffix "-1" (the fold loop further
# down builds the same file name from fold_n + 1).
model.load_model('../models/M036/M036-0705_2317-1JHC-1.model')

<catboost.core.CatBoostRegressor at 0x7fcfa9eeb080>

In [14]:
# FE014 training table for the 1JHC type; later cells read the model's feature
# columns and the 'scalar_coupling_constant' target from it.
df = pd.read_parquet('../data/FE014/FE014-train-1JHC.parquet')

In [22]:
# FE008 is loaded only for its row skeleton (molecule_name / type), not for
# its feature columns.
train_df = pd.read_parquet('../data/FE008_train.parquet')
# Keep just the two columns used to build the grouping key for GroupKFold.
mol_group = train_df[['molecule_name','type']].copy()

In [None]:
# Rebuild a molecule-grouped 2-fold split for the 1JHC rows and run
# permutation importance on the validation part of the first fold only.
N_FOLDS = 2
folds = GroupKFold(n_splits=N_FOLDS)
bond_type = '1JHC'
X = df[model.feature_names_]
y = df['scalar_coupling_constant']
# Molecule name of every row of this bond type, used as the grouping key.
mol_group_type = mol_group.loc[mol_group['type'] == bond_type, 'molecule_name']

for fold_n, (train_idx, valid_idx) in enumerate(folds.split(X, groups=mol_group_type)):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    # Independent copy: permutation_importance overwrites columns in place.
    X_valid = X.iloc[valid_idx].copy()
    y_valid = y.iloc[valid_idx]

    # Model files are numbered from 1, fold indices from 0.
    model = catboost.CatBoostRegressor()
    model.load_model('../models/M036/M036-0705_2317-1JHC-{}.model'.format(fold_n + 1))

    results, bad_features = permutation_importance(
        model,
        X_valid,
        y_valid,
        metric=mean_absolute_error,
    )
    # Only the first fold is evaluated.
    break



  0%|          | 0/178 [00:00<?, ?it/s][A[A

Base score 0.61331




  1%|          | 1/178 [00:31<1:34:20, 31.98s/it][A[A

column: atom1_valence - 1.776




  1%|          | 2/178 [01:06<1:36:07, 32.77s/it][A[A

column: atom1_spin_multiplicity - 0.6139




  2%|▏         | 3/178 [01:39<1:35:36, 32.78s/it][A[A

column: distance - 1.1325




  2%|▏         | 4/178 [02:14<1:36:51, 33.40s/it][A[A

column: is_bond - 0.61331




  3%|▎         | 5/178 [02:49<1:37:29, 33.81s/it][A[A

column: distance_closest_to_0 - 2.5886




  3%|▎         | 6/178 [03:22<1:36:48, 33.77s/it][A[A

column: distance_2nd_closest_to_0 - 1.2727




  4%|▍         | 7/178 [03:54<1:34:51, 33.28s/it][A[A

column: distance_3rd_closest_to_0 - 1.0334




  4%|▍         | 8/178 [04:30<1:36:14, 33.97s/it][A[A

column: distance_4th_closest_to_0 - 0.86064




  5%|▌         | 9/178 [05:01<1:33:27, 33.18s/it][A[A

column: distance_5th_closest_to_0 - 0.80507




  6%|▌         | 10/178 [05:34<1:32:48, 33.15s/it][A[A

column: distance_6th_closest_to_0 - 0.79607




  6%|▌         | 11/178 [06:06<1:31:03, 32.72s/it][A[A

column: distance_7th_closest_to_0 - 0.82952




  7%|▋         | 12/178 [06:38<1:29:30, 32.35s/it][A[A

column: distance_8th_closest_to_0 - 0.79031




  7%|▋         | 13/178 [07:09<1:28:32, 32.20s/it][A[A

column: distance_9th_closest_to_0 - 0.72495




  8%|▊         | 14/178 [07:42<1:28:03, 32.21s/it][A[A

column: distance_10th_closest_to_0 - 0.70719




  8%|▊         | 15/178 [08:13<1:26:50, 31.97s/it][A[A

column: distance_closest_to_1 - 1.1037




  9%|▉         | 16/178 [08:44<1:25:43, 31.75s/it][A[A

column: distance_2nd_closest_to_1 - 1.8414




 10%|▉         | 17/178 [09:18<1:26:36, 32.27s/it][A[A

column: distance_3rd_closest_to_1 - 1.6567




 10%|█         | 18/178 [09:49<1:25:30, 32.07s/it][A[A

column: is_bond_3rd_closest_to_1 - 1.2667




 11%|█         | 19/178 [10:20<1:23:40, 31.57s/it][A[A

column: distance_4th_closest_to_1 - 1.0289




 11%|█         | 20/178 [10:50<1:22:25, 31.30s/it][A[A

column: distance_5th_closest_to_1 - 0.99229




 12%|█▏        | 21/178 [11:23<1:22:46, 31.63s/it][A[A

column: distance_6th_closest_to_1 - 0.88798




 12%|█▏        | 22/178 [11:52<1:20:33, 30.98s/it][A[A

column: distance_7th_closest_to_1 - 0.87393




 13%|█▎        | 23/178 [12:23<1:19:50, 30.90s/it][A[A

column: distance_8th_closest_to_1 - 1.0455




 13%|█▎        | 24/178 [12:53<1:18:18, 30.51s/it][A[A

column: distance_9th_closest_to_1 - 0.92209




 14%|█▍        | 25/178 [13:24<1:18:27, 30.77s/it][A[A

column: distance_10th_closest_to_1 - 0.88003




 15%|█▍        | 26/178 [13:58<1:20:25, 31.74s/it][A[A

column: closest_to_0_atomic_mass - 0.61331




 15%|█▌        | 27/178 [14:28<1:18:45, 31.30s/it][A[A

column: closest_to_0_valence - 1.6835




 16%|█▌        | 28/178 [14:59<1:17:46, 31.11s/it][A[A

column: closest_to_0_spin_multiplicity - 0.62148




 16%|█▋        | 29/178 [15:29<1:16:39, 30.87s/it][A[A

column: 2nd_closest_to_0_atomic_mass - 2.1351




 17%|█▋        | 30/178 [15:58<1:14:56, 30.38s/it][A[A

column: 2nd_closest_to_0_valence - 0.6762




 17%|█▋        | 31/178 [16:31<1:16:11, 31.10s/it][A[A

column: 2nd_closest_to_0_spin_multiplicity - 0.61361




 18%|█▊        | 32/178 [17:00<1:13:40, 30.28s/it][A[A

column: 3rd_closest_to_0_atomic_mass - 0.65557




 19%|█▊        | 33/178 [17:32<1:14:27, 30.81s/it][A[A

column: 3rd_closest_to_0_valence - 1.0774




 19%|█▉        | 34/178 [18:02<1:13:32, 30.64s/it][A[A

column: 3rd_closest_to_0_spin_multiplicity - 0.61615




 20%|█▉        | 35/178 [18:33<1:13:15, 30.74s/it][A[A

column: 4th_closest_to_0_atomic_mass - 0.63553




 20%|██        | 36/178 [19:03<1:12:02, 30.44s/it][A[A

column: 4th_closest_to_0_valence - 0.71002
