# Permutation Importance on Model M040
- Per Type, find features which do not help the model

In [2]:
import pandas as pd
import catboost
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import numpy as np
import gc

In [3]:
def permutation_importance(model, X_val, y_val, metric, threshold=0.005,
                           minimize=True, verbose=True):
    """
    Reference: https://www.kaggle.com/speedwagon/permutation-importance
    
    Modified by Rob
    """
    results = {}
    
    y_pred = model.predict(X_val)
    
    results['base_score'] = metric(y_val, y_pred)
    if verbose:
        print(f'Base score {results["base_score"]:.5}')

    
    for col in tqdm(X_val.columns):
        freezed_col = X_val[col].copy()

        X_val[col] = np.random.permutation(X_val[col])
        preds = model.predict(X_val, thread_count=20)
        results[col] = metric(y_val, preds)

        X_val[col] = freezed_col
        
        if verbose:
            print(f'column: {col} - {results[col]:.5}')
    
    if minimize:
        bad_features = [k for k in results if results[k] < results['base_score'] + threshold]
    else:
        bad_features = [k for k in results if results[k] > results['base_score'] + threshold]
    bad_features.remove('base_score')
    
    return results, bad_features

In [12]:
types = ['3JHN', '2JHC', '3JHC', '2JHH', '2JHN', '1JHN', '3JHH', '1JHC']
results_dict = {}
bad_features_dict = {}
for t in types:
    train = pd.read_parquet(f'../data/FE016/FE016-train-{t}.parquet')
    x = pd.read_parquet('../data/FE008_train.parquet') # only loading for skeleton not features
    model = catboost.CatBoostRegressor()
    model.load_model('../models/M040/M040-0708_1639-{}-{}.model'.format(t, 1))
    mol_group = x[['molecule_name','type']].copy()
    N_FOLDS = 2
    folds = GroupKFold(n_splits=N_FOLDS)
    X = train[model.feature_names_]
    y = train['scalar_coupling_constant']
    mol_group_type = mol_group.loc[mol_group['type'] == t]['molecule_name']
    for fold_n, (train_idx, valid_idx) in enumerate(folds.split(X, groups=mol_group_type)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        X_valid = X_valid.copy()
        results, bad_features = permutation_importance(model, X_valid, y_valid, metric=mean_absolute_error)
        results_dict[f] = results
        bad_features_dict[f] = bad_features
        break # Only run for the first fold
    print(f'Bad features for {t}:')
    print(bad_features)
    del train, x, X, y, mol_group_type
    gc.collect()

  0%|          | 0/214 [00:00<?, ?it/s]

Base score 0.11625


  0%|          | 1/214 [00:08<30:08,  8.49s/it]

column: atom1_valence - 0.13909


  1%|          | 2/214 [00:17<30:37,  8.67s/it]

column: distance - 0.3041


  1%|▏         | 3/214 [00:26<30:46,  8.75s/it]

column: distance_closest_to_0 - 0.13149


  2%|▏         | 4/214 [00:33<28:43,  8.21s/it]

column: distance_2nd_closest_to_0 - 0.20351


  2%|▏         | 5/214 [00:42<29:28,  8.46s/it]

column: distance_3rd_closest_to_0 - 0.13593


  3%|▎         | 6/214 [00:51<29:40,  8.56s/it]

column: distance_4th_closest_to_0 - 0.13593


  3%|▎         | 7/214 [00:57<27:36,  8.00s/it]

column: distance_5th_closest_to_0 - 0.12419


  4%|▎         | 8/214 [01:04<25:58,  7.57s/it]

column: distance_6th_closest_to_0 - 0.12048


  4%|▍         | 9/214 [01:12<26:33,  7.77s/it]

column: distance_7th_closest_to_0 - 0.11972


  5%|▍         | 10/214 [01:19<25:38,  7.54s/it]

column: distance_8th_closest_to_0 - 0.11868


  5%|▌         | 11/214 [01:26<24:28,  7.23s/it]

column: distance_9th_closest_to_0 - 0.11977


  6%|▌         | 12/214 [01:32<23:28,  6.97s/it]

column: distance_10th_closest_to_0 - 0.11788


  6%|▌         | 13/214 [01:39<23:33,  7.03s/it]

column: distance_closest_to_1 - 0.12886


  7%|▋         | 14/214 [01:46<23:01,  6.91s/it]

column: distance_2nd_closest_to_1 - 0.12686


  7%|▋         | 15/214 [01:53<23:00,  6.94s/it]

column: distance_3rd_closest_to_1 - 0.20133


  7%|▋         | 16/214 [02:02<24:57,  7.57s/it]

column: distance_4th_closest_to_1 - 0.12765


  8%|▊         | 17/214 [02:10<25:33,  7.78s/it]

column: distance_5th_closest_to_1 - 0.1375


  8%|▊         | 18/214 [02:17<24:07,  7.39s/it]

column: distance_6th_closest_to_1 - 0.12152


  9%|▉         | 19/214 [02:25<24:38,  7.58s/it]

column: distance_7th_closest_to_1 - 0.12805


  9%|▉         | 20/214 [02:33<24:44,  7.65s/it]

column: distance_8th_closest_to_1 - 0.12095


 10%|▉         | 21/214 [02:39<23:29,  7.30s/it]

column: distance_9th_closest_to_1 - 0.11988


 10%|█         | 22/214 [02:47<23:31,  7.35s/it]

column: distance_10th_closest_to_1 - 0.11946


 11%|█         | 23/214 [02:55<24:17,  7.63s/it]

column: closest_to_0_atomic_mass - 0.11625


 11%|█         | 24/214 [03:02<23:28,  7.42s/it]

column: closest_to_0_valence - 0.14161


 12%|█▏        | 25/214 [03:09<23:34,  7.49s/it]

column: closest_to_0_spin_multiplicity - 0.11658


 12%|█▏        | 26/214 [03:16<23:02,  7.35s/it]

column: 2nd_closest_to_0_atomic_mass - 0.118


 13%|█▎        | 27/214 [03:23<22:08,  7.10s/it]

column: 2nd_closest_to_0_valence - 0.12872


 13%|█▎        | 28/214 [03:30<22:13,  7.17s/it]

column: 2nd_closest_to_0_spin_multiplicity - 0.11652


 14%|█▎        | 29/214 [03:36<21:06,  6.85s/it]

column: 3rd_closest_to_0_atomic_mass - 0.12057


 14%|█▍        | 30/214 [03:43<20:29,  6.68s/it]

column: 3rd_closest_to_0_valence - 0.12802


 14%|█▍        | 31/214 [03:49<19:40,  6.45s/it]

column: 3rd_closest_to_0_spin_multiplicity - 0.11644


 15%|█▍        | 32/214 [03:54<18:50,  6.21s/it]

column: 4th_closest_to_0_atomic_mass - 0.11678


 15%|█▌        | 33/214 [04:00<18:15,  6.05s/it]

column: 4th_closest_to_0_valence - 0.12205


 16%|█▌        | 34/214 [04:05<17:34,  5.86s/it]

column: 4th_closest_to_0_spin_multiplicity - 0.1167


 16%|█▋        | 35/214 [04:10<16:47,  5.63s/it]

column: 5th_closest_to_0_atomic_mass - 0.11672


 17%|█▋        | 36/214 [04:15<15:30,  5.23s/it]

column: 5th_closest_to_0_valence - 0.11645


 17%|█▋        | 37/214 [04:19<14:38,  4.97s/it]

column: 5th_closest_to_0_spin_multiplicity - 0.11625


 18%|█▊        | 38/214 [04:24<14:05,  4.80s/it]

column: 6th_closest_to_0_atomic_mass - 0.1164


 18%|█▊        | 39/214 [04:31<16:46,  5.75s/it]

column: 6th_closest_to_0_valence - 0.11649


 19%|█▊        | 40/214 [04:38<17:30,  6.04s/it]

column: 6th_closest_to_0_spin_multiplicity - 0.11645


 19%|█▉        | 41/214 [04:45<18:16,  6.34s/it]

column: 7th_closest_to_0_atomic_mass - 0.11634


 20%|█▉        | 42/214 [04:52<18:55,  6.60s/it]

column: 7th_closest_to_0_valence - 0.11651


 20%|██        | 43/214 [05:01<20:23,  7.15s/it]

column: 7th_closest_to_0_spin_multiplicity - 0.11631


 21%|██        | 44/214 [05:08<20:08,  7.11s/it]

column: 8th_closest_to_0_atomic_mass - 0.11636


 21%|██        | 45/214 [05:15<20:23,  7.24s/it]

column: 8th_closest_to_0_valence - 0.11646


 21%|██▏       | 46/214 [05:23<20:37,  7.37s/it]

column: 8th_closest_to_0_spin_multiplicity - 0.11631


 22%|██▏       | 47/214 [05:31<20:44,  7.45s/it]

column: 9th_closest_to_0_atomic_mass - 0.11631


 22%|██▏       | 48/214 [05:39<21:16,  7.69s/it]

column: 9th_closest_to_0_valence - 0.11651


 23%|██▎       | 49/214 [05:47<21:44,  7.90s/it]

column: 9th_closest_to_0_spin_multiplicity - 0.11629


 23%|██▎       | 50/214 [05:56<22:19,  8.17s/it]

column: 10th_closest_to_0_atomic_mass - 0.11628


 24%|██▍       | 51/214 [06:04<22:19,  8.22s/it]

column: 10th_closest_to_0_valence - 0.11654


 24%|██▍       | 52/214 [06:12<21:50,  8.09s/it]

column: 10th_closest_to_0_spin_multiplicity - 0.11628


 25%|██▍       | 53/214 [06:21<21:50,  8.14s/it]

column: closest_to_1_atomic_mass - 0.11642


 25%|██▌       | 54/214 [06:29<21:47,  8.17s/it]

column: closest_to_1_valence - 0.12362


 26%|██▌       | 55/214 [06:37<21:37,  8.16s/it]

column: closest_to_1_spin_multiplicity - 0.11702


 26%|██▌       | 56/214 [06:45<21:44,  8.26s/it]

column: 2nd_closest_to_1_atomic_mass - 0.11952


 27%|██▋       | 57/214 [06:53<20:41,  7.91s/it]

column: 2nd_closest_to_1_valence - 0.12439


 27%|██▋       | 58/214 [06:59<19:31,  7.51s/it]

column: 2nd_closest_to_1_spin_multiplicity - 0.11666


 28%|██▊       | 59/214 [07:07<19:51,  7.68s/it]

column: 3rd_closest_to_1_atomic_mass - 0.11628


 28%|██▊       | 60/214 [07:14<19:17,  7.52s/it]

column: 3rd_closest_to_1_valence - 0.11685


 29%|██▊       | 61/214 [07:21<18:21,  7.20s/it]

column: 3rd_closest_to_1_spin_multiplicity - 0.11631


 29%|██▉       | 62/214 [07:28<18:08,  7.16s/it]

column: 4th_closest_to_1_atomic_mass - 0.11759


 29%|██▉       | 63/214 [07:35<17:46,  7.07s/it]

column: 4th_closest_to_1_valence - 0.12591


 30%|██▉       | 64/214 [07:42<17:52,  7.15s/it]

column: 4th_closest_to_1_spin_multiplicity - 0.11627


 30%|███       | 65/214 [07:49<17:22,  7.00s/it]

column: 5th_closest_to_1_atomic_mass - 0.11792


 31%|███       | 66/214 [07:55<16:47,  6.81s/it]

column: 5th_closest_to_1_valence - 0.1178


 31%|███▏      | 67/214 [08:02<16:49,  6.87s/it]

column: 5th_closest_to_1_spin_multiplicity - 0.11631


 32%|███▏      | 68/214 [08:10<17:28,  7.18s/it]

column: 6th_closest_to_1_atomic_mass - 0.1167


 32%|███▏      | 69/214 [08:18<18:08,  7.50s/it]

column: 6th_closest_to_1_valence - 0.11687


 33%|███▎      | 70/214 [08:24<17:02,  7.10s/it]

column: 6th_closest_to_1_spin_multiplicity - 0.11637


 33%|███▎      | 71/214 [08:32<17:22,  7.29s/it]

column: 7th_closest_to_1_atomic_mass - 0.11662


 34%|███▎      | 72/214 [08:39<16:43,  7.07s/it]

column: 7th_closest_to_1_valence - 0.11676


 34%|███▍      | 73/214 [08:46<16:50,  7.17s/it]

column: 7th_closest_to_1_spin_multiplicity - 0.1163


 35%|███▍      | 74/214 [08:53<16:18,  6.99s/it]

column: 8th_closest_to_1_atomic_mass - 0.11659


 35%|███▌      | 75/214 [09:00<16:23,  7.08s/it]

column: 8th_closest_to_1_valence - 0.11645


 36%|███▌      | 76/214 [09:07<15:57,  6.94s/it]

column: 8th_closest_to_1_spin_multiplicity - 0.11631


 36%|███▌      | 77/214 [09:14<16:31,  7.24s/it]

column: 9th_closest_to_1_atomic_mass - 0.11635


 36%|███▋      | 78/214 [09:21<16:09,  7.13s/it]

column: 9th_closest_to_1_valence - 0.11632


 37%|███▋      | 79/214 [09:29<16:36,  7.38s/it]

column: 9th_closest_to_1_spin_multiplicity - 0.11629


 37%|███▋      | 80/214 [09:37<16:35,  7.43s/it]

column: 10th_closest_to_1_atomic_mass - 0.11637


 38%|███▊      | 81/214 [09:44<16:32,  7.46s/it]

column: 10th_closest_to_1_valence - 0.11633


 38%|███▊      | 82/214 [09:51<15:48,  7.19s/it]

column: 10th_closest_to_1_spin_multiplicity - 0.11627


 39%|███▉      | 83/214 [09:59<16:20,  7.48s/it]

column: mol_wt - 0.11687


 39%|███▉      | 84/214 [10:05<15:15,  7.04s/it]

column: num_atoms - 0.11643


 40%|███▉      | 85/214 [10:12<15:01,  6.99s/it]

column: num_bonds - 0.1164


 40%|████      | 86/214 [10:19<15:05,  7.08s/it]

column: closest_to_0_dist_x_atomic_mass - 0.13449


 41%|████      | 87/214 [10:27<15:31,  7.34s/it]

column: 2nd_closest_to_0_dist_x_atomic_mass - 0.23125


 41%|████      | 88/214 [10:34<15:11,  7.24s/it]

column: 3rd_closest_to_0_dist_x_atomic_mass - 0.2054


 42%|████▏     | 89/214 [10:42<15:32,  7.46s/it]

column: 4th_closest_to_0_dist_x_atomic_mass - 0.13315


 42%|████▏     | 90/214 [10:51<16:11,  7.84s/it]

column: 5th_closest_to_0_dist_x_atomic_mass - 0.11989


 43%|████▎     | 91/214 [10:57<15:15,  7.44s/it]

column: 6th_closest_to_0_dist_x_atomic_mass - 0.11874


 43%|████▎     | 92/214 [11:04<14:52,  7.31s/it]

column: 7th_closest_to_0_dist_x_atomic_mass - 0.1184


 43%|████▎     | 93/214 [11:12<14:48,  7.34s/it]

column: 8th_closest_to_0_dist_x_atomic_mass - 0.11806


 44%|████▍     | 94/214 [11:20<14:55,  7.46s/it]

column: 9th_closest_to_0_dist_x_atomic_mass - 0.11767


 44%|████▍     | 95/214 [11:26<14:16,  7.19s/it]

column: 10th_closest_to_0_dist_x_atomic_mass - 0.11753


 45%|████▍     | 96/214 [11:33<13:57,  7.09s/it]

column: closest_to_1_dist_x_atomic_mass - 0.12097


 45%|████▌     | 97/214 [11:41<14:06,  7.23s/it]

column: 2nd_closest_to_1_dist_x_atomic_mass - 0.13344


 46%|████▌     | 98/214 [11:48<14:03,  7.27s/it]

column: 3rd_closest_to_1_dist_x_atomic_mass - 0.12698


 46%|████▋     | 99/214 [11:56<14:21,  7.49s/it]

column: 4th_closest_to_1_dist_x_atomic_mass - 0.12312


 47%|████▋     | 100/214 [12:04<14:35,  7.68s/it]

column: 5th_closest_to_1_dist_x_atomic_mass - 0.12303


 47%|████▋     | 101/214 [12:11<14:02,  7.46s/it]

column: 6th_closest_to_1_dist_x_atomic_mass - 0.12095


 48%|████▊     | 102/214 [12:20<14:31,  7.78s/it]

column: 7th_closest_to_1_dist_x_atomic_mass - 0.11897


 48%|████▊     | 103/214 [12:27<14:10,  7.66s/it]

column: 8th_closest_to_1_dist_x_atomic_mass - 0.11847


 49%|████▊     | 104/214 [12:35<14:24,  7.86s/it]

column: 9th_closest_to_1_dist_x_atomic_mass - 0.11812


 49%|████▉     | 105/214 [12:42<13:39,  7.52s/it]

column: 10th_closest_to_1_dist_x_atomic_mass - 0.11784


 50%|████▉     | 106/214 [12:49<13:31,  7.52s/it]

column: angle_clos_0_2nd - 0.13609


 50%|█████     | 107/214 [12:58<13:49,  7.75s/it]

column: angle_clos_1_2nd - 0.13391


 50%|█████     | 108/214 [13:04<13:06,  7.42s/it]

column: N1 - 0.11698


 51%|█████     | 109/214 [13:11<12:33,  7.17s/it]

column: N2 - 0.11643


 51%|█████▏    | 110/214 [13:18<12:12,  7.04s/it]

column: link0 - 0.11836


 52%|█████▏    | 111/214 [13:25<12:17,  7.16s/it]

column: link1 - 0.11831


 52%|█████▏    | 112/214 [13:31<11:37,  6.84s/it]

column: linkN - 0.11749


 53%|█████▎    | 113/214 [13:38<11:41,  6.94s/it]

column: inv_dist0 - 0.12033


 53%|█████▎    | 114/214 [13:46<11:43,  7.03s/it]

column: inv_dist1 - 0.11959


 54%|█████▎    | 115/214 [13:52<11:25,  6.93s/it]

column: inv_distP - 0.11775


 54%|█████▍    | 116/214 [14:00<11:28,  7.02s/it]

column: inv_dist0R - 0.12785


 55%|█████▍    | 117/214 [14:07<11:33,  7.15s/it]

column: inv_dist1R - 0.13523


 55%|█████▌    | 118/214 [14:15<11:45,  7.35s/it]

column: inv_distPR - 0.12789


 56%|█████▌    | 119/214 [14:22<11:19,  7.15s/it]

column: inv_dist0E - 0.11817


 56%|█████▌    | 120/214 [14:30<11:36,  7.41s/it]

column: inv_dist1E - 0.11844


 57%|█████▋    | 121/214 [14:37<11:27,  7.40s/it]

column: inv_distPE - 0.11802


 57%|█████▋    | 122/214 [14:45<11:43,  7.64s/it]

column: linkM0 - 0.11927


 57%|█████▋    | 123/214 [14:54<11:54,  7.85s/it]

column: linkM1 - 0.12002


 58%|█████▊    | 124/214 [15:00<11:21,  7.57s/it]

column: min_molecule_atom_0_dist_xyz - 0.13782


 58%|█████▊    | 125/214 [15:08<11:04,  7.46s/it]

column: mean_molecule_atom_0_dist_xyz - 0.12398


 59%|█████▉    | 126/214 [15:16<11:14,  7.66s/it]

column: max_molecule_atom_0_dist_xyz - 0.14451


 59%|█████▉    | 127/214 [15:24<11:18,  7.79s/it]

column: sd_molecule_atom_0_dist_xyz - 0.11767


 60%|█████▉    | 128/214 [15:32<11:25,  7.98s/it]

column: min_molecule_atom_1_dist_xyz - 0.12118


 60%|██████    | 129/214 [15:40<11:22,  8.03s/it]

column: mean_molecule_atom_1_dist_xyz - 0.11783


 61%|██████    | 130/214 [15:49<11:32,  8.25s/it]

column: max_molecule_atom_1_dist_xyz - 0.12216


 61%|██████    | 131/214 [15:56<10:40,  7.71s/it]

column: sd_molecule_atom_1_dist_xyz - 0.11922


 62%|██████▏   | 132/214 [16:03<10:33,  7.72s/it]

column: coulomb_C.x - 0.11685


 62%|██████▏   | 133/214 [16:11<10:18,  7.64s/it]

column: coulomb_F.x - 0.11627


 63%|██████▎   | 134/214 [16:19<10:19,  7.75s/it]

column: coulomb_H.x - 0.11743


 63%|██████▎   | 135/214 [16:27<10:10,  7.73s/it]

column: coulomb_N.x - 0.11842


 64%|██████▎   | 136/214 [16:35<10:10,  7.83s/it]

column: coulomb_O.x - 0.11777


 64%|██████▍   | 137/214 [16:43<10:12,  7.95s/it]

column: yukawa_C.x - 0.12836


 64%|██████▍   | 138/214 [16:50<09:54,  7.82s/it]

column: yukawa_F.x - 0.11667


 65%|██████▍   | 139/214 [16:57<09:26,  7.56s/it]

column: yukawa_H.x - 0.11874


 65%|██████▌   | 140/214 [17:04<09:05,  7.37s/it]

column: yukawa_N.x - 0.12345


 66%|██████▌   | 141/214 [17:11<08:48,  7.24s/it]

column: yukawa_O.x - 0.1246


 66%|██████▋   | 142/214 [17:17<08:09,  6.79s/it]

column: coulomb_C.y - 0.117


 67%|██████▋   | 143/214 [17:23<07:41,  6.50s/it]

column: coulomb_F.y - 0.11628


 67%|██████▋   | 144/214 [17:30<07:43,  6.63s/it]

column: coulomb_H.y - 0.11715


 68%|██████▊   | 145/214 [17:37<07:53,  6.86s/it]

column: coulomb_N.y - 0.11804


 68%|██████▊   | 146/214 [17:43<07:29,  6.61s/it]

column: coulomb_O.y - 0.11799


 69%|██████▊   | 147/214 [17:49<07:07,  6.39s/it]

column: yukawa_C.y - 0.12231


 69%|██████▉   | 148/214 [17:55<07:00,  6.37s/it]

column: yukawa_F.y - 0.11663


 70%|██████▉   | 149/214 [18:01<06:51,  6.33s/it]

column: yukawa_H.y - 0.12095


 70%|███████   | 150/214 [18:07<06:24,  6.00s/it]

column: yukawa_N.y - 0.11964


 71%|███████   | 151/214 [18:12<06:01,  5.73s/it]

column: yukawa_O.y - 0.126


 71%|███████   | 152/214 [18:16<05:35,  5.41s/it]

column: distN0 - 0.12919


 71%|███████▏  | 153/214 [18:21<05:19,  5.23s/it]

column: distN1 - 0.11763


 72%|███████▏  | 154/214 [18:26<05:01,  5.02s/it]

column: adH1 - 0.11805


 72%|███████▏  | 155/214 [18:30<04:45,  4.83s/it]

column: adH2 - 0.11734


 73%|███████▎  | 156/214 [18:35<04:32,  4.69s/it]

column: adH3 - 0.11647


 73%|███████▎  | 157/214 [18:39<04:23,  4.62s/it]

column: adH4 - 0.11632


 74%|███████▍  | 158/214 [18:43<04:11,  4.49s/it]

column: adC1 - 0.12824


 74%|███████▍  | 159/214 [18:47<03:59,  4.36s/it]

column: adC2 - 0.16828


 75%|███████▍  | 160/214 [18:51<03:52,  4.31s/it]

column: adC3 - 0.12755


 75%|███████▌  | 161/214 [18:56<03:45,  4.26s/it]

column: adC4 - 0.11969


 76%|███████▌  | 162/214 [19:00<03:37,  4.19s/it]

column: adN1 - 0.11832


 76%|███████▌  | 163/214 [19:04<03:34,  4.20s/it]

column: adN2 - 0.11671


 77%|███████▋  | 164/214 [19:08<03:35,  4.31s/it]

column: adN3 - 0.11628


 77%|███████▋  | 165/214 [19:13<03:29,  4.27s/it]

column: adN4 - 0.11625


 78%|███████▊  | 166/214 [19:17<03:29,  4.36s/it]

column: NC - 0.11631


 78%|███████▊  | 167/214 [19:22<03:25,  4.37s/it]

column: NH - 0.11688


 79%|███████▊  | 168/214 [19:26<03:24,  4.45s/it]

column: NN - 0.11635


 79%|███████▉  | 169/214 [19:31<03:23,  4.52s/it]

column: NF - 0.11625


 79%|███████▉  | 170/214 [19:36<03:24,  4.64s/it]

column: NO - 0.11718


 80%|███████▉  | 171/214 [19:40<03:19,  4.64s/it]

column: angle_1_0_closest0 - 0.19627


 80%|████████  | 172/214 [19:45<03:14,  4.63s/it]

column: angle_1_0_2nd0 - 0.16104


 81%|████████  | 173/214 [19:50<03:08,  4.61s/it]

column: angle_1_0_3rd0 - 0.1548


 81%|████████▏ | 174/214 [19:54<03:06,  4.66s/it]

column: angle_1_0_4th0 - 0.12268


 82%|████████▏ | 175/214 [19:59<03:01,  4.65s/it]

column: angle_1_0_5th0 - 0.11938


 82%|████████▏ | 176/214 [20:04<02:57,  4.66s/it]

column: angle_0_1_closest1 - 0.25552


 83%|████████▎ | 177/214 [20:08<02:49,  4.57s/it]

column: angle_0_1_2nd1 - 0.18178


 83%|████████▎ | 178/214 [20:13<02:43,  4.55s/it]

column: angle_0_1_3rd1 - 0.15624


 84%|████████▎ | 179/214 [20:17<02:38,  4.52s/it]

column: angle_0_1_4th1 - 0.12526


 84%|████████▍ | 180/214 [20:21<02:30,  4.43s/it]

column: angle_0_1_5th1 - 0.12218


 85%|████████▍ | 181/214 [20:26<02:30,  4.55s/it]

column: angle_1_0_6th0 - 0.11837


 85%|████████▌ | 182/214 [20:31<02:25,  4.54s/it]

column: angle_1_0_7th0 - 0.11772


 86%|████████▌ | 183/214 [20:35<02:20,  4.54s/it]

column: angle_1_0_8th0 - 0.11872


 86%|████████▌ | 184/214 [20:40<02:15,  4.50s/it]

column: angle_1_0_9th0 - 0.11729


 86%|████████▋ | 185/214 [20:44<02:08,  4.45s/it]

column: angle_1_0_10th0 - 0.11735


 87%|████████▋ | 186/214 [20:48<02:05,  4.47s/it]

column: angle_1_0_11th0 - 0.11763


 87%|████████▋ | 187/214 [20:53<02:02,  4.52s/it]

column: angle_1_0_6th1 - 0.11999


 88%|████████▊ | 188/214 [20:58<01:57,  4.54s/it]

column: angle_1_0_7th1 - 0.11817


 88%|████████▊ | 189/214 [21:02<01:50,  4.42s/it]

column: angle_1_0_8th1 - 0.11859


 89%|████████▉ | 190/214 [21:06<01:44,  4.34s/it]

column: angle_1_0_9th1 - 0.11816


 89%|████████▉ | 191/214 [21:10<01:39,  4.31s/it]

column: angle_1_0_10th1 - 0.11782


 90%|████████▉ | 192/214 [21:15<01:37,  4.41s/it]

column: angle_1_0_11th1 - 0.11703


 90%|█████████ | 193/214 [21:19<01:32,  4.40s/it]

column: angle_0_closest0_1 - 0.79014


 91%|█████████ | 194/214 [21:23<01:26,  4.34s/it]

column: angle_0_2nd0_1 - 0.23741


 91%|█████████ | 195/214 [21:27<01:21,  4.26s/it]

column: angle_0_3rd0_1 - 0.1536


 92%|█████████▏| 196/214 [21:32<01:17,  4.31s/it]

column: angle_0_4th0_1 - 0.13117


 92%|█████████▏| 197/214 [21:36<01:14,  4.38s/it]

column: angle_0_5th0_1 - 0.12212


 93%|█████████▎| 198/214 [21:41<01:11,  4.50s/it]

column: angle_0_6th0_1 - 0.11902


 93%|█████████▎| 199/214 [21:46<01:07,  4.52s/it]

column: angle_0_7th0_1 - 0.11757


 93%|█████████▎| 200/214 [21:50<01:03,  4.52s/it]

column: angle_0_8th0_1 - 0.11746


 94%|█████████▍| 201/214 [21:55<00:57,  4.45s/it]

column: angle_0_9th0_1 - 0.11708


 94%|█████████▍| 202/214 [21:59<00:52,  4.36s/it]

column: angle_0_10th0_1 - 0.11716


 95%|█████████▍| 203/214 [22:03<00:47,  4.29s/it]

column: angle_0_11th0_1 - 0.11702


 95%|█████████▌| 204/214 [22:07<00:42,  4.20s/it]

column: angle_0_closest1_1 - 0.19606


 96%|█████████▌| 205/214 [22:11<00:38,  4.29s/it]

column: angle_0_2nd1_1 - 0.1735


 96%|█████████▋| 206/214 [22:16<00:34,  4.35s/it]

column: angle_0_3rd1_1 - 0.1312


 97%|█████████▋| 207/214 [22:20<00:30,  4.39s/it]

column: angle_0_4th1_1 - 0.12001


 97%|█████████▋| 208/214 [22:25<00:26,  4.46s/it]

column: angle_0_5th1_1 - 0.11855


 98%|█████████▊| 209/214 [22:29<00:22,  4.47s/it]

column: angle_0_6th1_1 - 0.12119


 98%|█████████▊| 210/214 [22:34<00:17,  4.37s/it]

column: angle_0_7th1_1 - 0.1196


 99%|█████████▊| 211/214 [22:38<00:13,  4.40s/it]

column: angle_0_8th1_1 - 0.1178


 99%|█████████▉| 212/214 [22:42<00:08,  4.36s/it]

column: angle_0_9th1_1 - 0.11665


100%|█████████▉| 213/214 [22:47<00:04,  4.37s/it]

column: angle_0_10th1_1 - 0.11644


100%|██████████| 214/214 [22:51<00:00,  4.37s/it]

column: angle_0_11th1_1 - 0.1163





NameError: name 'f' is not defined

In [13]:
results_dict[t] = results
bad_features_dict[t] = bad_features

In [14]:
print(f'Bad features for {t}:')
print(bad_features)

Bad features for 3JHN:
['distance_6th_closest_to_0', 'distance_7th_closest_to_0', 'distance_8th_closest_to_0', 'distance_9th_closest_to_0', 'distance_10th_closest_to_0', 'distance_8th_closest_to_1', 'distance_9th_closest_to_1', 'distance_10th_closest_to_1', 'closest_to_0_atomic_mass', 'closest_to_0_spin_multiplicity', '2nd_closest_to_0_atomic_mass', '2nd_closest_to_0_spin_multiplicity', '3rd_closest_to_0_atomic_mass', '3rd_closest_to_0_spin_multiplicity', '4th_closest_to_0_atomic_mass', '4th_closest_to_0_spin_multiplicity', '5th_closest_to_0_atomic_mass', '5th_closest_to_0_valence', '5th_closest_to_0_spin_multiplicity', '6th_closest_to_0_atomic_mass', '6th_closest_to_0_valence', '6th_closest_to_0_spin_multiplicity', '7th_closest_to_0_atomic_mass', '7th_closest_to_0_valence', '7th_closest_to_0_spin_multiplicity', '8th_closest_to_0_atomic_mass', '8th_closest_to_0_valence', '8th_closest_to_0_spin_multiplicity', '9th_closest_to_0_atomic_mass', '9th_closest_to_0_valence', '9th_closest_to_0_

In [15]:
results

{'base_score': 0.11624966229611856,
 'atom1_valence': 0.13908507750474156,
 'distance': 0.30409684102762097,
 'distance_closest_to_0': 0.1314930921257668,
 'distance_2nd_closest_to_0': 0.2035061385958581,
 'distance_3rd_closest_to_0': 0.13593251497072856,
 'distance_4th_closest_to_0': 0.13592902237180302,
 'distance_5th_closest_to_0': 0.12418844171562442,
 'distance_6th_closest_to_0': 0.12048359490026912,
 'distance_7th_closest_to_0': 0.1197150686224358,
 'distance_8th_closest_to_0': 0.11867837399979411,
 'distance_9th_closest_to_0': 0.11976977457551821,
 'distance_10th_closest_to_0': 0.11788356951574526,
 'distance_closest_to_1': 0.12886166641921612,
 'distance_2nd_closest_to_1': 0.1268584643965523,
 'distance_3rd_closest_to_1': 0.2013288641931263,
 'distance_4th_closest_to_1': 0.1276491289447391,
 'distance_5th_closest_to_1': 0.137498661570282,
 'distance_6th_closest_to_1': 0.12151785667805233,
 'distance_7th_closest_to_1': 0.12805490765777391,
 'distance_8th_closest_to_1': 0.1209469

In [None]:
types = ['2JHC', '3JHC', '2JHH', '2JHN', '1JHN', '3JHH', '1JHC']
# results_dict = {}
# bad_features_dict = {}
for t in types:
    train = pd.read_parquet(f'../data/FE016/FE016-train-{t}.parquet')
    x = pd.read_parquet('../data/FE008_train.parquet') # only loading for skeleton not features
    model = catboost.CatBoostRegressor()
    model.load_model('../models/M040/M040-0708_1639-{}-{}.model'.format(t, 1))
    mol_group = x[['molecule_name','type']].copy()
    N_FOLDS = 2
    folds = GroupKFold(n_splits=N_FOLDS)
    X = train[model.feature_names_]
    y = train['scalar_coupling_constant']
    mol_group_type = mol_group.loc[mol_group['type'] == t]['molecule_name']
    for fold_n, (train_idx, valid_idx) in enumerate(folds.split(X, groups=mol_group_type)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        X_valid = X_valid.copy()
        results, bad_features = permutation_importance(model, X_valid, y_valid, metric=mean_absolute_error)
        results_dict[t] = results
        bad_features_dict[t] = bad_features
        break # Only run for the first fold
    print(f'Bad features for {t}:')
    print(bad_features)
    del train, x, X, y, mol_group_type
    gc.collect()

  0%|          | 0/220 [00:00<?, ?it/s]

Base score 0.2185


  0%|          | 1/220 [01:21<4:56:49, 81.32s/it]

column: atom1_valence - 0.72811


  1%|          | 2/220 [02:43<4:56:14, 81.53s/it]

column: left_middle_average_angle - 0.2488


  1%|▏         | 3/220 [04:06<4:56:25, 81.96s/it]

column: right_middle_average_angle - 0.28633


  2%|▏         | 4/220 [05:26<4:53:12, 81.45s/it]

column: distance - 0.37925


  2%|▏         | 5/220 [06:45<4:49:39, 80.84s/it]

column: distance_closest_to_0 - 0.27509


  3%|▎         | 6/220 [08:05<4:46:59, 80.47s/it]

column: distance_2nd_closest_to_0 - 0.3058


  3%|▎         | 7/220 [09:23<4:43:19, 79.81s/it]

column: distance_3rd_closest_to_0 - 0.283


  4%|▎         | 8/220 [10:40<4:38:17, 78.76s/it]

column: distance_4th_closest_to_0 - 0.26887


  4%|▍         | 9/220 [12:00<4:38:22, 79.16s/it]

column: distance_5th_closest_to_0 - 0.24611


  5%|▍         | 10/220 [13:21<4:38:55, 79.69s/it]

column: distance_6th_closest_to_0 - 0.23454


  5%|▌         | 11/220 [14:36<4:33:27, 78.50s/it]

column: distance_7th_closest_to_0 - 0.23964


  5%|▌         | 12/220 [15:55<4:31:54, 78.44s/it]

column: distance_8th_closest_to_0 - 0.23404


  6%|▌         | 13/220 [17:19<4:36:34, 80.17s/it]

column: distance_9th_closest_to_0 - 0.22677


  6%|▋         | 14/220 [18:38<4:33:52, 79.77s/it]

column: distance_10th_closest_to_0 - 0.22592


  7%|▋         | 15/220 [19:58<4:33:11, 79.96s/it]

column: distance_closest_to_1 - 0.27569


  7%|▋         | 16/220 [21:19<4:32:37, 80.18s/it]

column: distance_2nd_closest_to_1 - 0.30652


  8%|▊         | 17/220 [22:37<4:29:28, 79.65s/it]

column: distance_3rd_closest_to_1 - 0.26993


  8%|▊         | 18/220 [23:57<4:28:39, 79.80s/it]

column: distance_4th_closest_to_1 - 0.2877


  9%|▊         | 19/220 [25:15<4:25:18, 79.20s/it]

column: distance_5th_closest_to_1 - 0.24876


  9%|▉         | 20/220 [26:37<4:27:00, 80.10s/it]

column: distance_6th_closest_to_1 - 0.23154


 10%|▉         | 21/220 [28:01<4:29:29, 81.25s/it]

column: distance_7th_closest_to_1 - 0.22864


 10%|█         | 22/220 [29:23<4:28:41, 81.42s/it]

column: distance_8th_closest_to_1 - 0.23046


 10%|█         | 23/220 [30:43<4:25:46, 80.95s/it]

column: distance_9th_closest_to_1 - 0.22797


 11%|█         | 24/220 [32:05<4:25:28, 81.27s/it]

column: distance_10th_closest_to_1 - 0.22939


 11%|█▏        | 25/220 [33:25<4:22:29, 80.77s/it]

column: closest_to_0_atomic_mass - 0.2185


 12%|█▏        | 26/220 [34:45<4:20:27, 80.55s/it]

column: closest_to_0_valence - 0.79999


 12%|█▏        | 27/220 [36:03<4:16:55, 79.87s/it]

column: closest_to_0_spin_multiplicity - 0.22107


 13%|█▎        | 28/220 [37:21<4:13:30, 79.22s/it]

column: 2nd_closest_to_0_atomic_mass - 0.39773


 13%|█▎        | 29/220 [38:42<4:13:56, 79.77s/it]

column: 2nd_closest_to_0_valence - 0.91053


 14%|█▎        | 30/220 [40:03<4:13:39, 80.10s/it]

column: 2nd_closest_to_0_spin_multiplicity - 0.21912


 14%|█▍        | 31/220 [41:22<4:11:37, 79.88s/it]

column: 3rd_closest_to_0_atomic_mass - 0.23961


 15%|█▍        | 32/220 [42:41<4:09:54, 79.76s/it]

column: 3rd_closest_to_0_valence - 0.26071


 15%|█▌        | 33/220 [44:05<4:12:23, 80.98s/it]

column: 3rd_closest_to_0_spin_multiplicity - 0.21921


 15%|█▌        | 34/220 [45:24<4:08:54, 80.29s/it]

column: 4th_closest_to_0_atomic_mass - 0.22221


 16%|█▌        | 35/220 [46:43<4:06:34, 79.97s/it]

column: 4th_closest_to_0_valence - 0.23851


 16%|█▋        | 36/220 [48:04<4:06:19, 80.33s/it]

column: 4th_closest_to_0_spin_multiplicity - 0.22143


 17%|█▋        | 37/220 [49:25<4:05:44, 80.57s/it]

column: 5th_closest_to_0_atomic_mass - 0.2394


 17%|█▋        | 38/220 [50:47<4:05:38, 80.98s/it]

column: 5th_closest_to_0_valence - 0.22211


 18%|█▊        | 39/220 [52:07<4:03:08, 80.60s/it]

column: 5th_closest_to_0_spin_multiplicity - 0.21858


 18%|█▊        | 40/220 [53:28<4:02:22, 80.79s/it]

column: 6th_closest_to_0_atomic_mass - 0.22826


 19%|█▊        | 41/220 [54:51<4:02:17, 81.21s/it]

column: 6th_closest_to_0_valence - 0.23448


 19%|█▉        | 42/220 [56:13<4:02:09, 81.62s/it]

column: 6th_closest_to_0_spin_multiplicity - 0.21866


 20%|█▉        | 43/220 [57:38<4:03:56, 82.69s/it]

column: 7th_closest_to_0_atomic_mass - 0.22704


 20%|██        | 44/220 [58:57<3:58:38, 81.35s/it]

column: 7th_closest_to_0_valence - 0.22634


 20%|██        | 45/220 [1:00:17<3:56:54, 81.22s/it]

column: 7th_closest_to_0_spin_multiplicity - 0.21865


 21%|██        | 46/220 [1:01:41<3:57:25, 81.87s/it]

column: 8th_closest_to_0_atomic_mass - 0.22245


 21%|██▏       | 47/220 [1:03:03<3:56:35, 82.06s/it]

column: 8th_closest_to_0_valence - 0.22179


 22%|██▏       | 48/220 [1:04:25<3:54:46, 81.90s/it]

column: 8th_closest_to_0_spin_multiplicity - 0.21871


 22%|██▏       | 49/220 [1:05:45<3:52:17, 81.51s/it]

column: 9th_closest_to_0_atomic_mass - 0.22618


 23%|██▎       | 50/220 [1:07:09<3:53:03, 82.26s/it]

column: 9th_closest_to_0_valence - 0.22734


 23%|██▎       | 51/220 [1:08:30<3:49:50, 81.60s/it]

column: 9th_closest_to_0_spin_multiplicity - 0.21866


 24%|██▎       | 52/220 [1:09:52<3:48:52, 81.74s/it]

column: 10th_closest_to_0_atomic_mass - 0.21942


 24%|██▍       | 53/220 [1:11:12<3:46:02, 81.21s/it]

column: 10th_closest_to_0_valence - 0.22007


 25%|██▍       | 54/220 [1:12:34<3:45:32, 81.52s/it]

column: 10th_closest_to_0_spin_multiplicity - 0.21869
