# Permutation Feature Importance

This code is inspired by the [notebook](https://www.kaggle.com/code/cdeotte/lstm-feature-importance) by Chris Deotte. Here I implement feature importance using permutation feature importance. Further reading is available [here](https://christophm.github.io/interpretable-ml-book/feature-importance.html#feature-importance).
This kernel applies feature importance analysis to the kernel by [Hasan Basri Akçay](https://www.kaggle.com/hasanbasriakcay). Thanks for sharing it.

In [None]:
import os

# Environment variables set ahead of the TensorFlow import further down.
# NOTE(review): presumably '3' quiets TensorFlow's native logging and "0"
# restricts CUDA to the first GPU -- confirm against the TF/CUDA docs.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    "CUDA_VISIBLE_DEVICES": "0",
})

import pandas as pd
import numpy as np
import warnings 
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Ignore every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Competition inputs: training rows and test rows.
train = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")
# Externally produced pseudo labels for the test set; the code below reads its
# 'sequence' and 'state_proba' columns.
test_pseudo = pd.read_csv("../input/tpsapr22-pseudo-labels/pseudo_labeled_test.csv")
# Labels for the training data; joined onto `train` by 'sequence' later on.
train_labels = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
# Sample submission file (not used in the visible part of this notebook).
sub = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv")

In [None]:
# Keep only the pseudo-labelled test sequences whose predicted probability is
# confident: at least `th`, or at most `1 - th`.
th = 0.95
confident = (test_pseudo['state_proba'] >= th) | (test_pseudo['state_proba'] <= (1 - th))
# Take an explicit copy so the renaming and rounding below write to an
# independent frame instead of a slice of `test_pseudo` (chained assignment).
test_pseudo_selected = test_pseudo.loc[confident, ['sequence', 'state_proba']].copy()
test_pseudo_selected.columns = ['sequence', 'state']
# Turn the confident probabilities into hard labels.
test_pseudo_selected['state'] = test_pseudo_selected['state'].round()

# Feature Engineering

In [None]:
def create_new_features(df):
    """Add engineered sensor features to ``df`` in place and return it.

    The following columns are appended:

    * ``sensor_02_num`` -- 1 where ``sensor_02 > -15``, else 0.
    * ``sensor_sum1`` .. ``sensor_sum4`` -- sums of fixed groups of sensors.
    * ``<col>_lag1`` -- the previous row's value of ``<col>`` within the same
      ``sequence`` (0 for the first row of each sequence).
    * ``<col>_diff1`` -- ``<col>`` minus ``<col>_lag1``.

    where ``<col>`` ranges over ``sensor_00`` .. ``sensor_12`` and the five
    engineered columns above. Missing values anywhere in the frame are
    replaced with 0.

    Args:
        df: DataFrame with a ``sequence`` column and ``sensor_00`` ..
            ``sensor_12`` columns. It is modified in place.

    Returns:
        The same ``df`` object with the new columns added.
    """
    df['sensor_02_num'] = (df['sensor_02'] > -15).astype(int)
    df['sensor_sum1'] = (df['sensor_00'] + df['sensor_09'] + df['sensor_06'] + df['sensor_01'])
    df['sensor_sum2'] = (df['sensor_01'] + df['sensor_11'] + df['sensor_09'] + df['sensor_06'] + df['sensor_00'])
    df['sensor_sum3'] = (df['sensor_03'] + df['sensor_11'] + df['sensor_07'])
    df['sensor_sum4'] = (df['sensor_04'] + df['sensor_10'])

    sensors = [f'sensor_{i:02d}' for i in range(13)]
    sensors.extend(['sensor_02_num', 'sensor_sum1', 'sensor_sum2', 'sensor_sum3', 'sensor_sum4'])

    # Zero-fill the whole frame once, up front. Inside the loop only the newly
    # created lag column can contain NaN, so re-scanning every column of the
    # growing frame on each iteration is unnecessary.
    df.fillna(0, inplace=True)

    for sensor in sensors:
        lag_col = sensor + '_lag1'
        # The first row of every sequence has no predecessor; its lag becomes 0.
        df[lag_col] = df.groupby('sequence')[sensor].shift(1).fillna(0)
        df[sensor + '_diff1'] = df[sensor] - df[lag_col]

    return df

In [None]:
# Add the engineered columns to both frames. `create_new_features` modifies
# its argument in place and returns that same object.
train = create_new_features(train)
test = create_new_features(test)

In [None]:
# Test rows that belong to a confidently pseudo-labelled sequence, with that
# sequence's pseudo label attached as the `state` column.
has_pseudo_label = test['sequence'].isin(test_pseudo_selected['sequence'])
selected_test = (
    test[has_pseudo_label]
    .copy()
    .merge(test_pseudo_selected[['sequence', 'state']], on='sequence', how='left')
    .reset_index(drop=True)
)

In [None]:
# Attach the labels to the training rows, then append the pseudo-labelled
# test rows underneath.
labelled_train = train.merge(train_labels, how='left', on="sequence")
train = pd.concat([labelled_train, selected_test])
# Row-level sequence ids; the cross-validation split later uses their unique values.
groups = train['sequence']

In [None]:
# Number of consecutive rows folded into one sample.
# NOTE(review): assumes every sequence has exactly 60 rows, stored contiguously -- confirm.
Window = 60
# Row-level labels arranged as one row of `Window` values per sample.
y = np.reshape(train['state'].to_numpy(), (-1, Window))
# Remove identifier columns (and the target) so only model features remain.
id_columns = ["sequence", "step", "subject"]
train.drop(columns=id_columns + ["state"], inplace=True)
test.drop(columns=id_columns, inplace=True)

In [None]:
# Feature names in column order; index k here matches feature axis k of the
# arrays built from `train` below.
COLS = train.columns.tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(train)
X_train, X_test = sc.transform(train), sc.transform(test)

In [None]:
# Fold the flat (rows, features) matrices into (samples, Window, features).
X_train, X_test = (
    matrix.reshape(-1, Window, matrix.shape[-1]) for matrix in (X_train, X_test)
)

In [None]:
# Independent copy of the (samples, Window) label matrix, used as the training target.
y_train = y.copy()

# Modeling

In [None]:
from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.metrics import AUC

def get_model():
    """Build and compile the stacked bidirectional-LSTM network.

    The input shape is taken from the last two dimensions of the module-level
    ``X_train``. Four stages of sequence-returning bidirectional LSTMs are
    wired with skip branches; the first-stage output and the three merged
    stage outputs are concatenated and passed through a SELU dense layer,
    dropout and a single-unit sigmoid dense layer.

    Returns:
        A compiled Keras ``Model`` named ``'DNN_Model'`` using the Adam
        optimizer, binary cross-entropy loss and an AUC metric named ``'auc'``.
    """
    def bi_lstm(units, tensor):
        # Sequence-returning bidirectional LSTM applied to `tensor`.
        return Bidirectional(LSTM(units, return_sequences=True))(tensor)

    def join(*tensors):
        # Concatenate the given tensors along axis 2.
        return Concatenate(axis=2)(list(tensors))

    inputs = Input(shape=(X_train.shape[-2:]))
    stage1 = bi_lstm(768, inputs)

    # Stage 2: one branch on top of stage 1, one directly on the raw input.
    stage2_deep = bi_lstm(512, stage1)
    stage2_skip = bi_lstm(512, inputs)
    merged2 = join(stage2_deep, stage2_skip)

    # Stage 3: one branch on the merged stage-2 output, one on its deep branch.
    stage3_deep = bi_lstm(384, merged2)
    stage3_skip = bi_lstm(384, stage2_deep)
    merged3 = join(stage3_deep, stage3_skip)

    # Stage 4: one branch on the merged stage-3 output, one on its skip branch.
    stage4_deep = bi_lstm(256, merged3)
    stage4_skip = bi_lstm(128, stage3_skip)
    merged4 = join(stage4_deep, stage4_skip)

    # Head: every stage's features side by side, then dense -> dropout -> sigmoid.
    all_stages = join(stage1, merged2, merged3, merged4)
    hidden = Dense(128, activation='selu')(all_stages)
    regularised = Dropout(0.3)(hidden)
    outputs = Dense(units=1, activation="sigmoid")(regularised)

    model = Model(inputs=inputs, outputs=outputs, name='DNN_Model')
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[AUC(name='auc')])

    return model

In [None]:
# Build one model instance up front.
# NOTE(review): the fold loop further down calls get_model() again and rebinds
# `model` before training, so this instance is not the one that gets trained.
model = get_model()

# Predictions

In [None]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import roc_auc_score
import gc

# Per-fold accumulators: test-set predictions and validation AUC scores.
test_preds = []
auc = []
# Number of cross-validation splits.
nfold = 10
# Grid dimensions and cursor; the visible code only increments col/row and
# never reads them for plotting.
ncols = 5
nrows = round(nfold / ncols)

col, row = 0, 0

# TRAIN_MODEL is not read anywhere in the visible code.
TRAIN_MODEL = True
# When True, permutation feature importance is computed after training.
FEATURE_IMPORTANCE = True

# Train on the first cross-validation split and measure permutation feature
# importance on its validation part. The unique sequence ids are passed as
# group labels.
# NOTE(review): this assumes groups.unique() yields exactly one id per sample
# of X_train, in the same order -- confirm.
kf = GroupKFold(n_splits=nfold)
for fold, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train, groups.unique())):

    # Only the first fold is trained and analysed; all other splits are skipped.
    if fold != 0:
        continue

    print(f"Fold: {fold+1}", end=' ')
    # Fancy indexing returns copies, so permuting X_valid below never touches X_train.
    X_train_part, X_valid = X_train[train_idx], X_train[test_idx]
    y_train_part, y_valid = y_train[train_idx], y_train[test_idx]

    model = get_model()
    lr = ReduceLROnPlateau(monitor="val_auc", mode='max', factor=0.7, patience=4, verbose=False)
    es = EarlyStopping(monitor='val_auc', mode='max', patience=10, verbose=False, restore_best_weights=True)
    history = model.fit(X_train_part, y_train_part, validation_data=(X_valid, y_valid), epochs=30, batch_size=64,
                        callbacks=[es, lr], verbose=1)

    y_pred = model.predict(X_valid).squeeze()
    auc_score = roc_auc_score(y_valid, y_pred)
    print(f'auc: {round(auc_score, 5)}')
    test_preds.append(model.predict(X_test).squeeze())
    auc.append(auc_score)

    col += 1
    if col >= ncols:
        row += 1
        col = 0

    if FEATURE_IMPORTANCE:

        results = []
        print(' Computing feature importance...')

        # BASELINE (NO SHUFFLE): this is the validation AUC computed just above,
        # so it is reused instead of running a second, identical prediction pass.
        baseline_auc = auc_score
        results.append({'feature': 'BASELINE', 'auc': baseline_auc})

        for k, feature in enumerate(tqdm(COLS)):

            # SHUFFLE FEATURE K: np.random.shuffle permutes along the first axis
            # only, so feature k is swapped between validation samples while
            # each sample's time steps stay together.
            save_col = X_valid[:, :, k].copy()
            np.random.shuffle(X_valid[:, :, k])

            # COMPUTE OOF AUC WITH FEATURE K SHUFFLED
            # The score gets its own name: rebinding `auc` here would replace
            # the per-fold score list with a float.
            oof_preds = model.predict(X_valid, verbose=0).squeeze()
            permuted_auc = roc_auc_score(y_valid, oof_preds)
            results.append({'feature': feature, 'auc': permuted_auc})

            # Restore the original values before permuting the next feature.
            X_valid[:, :, k] = save_col

        # DISPLAY FEATURE IMPORTANCE
        # Bars are sorted by AUC ascending: the lower the AUC after permutation,
        # the more the model relied on that feature.
        print()
        df = pd.DataFrame(results)
        df = df.sort_values('auc')
        n_bars = len(COLS) + 1  # every feature plus the BASELINE row
        plt.figure(figsize=(10, 20))
        plt.barh(np.arange(n_bars), df.auc)
        plt.yticks(np.arange(n_bars), df.feature.values)
        plt.title('Feature Importance', size=16)
        plt.ylim((-1, n_bars))
        plt.plot([baseline_auc, baseline_auc], [-1, n_bars], '--', color='orange',
                 label=f'Baseline OOF\nAUC={baseline_auc:.3f}')
        plt.xlabel(f'Fold {fold+1} OOF AUC with feature permuted', size=14)
        plt.ylabel('Feature', size=14)
        plt.legend()
        plt.show()

        # SAVE FEATURE IMPORTANCE
        df = df.sort_values('auc', ascending=False)
        df.to_csv(f'feature_importance_fold_{fold+1}.csv', index=False)