# Global and Local Explanation for KNN model on MIT data


In [1]:
import sys
import os
data_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install matplotlib -q

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your own google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'

Path = dict({
    'mit_test': data_path +  'mitbih_test_clean.csv',
    'mit_train':  data_path + 'mitbih_train_clean.csv',
})

Running on local environment
Current working directory: g:\Meine Ablage\heartbeat-analysis-ai\notebooks


In [2]:
# Verify installation and import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.manifold import TSNE, Isomap
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import pickle

In [3]:
def addColumnsToDataframe(df):
    """
    As the dataset is composed with 188 columns with the 188th columns as the category values,
    so we give the last column the name 'target', others named with 'c_182'
    """
    num_columns= df.shape[1]
    feature_col_name = ['c_' + str(i) for i in range(0, num_columns - 1)]
    df_columns = feature_col_name + ['target']
    df.columns = df_columns
    return df

def convertColumnAsInt(df, column):
    """
    As the category value is in float type. We want to get the int to identify the category.
    """
    df[column] = df[column].astype(int)
    return df

def getBarChartFromCategoryValueCounts(category_value_counts):
    """
    We call the plot over the pandas series object to plot the category count values
    """
    plt.figure(figsize=(10, 6))
    bar_chart = category_value_counts.plot(kind='bar')
    plt.xlabel('Categories')
    plt.ylabel('Count')
    plt.legend()
    plt.grid(False)
    plt.xticks(rotation=360)
    for i in bar_chart.containers:
        bar_chart.bar_label(i, label_type='edge')
    plt.show()


def showTop10DataInChart(df):
    plt.figure(figsize=(10, 6))
    xDataAxis = list(range(0, df.shape[1]))
    yDataRows = list(df.values[1: 10])
    for y in yDataRows:
        plt.plot(xDataAxis, y)
    plt.show()

In [13]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from xgboost import XGBModel, Booster
import os

# Load the data
mit_train = pd.read_csv(Path['mit_train'])
mit_test = pd.read_csv(Path['mit_test'])

# Remove duplicates
mit_train = mit_train.drop_duplicates()
mit_test = mit_test.drop_duplicates()

# Prepare the data
mit_train = addColumnsToDataframe(mit_train)
mit_test = addColumnsToDataframe(mit_test)
mit_train = convertColumnAsInt(mit_train, 'target')
mit_test = convertColumnAsInt(mit_test, 'target')

# Split data into features and target
X_train = mit_train.drop(columns=['target'])
y_train = mit_train['target']
X_test = mit_test.drop(columns=['target'])
y_test = mit_test['target']

# Define the model path
current_dir = os.getcwd()
model_path = os.path.join(current_dir, '..', 'models', 'model_knn_mit.pkl')


load models

In [14]:
# Load the XGBoost model (wrapped inside a pipeline)
with open(model_path, 'rb') as file:
    pipeline_model = pickle.load(file)

# Check the type of the model (pipeline)
print(f"Model type: {type(pipeline_model)}")

Model type: <class 'imblearn.pipeline.Pipeline'>


In [18]:
# Look at the steps of the pipeline
print("Pipeline steps:", pipeline_model.named_steps)
    
# Extract the KNN model from the pipeline
knn_model = pipeline_model.named_steps['model']

# Verify the model type
print("Extracted model type:", type(knn_model))

Pipeline steps: {'scaler': MinMaxScaler(), 'oversampling': RandomOverSampler(), 'model': KNeighborsClassifier(weights='distance')}
Extracted model type: <class 'sklearn.neighbors._classification.KNeighborsClassifier'>


## Feature Importance

KNN is an instance-based, non-parametric algorithm that makes predictions based on the distances to nearby points, and it doesn’t assign weights or importance values to features inherently. Hence, KNN does not have a built-in way of determining feature importance. 

## SHAP (does not work yet)

In [None]:
import shap
import matplotlib.pyplot as plt

# Initialize SHAP KernelExplainer with the KNN model and a sample of the training data
explainer = shap.KernelExplainer(knn_model.predict, X_train.sample(100, random_state=42))

# Increase the number of samples in X_test_sample to avoid the ValueError
# The number of samples should be at least equal to the number of features to avoid convergence issues
num_features = X_train.shape[1]
X_test_sample = X_test.sample(max(num_features, 50), random_state=42)  # Ensure enough rows for stability

# Calculate SHAP values for the same sample of the scaled test data
shap_values = explainer.shap_values(X_test_sample)

# Visualize SHAP values with a summary plot for the same test sample
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test_sample, plot_type="bar") 
plt.title('SHAP Summary Plot for KNN Model')
plt.show()


  7%|▋         | 14/187 [08:29<1:45:14, 36.50s/it]

In [43]:
import shap
import matplotlib.pyplot as plt

explainer = shap.KernelExplainer(knn_model.predict, X_train.sample(100, random_state=42))

# Increase the number of samples in X_test_sample to avoid the ValueError
# The number of samples should be at least equal to the number of features to avoid convergence issues
num_features = X_train.shape[1]
X_test_sample = X_test.sample(max(num_features, 50), random_state=42)  # Ensure enough rows for stability

# Calculate SHAP values for the larger sample size
shap_values = explainer.shap_values(X_test)

# Visualize SHAP values with a summary plot for the test sample
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test_sample, plot_type="bar") 
plt.title('SHAP Summary Plot for KNN Model')
plt.show()


  0%|          | 2/20284 [01:42<289:24:40, 51.37s/it]


KeyboardInterrupt: 

LIME


In [29]:
import lime
import lime.lime_tabular

# Create a Lime Explainer object
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),  # Features from the training data
    training_labels=np.array(y_train),  # Target labels for training data
    mode="classification",  # Set to 'classification' or 'regression' based on your task
    feature_names=X_train.columns,  # Feature names
    class_names=['Class 0', 'Class 1'],  # Class names for binary classification (modify if more classes)
    discretize_continuous=True  # Discretizes continuous features for better explanation
)


In [31]:
# Select a random instance from the test set
idx = 25 # Change this index to select a different instance
instance = X_test.iloc[idx]  # The input instance

# Get the true label for the selected instance
true_label = y_test.iloc[idx]

print("True Label for selected instance:", true_label)
print("Instance features:\n", instance)


True Label for selected instance: 0
Instance features:
 c_0      0.977912
c_1      0.791165
c_2      0.020080
c_3      0.022088
c_4      0.002008
           ...   
c_182    0.000000
c_183    0.000000
c_184    0.000000
c_185    0.000000
c_186    0.000000
Name: 25, Length: 187, dtype: float64
