# Prediction of bloodstream infection based on biochemical data


## Load data and libraries and set parameters

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn import preprocessing, ensemble
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

# Global settings shared by the cells below
SEED, FOLDS, VERBOSE = 123, 5, 0

# Read the development and test tables from the working directory
devset, testset = (pd.read_csv(path) for path in ('devset.csv', 'testset.csv'))

# Keep independent copies of the identifiers before the column is removed
devset_ID, testset_ID = devset['ID'].copy(), testset['ID'].copy()

# Remove the columns that are not needed downstream
columns_to_drop = ['ID', 'most_common_pathogens']
devset, testset = (frame.drop(columns=columns_to_drop) for frame in (devset, testset))


In [None]:
# Raise the pandas display limits so wide/long tables are printed in full
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Old column name -> abbreviation used in the rest of the notebook
column_names = {
    'PrevInfectionRate': 'PBCR',
    'PrevAdmissionRate': 'CER',
    'biochemical_abnormality_score': 'BVA',
    'biochemical_abnormality_score_NAexcluded': 'NBVA',
    'modified_biochemical_abnormality_score': 'SBVA',
}

# Apply the same renaming to both datasets
devset, testset = (frame.rename(columns=column_names) for frame in (devset, testset))

In [None]:
# Discard the 'PBCR' column from both datasets
devset, testset = (frame.drop(columns='PBCR') for frame in (devset, testset))

In [None]:
# Stack the development and test rows into one table (row labels are kept as they are)
mydata = pd.concat([devset,testset],axis=0)

In [None]:
# load data_dictionary

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): `outcome_var` and `data_dictionary` are used below but are not
# assigned anywhere in the visible part of this notebook; define them first.

# Stack the development and test rows into one table for the temporal analysis
mydata = pd.concat([devset, testset], axis=0)

# Parse 'TestDate'; unparseable values become NaT and those rows are removed
mydata['TestDate'] = pd.to_datetime(mydata['TestDate'], errors='coerce')
mydata.dropna(subset=['TestDate'], inplace=True)

# Days elapsed since the earliest test date, so the smallest value is exactly 0
min_date = mydata['TestDate'].min()
numeric_days = (mydata['TestDate'] - min_date).dt.total_seconds() / (24 * 60 * 60)

# Bin width from the Freedman-Diaconis rule: 2 * IQR / n^(1/3)
n = len(numeric_days)
if n == 0:
    raise ValueError("No rows with a valid 'TestDate' are available for time binning")
iqr = np.percentile(numeric_days, 75) - np.percentile(numeric_days, 25)
bin_size = 2.0 * iqr / (n ** (1/3))
if not np.isfinite(bin_size) or bin_size <= 0:
    # A zero IQR would otherwise divide by zero below and produce NaT bins
    raise ValueError(f"Freedman-Diaconis bin size is not positive ({bin_size}); cannot build time bins")

# Label every row with the start date of the bin it falls into
mydata['time_bin'] = (np.floor(numeric_days / bin_size) * bin_size).apply(lambda x: min_date + pd.Timedelta(days=x))

# Spearman correlation of every numeric column with the outcome, per time bin.
# Non-numeric columns (dates, strings, categories) are removed explicitly
# because DataFrame.corr no longer drops them silently on recent pandas.
correlations = {}
for bin_start, group in mydata.groupby('time_bin'):
    numeric_group = group.select_dtypes(include=[np.number, 'bool'])
    correlations[bin_start] = numeric_group.corr(method='spearman')[outcome_var]

# Rows are features, columns are time bins
correlation_df = pd.DataFrame(correlations)

# Order the features by their median correlation across bins (highest first)
# and leave out the outcome's correlation with itself
median_correlations = correlation_df.median(axis=1)
sorted_features = median_correlations.sort_values(ascending=False).index
sorted_features = sorted_features[sorted_features != outcome_var]
correlation_df_sorted = correlation_df.loc[sorted_features]

# Human-readable names; features missing from the dictionary keep their column name
translated_feature_names = [data_dictionary.get(feature, feature) for feature in correlation_df_sorted.index]

# Set font size for the plot
plt.rc('font', size=7)

# Create the heatmap
plt.figure(figsize=(10, 11))
ax = sns.heatmap(correlation_df_sorted, cmap='bwr', vmin=-1, vmax=1, annot=False, fmt=".2f")
plt.xlabel(f'Time Bins: {round(bin_size)} days')
plt.ylabel('Features')
plt.title(f'Spearman correlation of features with {outcome_var} over time')

# Show at most 10 evenly spaced date labels on the x-axis
num_ticks = min(10, len(correlation_df.columns))
x_ticks_indices = np.linspace(0, len(correlation_df.columns) - 1, num_ticks).astype(int)
x_ticks_labels = [label.strftime('%Y-%m-%d') for label in correlation_df.columns[x_ticks_indices]]
plt.xticks(x_ticks_indices + 0.5, x_ticks_labels, rotation=45)

# Label each row with the translated name and its median correlation
y_ticks_labels = [
    f"{translated_name}: {round(median_correlations[feature], 2)}"
    for translated_name, feature in zip(translated_feature_names, sorted_features)
]
plt.yticks(np.arange(len(correlation_df_sorted)) + 0.5, y_ticks_labels)

plt.tight_layout()

# Save the figure as a PNG file
plt.savefig('Spearman_correlation_overtime.png', dpi=300)

plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each numeric feature and the outcome, computed
# separately inside every time bin (uses the 'time_bin' column of mydata).
mutual_infos = {}
for bin_start, group in mydata.groupby('time_bin'):
    X = group.drop([outcome_var, 'time_bin'], axis=1).select_dtypes(include=[np.number])  # Numeric features only
    y = group[outcome_var]  # Outcome variable
    # One score per column of X, in column order
    mi_scores = mutual_info_classif(X, y, discrete_features='auto', random_state=SEED)  # Use mutual_info_classif
    mutual_infos[bin_start] = mi_scores

# Rows are feature positions (0..n_features-1, not names); columns are time bins
mi_df = pd.DataFrame(mutual_infos)

# Sort features by median mutual information in descending order;
# sorted_features therefore holds integer positions into X.columns
median_mi_scores = mi_df.median(axis=1)
sorted_features = median_mi_scores.sort_values(ascending=False).index
mi_df_sorted = mi_df.loc[sorted_features]

# Set font size for the plot
plt.rc('font', size=7)

# Map positions back to column names using X from the last loop iteration,
# then translate them (names missing from data_dictionary are kept as is)
translated_feature_names = [data_dictionary.get(feature, feature) for feature in X.columns[sorted_features]]

# Create the heatmap
plt.figure(figsize=(10, 11))
ax = sns.heatmap(mi_df_sorted, cmap='Reds', annot=False, fmt=".2f")
plt.xlabel(f'Time Bins: {round(bin_size)} days')
plt.ylabel('Features')
plt.title(f'Mutual Information of features with {outcome_var} over time')

# Set x-axis tick labels to show a maximum of 10 ticks
num_ticks = min(10, len(mi_df.columns))
x_ticks_indices = np.linspace(0, len(mi_df.columns) - 1, num_ticks).astype(int)
x_ticks_labels = [label.strftime('%Y-%m-%d') for label in mi_df.columns[x_ticks_indices]]
plt.xticks(x_ticks_indices + 0.5, x_ticks_labels, rotation=45)

# Set y-axis tick labels with translated feature names and median mutual information scores
y_ticks_labels = [f"{translated_feature_names[i]}: {round(median_mi_scores[feature], 2)}" for i, feature in enumerate(sorted_features)]
plt.yticks(np.arange(len(mi_df_sorted)) + 0.5, y_ticks_labels)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr

# Point-biserial correlation between each numeric feature and the outcome,
# computed separately inside every time bin (uses the 'time_bin' column of mydata)
association_measures = {}
for bin_start, group in mydata.groupby('time_bin'):
    X = group.drop([outcome_var, 'time_bin'], axis=1).select_dtypes(include=[np.number])  # Numeric features only
    y = group[outcome_var]  # Outcome variable

    # One coefficient per column of X, in column order
    association_scores = [pointbiserialr(X[feature], y)[0] for feature in X.columns]

    association_measures[bin_start] = association_scores

# Rows are feature positions (0..n_features-1, not names); columns are time bins
association_df = pd.DataFrame(association_measures)

# Sort features by median association in descending order;
# sorted_features therefore holds integer positions into X.columns
median_association_scores = association_df.median(axis=1)
sorted_features = median_association_scores.sort_values(ascending=False).index
association_df_sorted = association_df.loc[sorted_features]

# Set font size for the plot
plt.rc('font', size=7)

# Map positions back to column names using X from the last loop iteration,
# then translate them (names missing from data_dictionary are kept as is)
translated_feature_names = [data_dictionary.get(feature, feature) for feature in X.columns[sorted_features]]

# Create the heatmap from the point-biserial table computed above
# (previously the Spearman table from an earlier cell was drawn here by mistake)
plt.figure(figsize=(10, 11))
ax = sns.heatmap(association_df_sorted, cmap='bwr', vmin=-1, vmax=1, annot=False, fmt=".2f")
plt.xlabel(f'Time Bins: {round(bin_size)} days')
plt.ylabel('Features')
plt.title(f'Point-Biserial correlation of the features with {outcome_var} over time')

# Set x-axis tick labels to show a maximum of 10 ticks
num_ticks = min(10, len(association_df.columns))
x_ticks_indices = np.linspace(0, len(association_df.columns) - 1, num_ticks).astype(int)
x_ticks_labels = [label.strftime('%Y-%m-%d') for label in association_df.columns[x_ticks_indices]]
plt.xticks(x_ticks_indices + 0.5, x_ticks_labels, rotation=45)

# Set y-axis tick labels with translated feature names and median association scores
y_ticks_labels = [
    f"{translated_name}: {round(median_association_scores[feature], 2)}"
    for translated_name, feature in zip(translated_feature_names, sorted_features)
]
plt.yticks(np.arange(len(association_df_sorted)) + 0.5, y_ticks_labels)

plt.tight_layout()
plt.show()


In [None]:
# Remove the 'TestDate' column from both datasets
devset, testset = (frame.drop(columns=['TestDate']) for frame in (devset, testset))

In [None]:
# Encode the categorical predictors as pandas 'category' columns with string labels
cat_features = ["Sex"]

# Casting to str and then to category gives the same columns as the former
# category -> str -> category round trip, without the redundant first cast
for frame in (devset, testset):
    frame[cat_features] = frame[cat_features].astype('str').astype('category')

# Show the category labels found in the development set
for feature in cat_features:
    categories = pd.Categorical(devset[feature])
    print(f"Categories of {feature}: {categories.categories}")

In [None]:
import pandas as pd
import numpy as np

# The value -1 is turned into NaN in fresh copies of both datasets;
# the original devset/testset frames are left unchanged
devset_withmissing, testset_withmissing = (
    frame.replace(-1, np.nan) for frame in (devset, testset)
)


In [None]:
import pandas as pd
import numpy as np

# Add the two ratio features to both datasets; where LYMFO equals 0 the
# ratio is set to NaN instead of being divided by zero
for frame in (devset_withmissing, testset_withmissing):
    lymfo_is_nonzero = frame['LYMFO'] != 0
    frame['NEUTRO_to_LYMFO'] = np.where(lymfo_is_nonzero, frame['NEUTRO'] / frame['LYMFO'], np.nan)
    frame['Platelet-to-lymphocyte'] = np.where(lymfo_is_nonzero, frame['THROM'] / frame['LYMFO'], np.nan)

In [None]:
# Summary of the neutrophil-to-lymphocyte ratio over development + test data
ratio_column = 'NEUTRO_to_LYMFO'
all_data_withmissing = pd.concat([devset_withmissing, testset_withmissing], axis=0)

# Rows where the ratio is available
df_cleaned = all_data_withmissing.dropna(subset=[ratio_column])
observed_ratio = df_cleaned[ratio_column]

# Median and quartiles, rounded to two decimals
median = np.median(observed_ratio).round(decimals=2)
Q1, Q3 = (np.percentile(observed_ratio, q).round(decimals=2) for q in (25, 75))

for label, value in (("Median:", median), ("First Quartile (Q1):", Q1), ("Third Quartile (Q3):", Q3)):
    print(label, value)

# Share and count of rows without a ratio
ratio_is_missing = all_data_withmissing[ratio_column].isnull()
missing_percentage = ratio_is_missing.mean() * 100

print("Percentage of missing values:", missing_percentage.round(decimals=2))
print("number of missing values:", ratio_is_missing.sum().round(decimals=2))

In [None]:
# Summary of the platelet-to-lymphocyte ratio over development + test data
ratio_column = 'Platelet-to-lymphocyte'
all_data_withmissing = pd.concat([devset_withmissing, testset_withmissing], axis=0)

# Rows where the ratio is available
df_cleaned = all_data_withmissing.dropna(subset=[ratio_column])
observed_ratio = df_cleaned[ratio_column]

# Median and quartiles, rounded to two decimals
median = np.median(observed_ratio).round(decimals=2)
Q1, Q3 = (np.percentile(observed_ratio, q).round(decimals=2) for q in (25, 75))

for label, value in (("Median:", median), ("First Quartile (Q1):", Q1), ("Third Quartile (Q3):", Q3)):
    print(label, value)

# Share and count of rows without a ratio
ratio_is_missing = all_data_withmissing[ratio_column].isnull()
missing_percentage = ratio_is_missing.mean() * 100

print("Percentage of missing values:", missing_percentage.round(decimals=2))
print("number of missing values:", ratio_is_missing.sum().round(decimals=2))

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Separate features and target variable for the training set
X_train_withmissing = devset_withmissing.drop(outcome_var, axis=1)
y_train = devset_withmissing[outcome_var]

# Separate features and target variable for the test set
X_test_withmissing = testset_withmissing.drop(outcome_var, axis=1)
y_test = testset_withmissing[outcome_var]

# Iterative (MICE-style) imputer; max_iter can be adjusted as needed
mice_imputer = IterativeImputer(max_iter=10, random_state=SEED)

# Fit the imputer on the development features only and impute them.
# The source row labels are carried over so that the column-wise concat with
# y_train below pairs each row with its own outcome even when the index is
# not the default 0..n-1 range.
X_train_imputed = pd.DataFrame(
    mice_imputer.fit_transform(X_train_withmissing),
    columns=X_train_withmissing.columns,
    index=X_train_withmissing.index,
)

# Combine the imputed development features with the outcome
devset_imputed = pd.concat([X_train_imputed, y_train], axis=1)

# Impute the test features with the imputer fitted on the development set
X_test_imputed = pd.DataFrame(
    mice_imputer.transform(X_test_withmissing),
    columns=X_test_withmissing.columns,
    index=X_test_withmissing.index,
)

# Combine the imputed test features with the outcome
testset_imputed = pd.concat([X_test_imputed, y_test], axis=1)


#### feature selection (optional)

Uncomment the following blocks if you want to perform feature selection.

#### end of feature selection (optional)

In [None]:
# Write both NaN-coded datasets to CSV files (row labels are not written)
for frame, csv_path in (
    (devset_withmissing, 'devset_withmissing.csv'),
    (testset_withmissing, 'testset_withmissing.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Display the dtype of every column of the imputed development set
devset_imputed.dtypes

In [None]:
# Mark the categorical predictors as pandas 'category' columns in both imputed tables
cat_features = ["Sex"]
for frame in (devset_imputed, testset_imputed):
    frame[cat_features] = frame[cat_features].astype('category')

In [None]:
# Display the column dtypes again to check the categorical conversion
devset_imputed.dtypes

In [None]:
# Count the missing values per column in the NaN-coded development set
devset_withmissing.isnull().sum()

The only variable with missing values was `prior_tx`; its missing values are now replaced by 0.

### data overview

### check for missingness

In [None]:
# Count the missing values per column in the imputed development set
devset_imputed.isnull().sum()

In [None]:
# Display (number of rows, number of columns) of the imputed development set
devset_imputed.shape

### Use data dictionary to have a description of the variables in results

In [None]:
# Column labels of the imputed development set
data_columns = devset_imputed.columns

# Dictionary entries that do not describe any of those columns
keys_not_in_columns = set(data_dictionary).difference(data_columns)

# List the unused dictionary entries, one per line
for unused_key in keys_not_in_columns:
    print(unused_key)


In [None]:
# Keep only the dictionary entries whose key is an existing data column
# (drops every key collected in keys_not_in_columns)
data_dictionary = {key: value for key, value in data_dictionary.items() if key not in keys_not_in_columns}


In [None]:
# Columns of the imputed development set (data_columns) that have no entry in the data dictionary
columns_not_in_keys = set(data_columns).difference(data_dictionary)

# List the undocumented columns, one per line
for undocumented_column in columns_not_in_keys:
    print(undocumented_column)

In [None]:
# Stack the NaN-coded development and test sets row-wise. Row labels are kept
# as they are (ignore_index is not used), so labels may repeat across the two sets.
mydata = pd.concat([devset_withmissing, testset_withmissing]) # , ignore_index=True

In [None]:
# Number of missing entries per column and the number of rows they refer to
total_rows = len(mydata)
missing_values = mydata.isnull().sum()

# Per-column missingness in percent, rounded to two decimals and
# ordered from the most to the least incomplete column
missing_percentage = ((missing_values / total_rows) * 100).round(2).sort_values(ascending=False)

# Mean and standard deviation of the per-column percentages
mean_missingness = np.mean(missing_percentage)
std_missingness = np.std(missing_percentage)

# Report the percentages followed by their mean and standard deviation
print("Missing Value Percentages:")
print(missing_percentage)
print("Mean ± Standard Deviation of Missingness: {:.2f} ± {:.2f}".format(mean_missingness, std_missingness))

In [None]:
# Display the distinct values of the outcome column
mydata[outcome_var].unique()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Split the feature columns by dtype. The outcome is excluded from both groups
# so that it is not drawn as a feature panel (the previous attempt to delete
# its panel afterwards could never run).
continuous_vars = mydata.select_dtypes(include=['float64', 'int64']).drop(columns=[outcome_var], errors='ignore')
categorical_vars = mydata.select_dtypes(include=['category']).drop(columns=[outcome_var], errors='ignore')

# Relabel the 0/1 outcome as a categorical. Skipped when the column is already
# categorical, so re-running the cell does not map every label to NaN.
if not isinstance(mydata[outcome_var].dtype, pd.CategoricalDtype):
    mydata[outcome_var] = mydata[outcome_var].map({0: 'negative', 1: 'positive'}).astype('category')

outcome_variable = mydata[outcome_var].copy()

# One panel per feature on a grid with a fixed number of columns
num_continuous_vars = len(continuous_vars.columns)
num_categorical_vars = len(categorical_vars.columns)
num_panels = num_continuous_vars + num_categorical_vars
num_cols_to_plot = 5
num_rows = max(1, -(-num_panels // num_cols_to_plot))  # ceiling division, at least one row

# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(num_rows, num_cols_to_plot, figsize=(12, num_rows * 2), squeeze=False)
flat_axes = axes.flatten()

# At most two outcome categories are drawn: one just below and one just above y = 1
category_offsets = list(zip(outcome_variable.unique(), [-0.2, 0.2]))

# Continuous features: one horizontal box plot per outcome category
for ax, column in zip(flat_axes, continuous_vars.columns):
    for outcome_category, ax_offset in category_offsets:
        # Values of this feature for the rows in the current outcome category
        filtered_data = continuous_vars[outcome_variable == outcome_category][column]
        ax.boxplot(filtered_data.dropna(), positions=np.array([1 + ax_offset]), widths=0.3, vert=False)

    ax.set_title(f'{column}', fontsize=8)
    # Tick positions and labels follow the categories actually drawn
    ax.set_yticks([1 + offset for _, offset in category_offsets])
    ax.set_yticklabels([category for category, _ in category_offsets], fontsize=8)
    ax.tick_params(axis='both', labelsize=8)
    # No legend here: the box plots have no labelled artists, the y ticks name the categories

# Categorical features: within-outcome proportions as horizontal bars
for ax, column in zip(flat_axes[num_continuous_vars:], categorical_vars.columns):
    category_counts = categorical_vars.groupby(outcome_variable)[column].value_counts(normalize=True).unstack()
    category_counts.plot(kind='barh', ax=ax)

    ax.set_title(f'{column}', fontsize=8)
    ax.set_ylabel(None)
    ax.tick_params(axis='both', labelsize=8)
    ax.legend(fontsize=6)

# Remove the unused panels at the end of the grid
for unused_ax in flat_axes[num_panels:]:
    fig.delaxes(unused_ax)

# Adjust the layout and spacing
plt.tight_layout()

# Save the figure as a PNG file
plt.savefig('feature_distributions.png', dpi=300)

# Show the plot
plt.show()


### summary statistics of the data

### split the data to a development set for finding the best model and a test set for the validation of the model

In [None]:
# Split mydata into training and test sets
# devset, testset = train_test_split(mydata, test_size=0.2, random_state=SEED, stratify=mydata[outcome_var])

In [None]:
# Preview the first rows of the NaN-coded development set
devset_withmissing.head()

### associations of the predictors and the outcome variable based on Spearman's correlation (in only the development set)

## Initiating a QLattice model

In [None]:
import feyn
# Create a QLattice seeded with SEED
ql = feyn.QLattice(random_seed=SEED)

In [None]:
# Store the outcome as a boolean column in both imputed tables.
# The column is replaced as a whole (df[col] = ...) rather than written through
# .loc[:, col], because .loc writes into the existing column and may keep or
# upcast its old dtype instead of producing a bool column.
# astype(bool) alone maps 0 -> False and 1 -> True, exactly as the former
# replace({1: True, 0: False}).astype(bool) chain did.
devset_imputed[outcome_var] = devset_imputed[outcome_var].astype(bool)
testset_imputed[outcome_var] = testset_imputed[outcome_var].astype(bool)

In [None]:
# Display the column dtypes to check the outcome conversion
devset_imputed.dtypes

In [None]:
import pandas as pd

# Semantic types passed to the QLattice: 'c' for every column stored with a
# pandas categorical dtype. isinstance(..., pd.CategoricalDtype) replaces the
# deprecated pd.api.types.is_categorical_dtype check.
stypes = {
    col: 'c'
    for col in devset_imputed.columns
    if isinstance(devset_imputed[col].dtype, pd.CategoricalDtype)
}

# The outcome is registered as 'b' (overrides 'c' if it was categorical)
stypes[outcome_var] = 'b'

# print the stypes dictionary
print(stypes)



In [None]:
# Display the outcome column of the imputed development set
devset_imputed[outcome_var]

### set model weights based on class balance from the development set

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

# One weight per development-set row, computed with class_weight='balanced'
# from the outcome column
sample_weights = compute_sample_weight(class_weight='balanced', y=devset_imputed[outcome_var])

### model development

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score
import pandas as pd

from sklearn.utils.class_weight import compute_sample_weight

# Define the number of folds for stratified k-fold
n_splits = 5

# Define the metrics to evaluate
metrics = ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC']

# One list of per-fold values for every metric
results = {metric: [] for metric in metrics}

# Initialize stratified k-fold
skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

y_train = devset_imputed[outcome_var]

# Per-fold metric rows are collected here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0)
fold_tables = []

# Convert categorical columns to strings
devset_imputed[cat_features] = devset_imputed[cat_features].astype(str)
testset_imputed[cat_features] = testset_imputed[cat_features].astype(str)

# Perform stratified k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(devset_imputed, devset_imputed[outcome_var]), 1):
    print(f"Fold {fold}:")

    # Positional row indices of the training and validation parts of this fold
    X_train_indices, X_test_indices = train_index.tolist(), test_index.tolist()

    # The fold frames keep the outcome column: auto_run reads it via output_name
    X_train_fold = devset_imputed.iloc[X_train_indices, :]
    X_test_fold = devset_imputed.iloc[X_test_indices, :]
    y_train_fold = y_train.iloc[X_train_indices]
    y_test_fold = y_train.iloc[X_test_indices]
    sample_weights_fold = sample_weights[X_train_indices]

    # Initialize a fresh QLattice for this fold
    ql = feyn.QLattice(random_seed=SEED)
    # Train the QLattice models on the training part of the fold
    models = ql.auto_run(
        data=X_train_fold,
        output_name=outcome_var,
        kind='classification',
        stypes=stypes,  # Include the stypes parameter
        n_epochs=50,
        criterion="bic",  # None or "bic" # BIC is more conservative than AIC
        loss_function='binary_cross_entropy',
        max_complexity=10,
        sample_weights=sample_weights_fold
    )

    best_model = models[0]

    # Predicted probabilities for the validation part, thresholded at 0.5
    predictions = best_model.predict(X_test_fold)
    predictions_class = [bool(x >= 0.5) for x in predictions]

    # Calculate the confusion matrix
    cm = confusion_matrix(y_test_fold, predictions_class)

    # Counts are converted to float so that the four-term product in the MCC
    # denominator cannot overflow a 64-bit integer on large folds
    tn, fp, fn, tp = cm.ravel().astype(float)

    # Calculate PPV, NPV, sensitivity, specificity, balanced accuracy, MCC and AUC
    PPV = tp / (tp + fp)
    NPV = tn / (tn + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    MCC = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    auc = roc_auc_score(y_test_fold, predictions)

    # Store the results for this fold
    fold_results = [PPV, NPV, sensitivity, specificity, balanced_accuracy, MCC, auc]
    for metric, value in zip(metrics, fold_results):
        results[metric].append(value)

    # Print the results for this fold
    fold_results_df = pd.DataFrame(fold_results, index=metrics, columns=[fold])
    fold_results_df = fold_results_df.round(2)
    print(fold_results_df.to_string())
    # Keep this fold's row (fold number as row label) for the QLattice fold table
    fold_tables.append(fold_results_df.T)
    print("")

# One row per fold, one column per metric
fold_results_df_fold_table_QLattice = pd.concat(fold_tables) if fold_tables else pd.DataFrame()

# Calculate the average and standard deviation across folds
results_df = pd.DataFrame(results)
aggregated_results = results_df.mean().round(2)
aggregated_results_sd = results_df.std().round(2)
aggregated_results_formatted_QLattice = aggregated_results.astype(str) + " ± " + aggregated_results_sd.astype(str)

# Print the aggregated results
print("Aggregated Results:")
print(aggregated_results_formatted_QLattice.to_string())


### model development using the whole development set

In [None]:
import feyn
# Fresh QLattice seeded with SEED
ql = feyn.QLattice(random_seed=SEED)
# Train on the whole imputed development set with the same settings as in the
# cross-validation loop, using the full-length sample_weights
models = ql.auto_run(
    data=devset_imputed,
    output_name=outcome_var,
    kind='classification',
    stypes=stypes,
    n_epochs=50,
    criterion="bic", # None or "bic" # BIC is more conservative than AIC
    loss_function='binary_cross_entropy',
    max_complexity=10,
    sample_weights=sample_weights
    )

In [None]:
# Take the first model of the list returned by auto_run as the final model
best_model = models[0]

## model performance based on ROC curve on the development set

In [None]:
# ROC curve on the development set, with the 0.5 threshold passed to the plot
best_model.plot_roc_curve(devset_imputed, threshold=0.5)

## model performance based on ROC curve on the test set

In [None]:
# ROC curve on the test set, with the 0.5 threshold passed to the plot
best_model.plot_roc_curve(testset_imputed, threshold=0.5)

## associations of the model variables

In [None]:
# Signal plot of the model on the development set using Spearman correlation
best_model.plot_signal(devset_imputed,corr_func='spearman')

In [None]:
# Signal plot of the model on the test set using Spearman correlation
best_model.plot_signal(testset_imputed,corr_func='spearman')

In [None]:
# Signal plot of the model on the development set using mutual information
best_model.plot_signal(devset_imputed,corr_func='mutual_information')

In [None]:
# Signal plot of the model on the test set using mutual information
best_model.plot_signal(testset_imputed,corr_func='mutual_information')

## model performance on the test set

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted probabilities for the test set, thresholded at 0.5
predictions = best_model.predict(testset_imputed)
predictions_class = [bool(x >= 0.5) for x in predictions]

# Confusion matrix of the test set. It is stored under its own name so that
# the imported confusion_matrix() function is not overwritten and stays
# callable when other cells are (re)run afterwards.
cm_test = confusion_matrix(testset_imputed[outcome_var], predictions_class)

# Counts are converted to float so that the four-term product in the MCC
# denominator cannot overflow a 64-bit integer on large test sets
tn, fp, fn, tp = cm_test.ravel().astype(float)

# Calculate PPV, NPV, sensitivity, specificity, balanced accuracy, MCC and AUC
PPV_test = tp / (tp + fp)
NPV_test = tn / (tn + fn)
sensitivity_test = tp / (tp + fn)
specificity_test = tn / (tn + fp)
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2
MCC_test = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
auc = roc_auc_score(testset_imputed[outcome_var], predictions)

# Create a dictionary to hold the results
results = {
    'Metric': ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC',"AUC"],
    'Value': [PPV_test, NPV_test, sensitivity_test, specificity_test, balanced_accuracy_test, MCC_test,auc]
}
results_df = pd.DataFrame(results)

# Round the values to two decimal places
results_df['Value'] = results_df['Value'].round(2)

print(results_df)
results_df_QLattice = results_df.copy()
# Save the results to an Excel file
results_df.to_excel('QLattice_results_test.xlsx', index=False)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Weight of each test sample = 1 / (number of samples that share its ID), so
# that every patient contributes a total weight of one.
# NOTE(review): assumes testset_ID is still in the same row order as
# testset_imputed (no rows dropped or reordered since loading) - confirm.
samples_per_patient = testset_ID.map(testset_ID.value_counts())
patient_sample_weights = (1.0 / samples_per_patient).to_numpy()

# Weighted confusion matrix. Previously the unweighted scalar counts were
# multiplied by the weight vector in both numerator and denominator, so the
# weights cancelled out and the "weighted" metrics equalled the unweighted ones.
weighted_cm = confusion_matrix(
    testset_imputed[outcome_var],
    predictions_class,
    sample_weight=patient_sample_weights,
)
tn_w, fp_w, fn_w, tp_w = weighted_cm.ravel().astype(float)

# Same metric definitions as for the unweighted case, on the weighted counts
weighted_ppv = tp_w / (tp_w + fp_w)
weighted_npv = tn_w / (tn_w + fn_w)
weighted_sensitivity = tp_w / (tp_w + fn_w)
weighted_specificity = tn_w / (tn_w + fp_w)
weighted_balanced_accuracy = (weighted_sensitivity + weighted_specificity) / 2
weighted_mcc = (tp_w * tn_w - fp_w * fn_w) / (
    (tp_w + fp_w) * (tp_w + fn_w) * (tn_w + fp_w) * (tn_w + fn_w)
) ** 0.5

# Create a dictionary to hold the results
results = {
    'Metric': ['Weighted PPV', 'Weighted NPV', 'Weighted Sensitivity', 'Weighted Specificity', 'Weighted Balanced Accuracy', 'Weighted MCC'],
    'Value': [weighted_ppv, weighted_npv, weighted_sensitivity, weighted_specificity, weighted_balanced_accuracy, weighted_mcc]
}

# Create a DataFrame to display the results
results_df = pd.DataFrame(results)

# Round the values to two decimal places
results_df['Value'] = results_df['Value'].round(2)
results_df_QLattice_weighted = results_df.copy()
print(results_df)


## an overview of the model as a block diagram as well as model performance on the development vs test set

In [None]:
# Plot the model, passing the development set and the test set
best_model.plot(devset_imputed, testset_imputed)

In [None]:
# Display the features attribute of the final model
best_model.features

In [None]:
# Model features followed by the outcome column name. A new list is built so
# that nothing is appended to the object returned by best_model.features
# (which would also grow by one entry on every re-run of this cell if that
# object is shared with the model).
sel_feats_list = list(best_model.features) + [outcome_var]
print(sel_feats_list)


### distribution of model predicted probabilities for each class

In [None]:
# import matplotlib.pyplot as plt
# Plot the model's probability scores on the test set
best_model.plot_probability_scores(testset_imputed)


### model representation as a closed-form expression

In [None]:
# Convert the model to a symbolic form, requesting symbolic_lr and the weights
sympy_model = best_model.sympify(symbolic_lr=True, include_weights=True)

# Display the symbolic form as an expression
sympy_model.as_expr()

### alternative models


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import catboost as cb
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score

# Encode development and test features together so that both receive the same dummy columns
combined = pd.concat([X_train_imputed, X_test_imputed], axis=0)

# Columns stored as object or category are the ones to one-hot encode
columns_to_encode = [
    name
    for name in combined.columns
    if combined[name].dtype == 'object' or combined[name].dtype.name == 'category'
]

# Encode one column at a time, dropping the first level of each
for name in columns_to_encode:
    combined = pd.get_dummies(combined, columns=[name], prefix=[name], drop_first=True)

# Cut the stacked table back into its development and test parts by row count
n_train_rows = len(X_train_imputed)
X_train_OHE = combined[:n_train_rows]
X_test_OHE = combined[n_train_rows:]


In [None]:
# Preview the first rows of the one-hot encoded development features
X_train_OHE.head()

In [None]:
# Count the missing values per column in the one-hot encoded development features
X_train_OHE.isnull().sum()

### stratified 5-fold cross validation 

Here we perform cross-validation to estimate how each model may perform on the test set.
This is done for each of the alternative models, that is, Random Forest, LightGBM, CatBoost, and Logistic Regression.

#### Balanced Random Forest Classifier

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Stratified k-fold cross-validation of a Balanced Random Forest on the
# one-hot-encoded training data.

# Define the number of folds for stratified k-fold
n_splits = 5

# Balanced Random Forest hyperparameters
random_state = SEED
sampling_strategy = 'not majority'
replacement = True
n_estimators = 100       # number of trees in the forest
max_depth = 10           # maximum depth of each tree
min_samples_split = 2    # minimum number of samples required to split a node
min_samples_leaf = 1     # minimum number of samples required at each leaf node
criterion = "gini"

# Metrics to evaluate; the order must match `fold_results` inside the loop
metrics = ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC']

# Per-metric lists holding the unrounded value of every fold
results = {metric: [] for metric in metrics}

# Initialize stratified k-fold
skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

# `skf.split` yields row positions, so the weights are indexed positionally
# even if `sample_weights` is a pandas Series with a non-default index.
sample_weights_array = np.asarray(sample_weights)

# One single-row table per fold, concatenated after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
fold_tables = []

# Perform stratified k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(X_train_OHE, y_train), 1):
    print(f"Fold {fold}:")

    # Row positions of the training and held-out part of this fold
    X_train_indices, X_test_indices = train_index.tolist(), test_index.tolist()

    # Training and held-out data for this fold
    X_train_fold = X_train_OHE.iloc[X_train_indices, :]
    X_test_fold = X_train_OHE.iloc[X_test_indices, :]
    y_train_fold = y_train.iloc[X_train_indices]
    y_test_fold = y_train.iloc[X_test_indices]
    sample_weights_fold = sample_weights_array[X_train_indices]

    # Train the balanced random forest
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
        sampling_strategy=sampling_strategy,
        replacement=replacement,
        criterion=criterion,
    )
    brf.fit(X_train_fold, y_train_fold, sample_weight=sample_weights_fold)

    # Second predict_proba column, thresholded at 0.5 for the class label
    brf_predictions = brf.predict_proba(X_test_fold)[:, 1]
    brf_predictions_class = brf_predictions >= 0.5

    # Confusion-matrix counts as floats: the MCC denominator multiplies four
    # marginal totals, which can silently overflow int64 on large folds.
    cm = confusion_matrix(y_test_fold, brf_predictions_class)
    tn, fp, fn, tp = cm.ravel().astype(float)

    # Calculate metrics
    PPV = tp / (tp + fp)
    NPV = tn / (tn + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    MCC = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    auc = roc_auc_score(y_test_fold, brf_predictions)

    # Store the results for this fold
    fold_results = [PPV, NPV, sensitivity, specificity, balanced_accuracy, MCC, auc]
    for metric, value in zip(metrics, fold_results):
        results[metric].append(value)

    # Print the (rounded) results for this fold and keep them as one table row
    fold_results_df = pd.DataFrame(fold_results, index=metrics, columns=[fold]).round(2)
    print(fold_results_df.to_string())
    fold_tables.append(fold_results_df.T)
    print("")

# Per-fold table: one row per fold, one column per metric
fold_results_df_fold_table_BRF = pd.concat(fold_tables)

# Calculate the average and standard deviation across folds
results_df = pd.DataFrame(results)
aggregated_results = results_df.mean().round(2)
aggregated_results_sd = results_df.std().round(2)
aggregated_results_formatted_BRF = aggregated_results.astype(str) + " ± " + aggregated_results_sd.astype(str)
# The misspelled name is kept as well because the summary cell further down refers to it.
ggregated_results_formatted_BRF = aggregated_results_formatted_BRF

# Print the aggregated results
print("Aggregated Results:")
print(aggregated_results_formatted_BRF.to_string())



#### Histogram-based Gradient Boosting Classification Tree 

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Stratified k-fold cross-validation of a histogram-based gradient boosting
# classifier on the one-hot-encoded training data.

# Define the number of folds for stratified k-fold
n_splits = 5

# Metrics to evaluate; the order must match `fold_results` inside the loop
metrics = ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC']

# Per-metric lists holding the unrounded value of every fold
results = {metric: [] for metric in metrics}

# Initialize stratified k-fold
skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

# `skf.split` yields row positions, so the weights are indexed positionally
# even if `sample_weights` is a pandas Series with a non-default index.
sample_weights_array = np.asarray(sample_weights)

# One single-row table per fold, concatenated after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
fold_tables = []

# Perform stratified k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(X_train_OHE, y_train), 1):
    print(f"Fold {fold}:")

    # Row positions of the training and held-out part of this fold
    X_train_indices, X_test_indices = train_index.tolist(), test_index.tolist()

    # Training and held-out data for this fold
    X_train_fold = X_train_OHE.iloc[X_train_indices, :]
    X_test_fold = X_train_OHE.iloc[X_test_indices, :]
    y_train_fold = y_train.iloc[X_train_indices]
    y_test_fold = y_train.iloc[X_test_indices]
    sample_weights_fold = sample_weights_array[X_train_indices]

    # Train the histogram-based gradient boosting classifier
    HGBC = HistGradientBoostingClassifier(random_state=SEED)
    HGBC.fit(X_train_fold, y_train_fold, sample_weight=sample_weights_fold)

    # Second predict_proba column, thresholded at 0.5 for the class label
    HGBC_predictions = HGBC.predict_proba(X_test_fold)[:, 1]
    HGBC_predictions_class = HGBC_predictions >= 0.5

    # Confusion-matrix counts as floats: the MCC denominator multiplies four
    # marginal totals, which can silently overflow int64 on large folds.
    cm = confusion_matrix(y_test_fold, HGBC_predictions_class)
    tn, fp, fn, tp = cm.ravel().astype(float)

    # Calculate metrics
    PPV = tp / (tp + fp)
    NPV = tn / (tn + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    MCC = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    auc = roc_auc_score(y_test_fold, HGBC_predictions)

    # Store the results for this fold
    fold_results = [PPV, NPV, sensitivity, specificity, balanced_accuracy, MCC, auc]
    for metric, value in zip(metrics, fold_results):
        results[metric].append(value)

    # Print the (rounded) results for this fold and keep them as one table row
    fold_results_df = pd.DataFrame(fold_results, index=metrics, columns=[fold]).round(2)
    print(fold_results_df.to_string())
    fold_tables.append(fold_results_df.T)
    print("")

# Per-fold table: one row per fold, one column per metric
fold_results_df_fold_table_HGBC = pd.concat(fold_tables)

# Calculate the average and standard deviation across folds
results_df = pd.DataFrame(results)
aggregated_results = results_df.mean().round(2)
aggregated_results_sd = results_df.std().round(2)
aggregated_results_formatted_HGBC = aggregated_results.astype(str) + " ± " + aggregated_results_sd.astype(str)

# Print the aggregated results
print("Aggregated Results:")
print(aggregated_results_formatted_HGBC.to_string())



#### LightGBM

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

# Stratified 5-fold cross-validation of LightGBM on the training data that
# still contains missing values (`X_train_withmissing`).

# Metrics to evaluate; the order must match `fold_results` inside the loop
metrics = ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC']

# Per-metric lists holding the unrounded value of every fold
results = {metric: [] for metric in metrics}

# `skf.split` yields row positions, so the weights are indexed positionally
# even if `sample_weights` is a pandas Series with a non-default index.
sample_weights_array = np.asarray(sample_weights)

# One single-row table per fold, concatenated after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
fold_tables = []

# Perform stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(X_train_withmissing, y_train), 1):
    print(f"Fold {fold}:")

    # Row positions of the training and held-out part of this fold
    X_train_indices, X_test_indices = train_index.tolist(), test_index.tolist()

    # Training and held-out data for this fold
    X_train_fold = X_train_withmissing.iloc[X_train_indices, :]
    X_test_fold = X_train_withmissing.iloc[X_test_indices, :]
    y_train_fold = y_train.iloc[X_train_indices]
    y_test_fold = y_train.iloc[X_test_indices]
    sample_weights_fold = sample_weights_array[X_train_indices]

    # Train LightGBM
    lgbm = lgb.LGBMClassifier(random_state=SEED)
    lgbm.fit(X_train_fold, y_train_fold, sample_weight=sample_weights_fold)

    # Second predict_proba column, thresholded at 0.5 for the class label
    lgbm_predictions = lgbm.predict_proba(X_test_fold)[:, 1]
    lgbm_predictions_class = lgbm_predictions >= 0.5

    # Confusion-matrix counts as floats: the MCC denominator multiplies four
    # marginal totals, which can silently overflow int64 on large folds.
    cm = confusion_matrix(y_test_fold, lgbm_predictions_class)
    tn, fp, fn, tp = cm.ravel().astype(float)

    # Calculate metrics
    PPV = tp / (tp + fp)
    NPV = tn / (tn + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    MCC = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    auc = roc_auc_score(y_test_fold, lgbm_predictions)

    # Store the results for this fold
    fold_results = [PPV, NPV, sensitivity, specificity, balanced_accuracy, MCC, auc]
    for metric, value in zip(metrics, fold_results):
        results[metric].append(value)

    # Print the (rounded) results for this fold and keep them as one table row
    fold_results_df = pd.DataFrame(fold_results, index=metrics, columns=[fold]).round(2)
    print(fold_results_df.to_string())
    fold_tables.append(fold_results_df.T)

    print("")

# Per-fold table: one row per fold, one column per metric
fold_results_df_fold_table_LGBM = pd.concat(fold_tables)

# Calculate the average and standard deviation across folds
results_df = pd.DataFrame(results)
aggregated_results = results_df.mean().round(2)
aggregated_results_sd = results_df.std().round(2)
aggregated_results_formatted_LGBM = aggregated_results.astype(str) + " ± " + aggregated_results_sd.astype(str)

# Print the aggregated results
print("Aggregated Results:")
print(aggregated_results_formatted_LGBM.to_string())



#### CATBoost

In [None]:
# Stratified 5-fold cross-validation of CatBoost on the training data that
# still contains missing values (`X_train_withmissing`).

# Defined here so the cell does not depend on `metrics` / `skf` left behind by
# another cell (the plotting cells rebind `metrics` to the sklearn.metrics module).
metrics = ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC']
skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)

# Per-metric lists holding the unrounded value of every fold
results = {metric: [] for metric in metrics}

# `skf.split` yields row positions, so the weights are indexed positionally
# even if `sample_weights` is a pandas Series with a non-default index.
sample_weights_array = np.asarray(sample_weights)

# Not used in this cell; computed once instead of once per fold.
# NOTE(review): kept only because notebook cells share globals -- remove if nothing else reads it.
y_train_reset_index = y_train.reset_index(drop=True)

# One single-row table per fold, concatenated after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
fold_tables = []

# Perform stratified k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(X_train_withmissing, y_train), 1):
    print(f"Fold {fold}:")

    # Row positions of the training and held-out part of this fold
    X_train_indices, X_test_indices = train_index.tolist(), test_index.tolist()

    # Training and held-out data for this fold
    X_train_fold = X_train_withmissing.iloc[X_train_indices, :]
    X_test_fold = X_train_withmissing.iloc[X_test_indices, :]
    y_train_fold = y_train.iloc[X_train_indices]
    y_test_fold = y_train.iloc[X_test_indices]
    sample_weights_fold = sample_weights_array[X_train_indices]

    # Train CatBoost
    catb = cb.CatBoostClassifier(random_state=SEED, cat_features=cat_features, iterations=500, verbose=False)
    catb.fit(X_train_fold, y_train_fold, sample_weight=sample_weights_fold)

    # Second predict_proba column, thresholded at 0.5 for the class label
    catb_predictions = catb.predict_proba(X_test_fold)[:, 1]
    catb_predictions_class = catb_predictions >= 0.5

    # Confusion-matrix counts as floats: the MCC denominator multiplies four
    # marginal totals, which can silently overflow int64 on large folds.
    cm = confusion_matrix(y_test_fold, catb_predictions_class)
    tn, fp, fn, tp = cm.ravel().astype(float)

    # Calculate metrics
    PPV = tp / (tp + fp)
    NPV = tn / (tn + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    MCC = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    auc = roc_auc_score(y_test_fold, catb_predictions)

    # Store the results for this fold
    fold_results = [PPV, NPV, sensitivity, specificity, balanced_accuracy, MCC, auc]
    for metric, value in zip(metrics, fold_results):
        results[metric].append(value)

    # Print the (rounded) results for this fold and keep them as one table row
    fold_results_df = pd.DataFrame(fold_results, index=metrics, columns=[fold]).round(2)
    print(fold_results_df.to_string())
    fold_tables.append(fold_results_df.T)
    print("")

# Per-fold table: one row per fold, one column per metric
fold_results_df_fold_table_CB = pd.concat(fold_tables)

# Calculate the average and standard deviation across folds
results_df = pd.DataFrame(results)
aggregated_results = results_df.mean().round(2)
aggregated_results_sd = results_df.std().round(2)
aggregated_results_formatted_CB = aggregated_results.astype(str) + " ± " + aggregated_results_sd.astype(str)

# Print the aggregated results
print("Aggregated Results:")
print(aggregated_results_formatted_CB.to_string())



In [None]:
# Side-by-side table of the cross-validated metrics ("mean ± SD" across folds),
# one column per model and one row per metric.
# `ggregated_results_formatted_BRF` is the (misspelled) name assigned by the
# Balanced Random Forest cross-validation cell.
aggregated_results_formatted_all = pd.concat(
    [
        aggregated_results_formatted_QLattice,
        ggregated_results_formatted_BRF,
        aggregated_results_formatted_HGBC,
        aggregated_results_formatted_LGBM,
        aggregated_results_formatted_CB,
    ],
    axis=1,
)
aggregated_results_formatted_all.columns = ["QLattice", "BRF", "HGBC", "LGBM", "CB"]
print(aggregated_results_formatted_all)

# Save the results to an Excel file.  The index holds the metric names, so it
# is written out as well; without it the rows of the sheet would be unlabeled.
aggregated_results_formatted_all.to_excel('aggregated_results_formatted_all.xlsx', index_label='Metric')

### statistical test to compare the performance of the models on cross validation

The Kruskal-Wallis test allows us to compare the per-fold AUC values of multiple models without relying on the assumptions of normality and homogeneity of variances. It is a rank-based, nonparametric test of whether the AUC values of all models come from the same distribution (loosely, whether their medians are equal), so it assesses whether there are significant differences between the models in terms of their performance.

The Kruskal-Wallis test is a nonparametric equivalent of the ANOVA test and is suitable when the assumptions of normality and homogeneity of variances are not met.

Here's an outline of the steps to perform a Kruskal-Wallis test:

Null Hypothesis (H0): The AUC values of all models come from the same distribution (loosely, the median AUC is the same for every model).
Alternative Hypothesis (HA): The AUC distribution of at least one model differs from the others.

Collect the AUC value of each cross-validation fold for each model (one group of fold-level AUC values per model).

Perform a Kruskal-Wallis test, which tests for differences in the distribution of a continuous variable (AUC) among multiple groups (models).

Calculate the test statistic (H-statistic) and obtain the corresponding p-value.

Interpret the results:

If the p-value is less than a predetermined significance level (e.g., 0.05), reject the null hypothesis. It suggests that the AUC values of at least one model differ significantly from the others.
If the p-value is greater than or equal to the significance level, fail to reject the null hypothesis. It indicates that there is no evidence of a significant difference between the AUC values of the models.
If the null hypothesis is rejected (i.e., significant differences exist), you can perform post-hoc tests to determine which specific models are significantly different from each other. A common post-hoc test for nonparametric data is the Dunn test, with a multiple-comparison adjustment such as the Bonferroni correction applied to the pairwise p-values.

In [None]:
import numpy as np
from scipy.stats import kruskal

# Per-fold AUC values of every model, read from the cross-validation fold tables
model1_auc, model2_auc, model3_auc, model4_auc, model5_auc = (
    fold_table['AUC'].values
    for fold_table in (
        fold_results_df_fold_table_LGBM,
        fold_results_df_fold_table_CB,
        fold_results_df_fold_table_HGBC,
        fold_results_df_fold_table_QLattice,
        fold_results_df_fold_table_BRF,
    )
)

# Kruskal-Wallis H-test across the five groups of fold-level AUC values
statistic, p_value = kruskal(model1_auc, model2_auc, model3_auc, model4_auc, model5_auc)

# Significance level used for the verdict below
alpha = 0.05

# NOTE(review): Kruskal-Wallis compares rank distributions rather than means;
# the wording of the messages is left unchanged here.
verdict = (
    "At least one model's mean AUC value is significantly different from the others."
    if p_value < alpha
    else "No significant difference between the mean AUC values of the models."
)
print(verdict)

print(f"Kruskal-Wallis test statistic: {statistic}")
print(f"P-value: {p_value}")


### Model training using the whole training set

In [None]:
# Fit each alternative model once on the complete training set.
from sklearn.ensemble import HistGradientBoostingClassifier

# --- Balanced random forest (one-hot-encoded, imputed features) ---
# Hyperparameters (same values as in the cross-validation cell)
random_state = SEED
sampling_strategy = 'not majority'
replacement = True
n_estimators = 100       # number of trees in the forest
max_depth = 10           # maximum depth of each tree
min_samples_split = 2    # minimum number of samples required to split a node
min_samples_leaf = 1     # minimum number of samples required at each leaf node
criterion = "gini"

brf_params = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "random_state": random_state,
    "sampling_strategy": sampling_strategy,
    "replacement": replacement,
    "criterion": criterion,
}
brf = BalancedRandomForestClassifier(**brf_params)
brf.fit(X_train_OHE, y_train, sample_weight=sample_weights)

# --- Histogram-based gradient boosting (one-hot-encoded, imputed features) ---
HGBC = HistGradientBoostingClassifier(random_state=SEED)
HGBC.fit(X_train_OHE, y_train, sample_weight=sample_weights)

# --- LightGBM (features with missing values left in) ---
lgbm = lgb.LGBMClassifier(random_state=SEED)
lgbm.fit(X_train_withmissing, y_train, sample_weight=sample_weights)

# --- CatBoost (features with missing values left in, categorical columns declared) ---
catb = cb.CatBoostClassifier(random_state=SEED, cat_features=cat_features, iterations=500, verbose=False)
catb.fit(X_train_withmissing, y_train, sample_weight=sample_weights)

### evaluate alternative models on the test set

### BRF: balanced random forest classifier 

In [None]:
# BRF: balanced random forest classifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import pandas as pd

# Evaluate the balanced random forest on the test set.

# Second predict_proba column (the positive class for a boolean / 0-1 target),
# thresholded at 0.5 to obtain the predicted label
predictions = brf.predict_proba(X_test_OHE)[:, 1]
predictions_class = (predictions >= 0.5).tolist()

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_class)

# True negative, false positive, false negative and true positive counts.
# Cast to float: the MCC denominator multiplies four marginal totals, which
# can silently overflow int64 on a large test set.
tn, fp, fn, tp = cm.ravel().astype(float)

# Calculate PPV, NPV, sensitivity, specificity, balanced accuracy, MCC and AUC
PPV_test = tp / (tp + fp)
NPV_test = tn / (tn + fn)
sensitivity_test = tp / (tp + fn)
specificity_test = tn / (tn + fp)
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2
MCC_test = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
auc = roc_auc_score(y_test, predictions)

# Create a dictionary to hold the results
results = {
    'Metric': ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC'],
    'Value': [PPV_test, NPV_test, sensitivity_test, specificity_test, balanced_accuracy_test, MCC_test, auc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_BRF = results_df.copy()
print(results_df)


In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# ROC curve and confusion matrix of the balanced random forest on the test set.
# `predictions` / `predictions_class` are the BRF outputs computed in the
# evaluation cell directly above.

# Calculate ROC curve and AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
roc_auc = metrics.auc(fpr, tpr)

# Set up the figure and axes: ROC curve on the left, confusion matrix on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4.5))

# Plot ROC curve
ax1.plot(fpr, tpr, color='blue', label='AUC = %0.2f' % roc_auc)
ax1.plot([0, 1], [0, 1], color='grey', linestyle='--')  # chance diagonal
ax1.set_xlim([0, 1])
ax1.set_ylim([0, 1])
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC curve')
ax1.legend(loc="lower right")

# Plot Confusion Matrix from the BRF predictions, so that it matches the
# reported BRF metrics (it must not be drawn from the HGBC estimator).
ax2.set_title('Confusion matrix')
disp = ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=predictions_class, cmap='Blues', ax=ax2)

# Adjust spacing between subplots
plt.subplots_adjust(wspace=0.3)

# Save the figure as a PNG file
plt.savefig('ROC_CM_BRF.png', dpi=300)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Patient-weighted performance of the balanced random forest on the test set.
#
# Every patient gets a total weight of 1, shared equally between that patient's
# test rows, and the weights are applied per row through a weighted confusion
# matrix.  (Scaling the aggregate counts by the weights instead would cancel
# out in every ratio and just reproduce the unweighted metrics.)
# NOTE(review): assumes `testset_ID` is row-aligned with `y_test` and
# `predictions_class` -- confirm no test rows were dropped or reordered.

# Weight per patient: 1 / (number of test rows of that patient)
patient_sample_weights = testset_ID.value_counts().apply(lambda x: 1 / x)

if len(testset_ID) != len(predictions_class):
    raise ValueError(
        "testset_ID and the test-set predictions differ in length; "
        "cannot assign per-patient weights to rows"
    )

# Expand to one weight per test row
row_weights = testset_ID.map(patient_sample_weights).to_numpy()

# Weighted true negative, false positive, false negative and true positive totals
w_tn, w_fp, w_fn, w_tp = confusion_matrix(
    y_test, predictions_class, sample_weight=row_weights
).ravel()

# Performance metrics on the weighted totals
weighted_ppv = w_tp / (w_tp + w_fp)
weighted_npv = w_tn / (w_tn + w_fn)
weighted_sensitivity = w_tp / (w_tp + w_fn)
weighted_specificity = w_tn / (w_tn + w_fp)
weighted_balanced_accuracy = (weighted_sensitivity + weighted_specificity) / 2
weighted_mcc = (w_tp * w_tn - w_fp * w_fn) / (
    (w_tp + w_fp) * (w_tp + w_fn) * (w_tn + w_fp) * (w_tn + w_fn)
) ** 0.5

# Create a dictionary to hold the results
results = {
    'Metric': ['Weighted PPV', 'Weighted NPV', 'Weighted Sensitivity', 'Weighted Specificity', 'Weighted Balanced Accuracy', 'Weighted MCC'],
    'Value': [weighted_ppv, weighted_npv, weighted_sensitivity, weighted_specificity, weighted_balanced_accuracy, weighted_mcc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_BRF_weighted = results_df.copy()
print(results_df)


### HGBC: Histogram-based Gradient Boosting Classification Tree

In [None]:
# HGBC: Histogram-based Gradient Boosting Classification Tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import pandas as pd

# Evaluate the histogram-based gradient boosting classifier on the test set.

# Second predict_proba column (the positive class for a boolean / 0-1 target),
# thresholded at 0.5 to obtain the predicted label
predictions = HGBC.predict_proba(X_test_OHE)[:, 1]
predictions_class = (predictions >= 0.5).tolist()

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_class)

# True negative, false positive, false negative and true positive counts.
# Cast to float: the MCC denominator multiplies four marginal totals, which
# can silently overflow int64 on a large test set.
tn, fp, fn, tp = cm.ravel().astype(float)

# Calculate PPV, NPV, sensitivity, specificity, balanced accuracy, MCC and AUC
PPV_test = tp / (tp + fp)
NPV_test = tn / (tn + fn)
sensitivity_test = tp / (tp + fn)
specificity_test = tn / (tn + fp)
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2
MCC_test = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
auc = roc_auc_score(y_test, predictions)

# Create a dictionary to hold the results
results = {
    'Metric': ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC'],
    'Value': [PPV_test, NPV_test, sensitivity_test, specificity_test, balanced_accuracy_test, MCC_test, auc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_HGBC = results_df.copy()
print(results_df)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Patient-weighted performance of the histogram-based gradient boosting
# classifier on the test set.
#
# Every patient gets a total weight of 1, shared equally between that patient's
# test rows, and the weights are applied per row through a weighted confusion
# matrix.  (Scaling the aggregate counts by the weights instead would cancel
# out in every ratio and just reproduce the unweighted metrics.)
# NOTE(review): assumes `testset_ID` is row-aligned with `y_test` and
# `X_test_OHE` -- confirm no test rows were dropped or reordered.

# Get predictions for the test set (second predict_proba column, threshold 0.5)
predictions = HGBC.predict_proba(X_test_OHE)[:, 1]
predictions_class = (predictions >= 0.5).tolist()

# Unweighted confusion matrix and its four counts
cm = confusion_matrix(y_test, predictions_class)
tn, fp, fn, tp = cm.ravel()

# Weight per patient: 1 / (number of test rows of that patient)
patient_sample_weights = testset_ID.value_counts().apply(lambda x: 1 / x)

if len(testset_ID) != len(predictions_class):
    raise ValueError(
        "testset_ID and the test-set predictions differ in length; "
        "cannot assign per-patient weights to rows"
    )

# Expand to one weight per test row
row_weights = testset_ID.map(patient_sample_weights).to_numpy()

# Weighted true negative, false positive, false negative and true positive totals
w_tn, w_fp, w_fn, w_tp = confusion_matrix(
    y_test, predictions_class, sample_weight=row_weights
).ravel()

# Performance metrics on the weighted totals
weighted_ppv = w_tp / (w_tp + w_fp)
weighted_npv = w_tn / (w_tn + w_fn)
weighted_sensitivity = w_tp / (w_tp + w_fn)
weighted_specificity = w_tn / (w_tn + w_fp)
weighted_balanced_accuracy = (weighted_sensitivity + weighted_specificity) / 2
weighted_mcc = (w_tp * w_tn - w_fp * w_fn) / (
    (w_tp + w_fp) * (w_tp + w_fn) * (w_tn + w_fp) * (w_tn + w_fn)
) ** 0.5

# Create a dictionary to hold the results
results = {
    'Metric': ['Weighted PPV', 'Weighted NPV', 'Weighted Sensitivity', 'Weighted Specificity', 'Weighted Balanced Accuracy', 'Weighted MCC'],
    'Value': [weighted_ppv, weighted_npv, weighted_sensitivity, weighted_specificity, weighted_balanced_accuracy, weighted_mcc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_HGBC_weighted = results_df.copy()
print(results_df)


In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# ROC curve and confusion matrix of the HGBC model on the test set, using the
# `predictions` / `predictions_class` computed in the cells above.

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
roc_auc = metrics.auc(fpr, tpr)

# One row, two panels: ROC curve on the left, confusion matrix on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4.5))

# Left panel: ROC curve plus the chance diagonal
roc_label = 'AUC = %0.2f' % roc_auc
ax1.plot(fpr, tpr, color='blue', label=roc_label)
ax1.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax1.set(
    xlim=[0, 1],
    ylim=[0, 1],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC curve',
)
ax1.legend(loc="lower right")

# Right panel: confusion matrix of the thresholded predictions
ax2.set_title('Confusion matrix')
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test, y_pred=predictions_class, cmap='Blues', ax=ax2
)

# Widen the gap between the panels, then write the figure to a PNG file
fig.subplots_adjust(wspace=0.3)
fig.savefig('ROC_CM_HGBC.png', dpi=300)


### LightGBM

In [None]:
# light GBM
from sklearn.metrics import confusion_matrix
import pandas as pd

# Evaluate LightGBM on the test set.

# Second predict_proba column (the positive class for a boolean / 0-1 target),
# thresholded at 0.5 to obtain the predicted label
predictions = lgbm.predict_proba(X_test_withmissing)[:, 1]
predictions_class = (predictions >= 0.5).tolist()

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_class)

# True negative, false positive, false negative and true positive counts.
# Cast to float: the MCC denominator multiplies four marginal totals, which
# can silently overflow int64 on a large test set.
tn, fp, fn, tp = cm.ravel().astype(float)

# Calculate PPV, NPV, sensitivity, specificity, balanced accuracy, MCC and AUC
PPV_test = tp / (tp + fp)
NPV_test = tn / (tn + fn)
sensitivity_test = tp / (tp + fn)
specificity_test = tn / (tn + fp)
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2
MCC_test = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
auc = roc_auc_score(y_test, predictions)

# Create a dictionary to hold the results
results = {
    'Metric': ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC'],
    'Value': [PPV_test, NPV_test, sensitivity_test, specificity_test, balanced_accuracy_test, MCC_test, auc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_LGBM = results_df.copy()
print(results_df)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Patient-weighted performance of LightGBM on the test set.
#
# Every patient gets a total weight of 1, shared equally between that patient's
# test rows, and the weights are applied per row through a weighted confusion
# matrix.  (Scaling the aggregate counts by the weights instead would cancel
# out in every ratio and just reproduce the unweighted metrics.)
# NOTE(review): assumes `testset_ID` is row-aligned with `y_test` and
# `predictions_class` -- confirm no test rows were dropped or reordered.

# Weight per patient: 1 / (number of test rows of that patient)
patient_sample_weights = testset_ID.value_counts().apply(lambda x: 1 / x)

if len(testset_ID) != len(predictions_class):
    raise ValueError(
        "testset_ID and the test-set predictions differ in length; "
        "cannot assign per-patient weights to rows"
    )

# Expand to one weight per test row
row_weights = testset_ID.map(patient_sample_weights).to_numpy()

# Weighted true negative, false positive, false negative and true positive totals
w_tn, w_fp, w_fn, w_tp = confusion_matrix(
    y_test, predictions_class, sample_weight=row_weights
).ravel()

# Performance metrics on the weighted totals
weighted_ppv = w_tp / (w_tp + w_fp)
weighted_npv = w_tn / (w_tn + w_fn)
weighted_sensitivity = w_tp / (w_tp + w_fn)
weighted_specificity = w_tn / (w_tn + w_fp)
weighted_balanced_accuracy = (weighted_sensitivity + weighted_specificity) / 2
weighted_mcc = (w_tp * w_tn - w_fp * w_fn) / (
    (w_tp + w_fp) * (w_tp + w_fn) * (w_tn + w_fp) * (w_tn + w_fn)
) ** 0.5

# Create a dictionary to hold the results
results = {
    'Metric': ['Weighted PPV', 'Weighted NPV', 'Weighted Sensitivity', 'Weighted Specificity', 'Weighted Balanced Accuracy', 'Weighted MCC'],
    'Value': [weighted_ppv, weighted_npv, weighted_sensitivity, weighted_specificity, weighted_balanced_accuracy, weighted_mcc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_LGBM_weighted = results_df.copy()
print(results_df)


In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# ROC curve and confusion matrix of the LightGBM model on the test set, using
# the `predictions` / `predictions_class` computed in the cells above.

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
roc_auc = metrics.auc(fpr, tpr)

# One row, two panels: ROC curve on the left, confusion matrix on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4.5))

# Left panel: ROC curve plus the chance diagonal
roc_label = 'AUC = %0.2f' % roc_auc
ax1.plot(fpr, tpr, color='blue', label=roc_label)
ax1.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax1.set(
    xlim=[0, 1],
    ylim=[0, 1],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC curve',
)
ax1.legend(loc="lower right")

# Right panel: confusion matrix of the thresholded predictions
ax2.set_title('Confusion matrix')
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test, y_pred=predictions_class, cmap='Blues', ax=ax2
)

# Widen the gap between the panels, then write the figure to a PNG file
fig.subplots_adjust(wspace=0.3)
fig.savefig('ROC_CM_LGBM.png', dpi=300)

### CATBoost

In [None]:
# CATBOOST
from sklearn.metrics import confusion_matrix
import pandas as pd

# Evaluate CatBoost on the test set.

# Second predict_proba column (the positive class for a boolean / 0-1 target),
# thresholded at 0.5 to obtain the predicted label
predictions = catb.predict_proba(X_test_withmissing)[:, 1]
predictions_class = (predictions >= 0.5).tolist()

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_class)

# True negative, false positive, false negative and true positive counts.
# Cast to float: the MCC denominator multiplies four marginal totals, which
# can silently overflow int64 on a large test set.
tn, fp, fn, tp = cm.ravel().astype(float)

# Calculate PPV, NPV, sensitivity, specificity, balanced accuracy, MCC and AUC
PPV_test = tp / (tp + fp)
NPV_test = tn / (tn + fn)
sensitivity_test = tp / (tp + fn)
specificity_test = tn / (tn + fp)
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2
MCC_test = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
auc = roc_auc_score(y_test, predictions)

# Create a dictionary to hold the results
results = {
    'Metric': ['PPV', 'NPV', 'Sensitivity', 'Specificity', 'Balanced Accuracy', 'MCC', 'AUC'],
    'Value': [PPV_test, NPV_test, sensitivity_test, specificity_test, balanced_accuracy_test, MCC_test, auc],
}

# Create a DataFrame to display the results, rounded to two decimal places
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_CB = results_df.copy()
print(results_df)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Per-patient weight: the reciprocal of the number of test-set rows sharing
# that patient's ID (a Series indexed by patient ID)
patient_sample_weights = 1 / testset_ID.value_counts()

# Scale each confusion-matrix total (tp/tn/fp/fn as left by the CatBoost
# metrics cell) by the per-patient weights
w_tp = tp * patient_sample_weights
w_tn = tn * patient_sample_weights
w_fp = fp * patient_sample_weights
w_fn = fn * patient_sample_weights

# NOTE(review): tp/tn/fp/fn are single totals, so every count is multiplied by
# the same weight vector and the weight sums cancel in each ratio below. The
# "weighted" PPV/NPV/sensitivity/specificity therefore equal their unweighted
# counterparts, and the weighted MCC equals the unweighted MCC times
# sum(w) / sum(w**2). If per-patient weighting is intended, the weights would
# have to enter the confusion matrix per sample (e.g. via
# confusion_matrix(..., sample_weight=...)) -- confirm the intent and that
# testset_ID is row-aligned with y_test before changing this.
weighted_ppv = np.sum(w_tp) / np.sum(w_tp + w_fp)
weighted_npv = np.sum(w_tn) / np.sum(w_tn + w_fn)
weighted_sensitivity = np.sum(w_tp) / np.sum(w_tp + w_fn)
weighted_specificity = np.sum(w_tn) / np.sum(w_tn + w_fp)
weighted_balanced_accuracy = (weighted_sensitivity + weighted_specificity) / 2
weighted_mcc = np.sum((tp * tn - fp * fn) * patient_sample_weights) / np.sum(
    ((w_tp + w_fp) * (w_tp + w_fn) * (w_tn + w_fp) * (w_tn + w_fn)) ** 0.5
)

# Collect (label, value) pairs, then split them into the two table columns
weighted_metrics = [
    ('Weighted PPV', weighted_ppv),
    ('Weighted NPV', weighted_npv),
    ('Weighted Sensitivity', weighted_sensitivity),
    ('Weighted Specificity', weighted_specificity),
    ('Weighted Balanced Accuracy', weighted_balanced_accuracy),
    ('Weighted MCC', weighted_mcc),
]
results = {
    'Metric': [label for label, _ in weighted_metrics],
    'Value': [value for _, value in weighted_metrics],
}

# Tabulate, round to two decimals and keep a CatBoost-specific copy for the
# cross-model comparison table
results_df = pd.DataFrame(results)
results_df['Value'] = results_df['Value'].round(2)
results_df_CB_weighted = results_df.copy()
print(results_df)


In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# ROC curve points and the area under them, from the probabilities and class
# predictions left by the CatBoost metrics cell
fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
roc_auc = metrics.auc(fpr, tpr)

# One row, two panels: ROC curve on the left, confusion matrix on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4.5))

# Left panel: ROC curve plus the grey chance diagonal for reference
ax1.plot(fpr, tpr, color='blue', label='AUC = %0.2f' % roc_auc)
ax1.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax1.set(
    xlim=[0, 1],
    ylim=[0, 1],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC curve',
)
ax1.legend(loc="lower right")

# Right panel: confusion matrix of the thresholded predictions
ax2.set_title('Confusion matrix')
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=predictions_class,
    cmap='Blues',
    ax=ax2,
)

# Widen the gap between the panels, then write the figure to disk
fig.subplots_adjust(wspace=0.3)
fig.savefig('ROC_CM_CB.png', dpi=300)

### model performance for the best performing model based on the whole data

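LightGBM was chosen as the best alternative model because it had the highest AUC on the test set. Note that the selection cell below does not hard-code this choice: it picks the model with the highest mean over the last rows of the unweighted results table (MCC and AUC, given the metric order used in the cells above), with QLattice excluded from the comparison.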

In [None]:
# Join the weighted test-set metrics of all five models into one table, matching
# rows on "Metric". The suffixes only keep the intermediate "Value" columns
# distinct; every column is renamed right after the join.
merged_df = (
    results_df_QLattice_weighted
    .merge(results_df_BRF_weighted, on='Metric', suffixes=('_1', '_2'))
    .merge(results_df_HGBC_weighted, on='Metric')
    .merge(results_df_LGBM_weighted, on='Metric', suffixes=('_3', '_4'))
    .merge(results_df_CB_weighted, on='Metric', suffixes=('_4', '_5'))
)

# One column per model, in the order the frames were joined
merged_df.columns = ["Measures", "QLattice", "BRF", "HGBC", "LGBM", "CB"]
aggregated_results_test_all_weighted = merged_df.copy()

print(aggregated_results_test_all_weighted)

# Export the comparison table to Excel and drop the temporary frame
aggregated_results_test_all_weighted.to_excel('aggregated_results_test_all_weighted.xlsx', index=False)
del merged_df

In [None]:
# Join the unweighted test-set metrics of all five models into one table,
# matching rows on "Metric". The suffixes only keep the intermediate "Value"
# columns distinct; every column is renamed right after the join.
merged_df = (
    results_df_QLattice
    .merge(results_df_BRF, on='Metric', suffixes=('_1', '_2'))
    .merge(results_df_HGBC, on='Metric')
    .merge(results_df_LGBM, on='Metric', suffixes=('_3', '_4'))
    .merge(results_df_CB, on='Metric', suffixes=('_4', '_5'))
)

# One column per model, in the order the frames were joined
merged_df.columns = ["Measures", "QLattice", "BRF", "HGBC", "LGBM", "CB"]
aggregated_results_test_all = merged_df.copy()

print(aggregated_results_test_all)

# Export the comparison table to Excel
aggregated_results_test_all.to_excel('aggregated_results_test_all.xlsx', index=False)

In [None]:
# Choose the model used for the SHAP analysis from the unweighted results table.
data = aggregated_results_test_all.copy()

# Fitted candidate models, keyed by their column label in the results table
model_dictionary = dict(BRF=brf, HGBC=HGBC, LGBM=lgbm, CB=catb)

# Mean score per model over the rows from position 5 onward (presumably the MCC
# and AUC rows, given the metric order used when the per-model tables were
# built -- verify if that order changes). Columns 0-1 ("Measures" and
# "QLattice") are skipped, so QLattice is excluded from comparison.
candidate_scores = data.iloc[5:, 2:]
average_values = candidate_scores.mean(axis=0)

# Column label of the model with the highest mean score
highest_average = average_values.idxmax()

print("Selected Model:", highest_average)
selected_model = model_dictionary[highest_average]

### SHAP values association with predicted probabilities

In [None]:
import matplotlib.pyplot as plt
import shap


def _positive_class_shap(raw_shap_values, class_index):
    """Return a 2-D (samples x features) SHAP matrix for one class.

    Depending on the model type and the installed SHAP version,
    ``TreeExplainer.shap_values`` may return a list with one matrix per class,
    a 3-D array whose last axis is the class, or a single 2-D matrix. Blindly
    indexing a 2-D matrix with ``[class_index]`` would select one *sample*
    rather than one class, so each layout is handled explicitly.
    """
    if isinstance(raw_shap_values, list):
        return np.asarray(raw_shap_values[class_index])
    raw_shap_values = np.asarray(raw_shap_values)
    if raw_shap_values.ndim == 3:
        return raw_shap_values[:, :, class_index]
    return raw_shap_values


# Index of the positive class; adjust if the class labels are ordered differently
positive_class_index = 1

# HGBC and BRF are evaluated on the one-hot-encoded test features; the other
# candidate models use the test features with missing values kept
if isinstance(selected_model, (HistGradientBoostingClassifier, BalancedRandomForestClassifier)):
    X_explained = X_test_OHE
else:
    X_explained = X_test_withmissing

# SHAP values for the positive class, always as a samples x features matrix
explainer = shap.TreeExplainer(selected_model)
shap_values = _positive_class_shap(explainer.shap_values(X_explained), positive_class_index)

# Calculate the sum of SHAP values for each sample
shap_sum = shap_values.sum(axis=1)

# Get the predicted probabilities of the positive class from the same features
predicted_probabilities = selected_model.predict_proba(X_explained)[:, positive_class_index]

# Plot the SHAP sum against the predicted probabilities
plt.scatter(shap_sum, predicted_probabilities)
plt.xlabel('Sum of SHAP values')
plt.ylabel('Predicted Probability')
plt.title('Sum of SHAP Values vs. Predicted Probability')
plt.show()


### interpret the model based on SHAP analysis

In [None]:
# %matplotlib notebook

import matplotlib.pyplot as plt
import shap

# Feature importance = mean absolute SHAP value of each feature across samples
abs_shap_values = np.abs(shap_values)
feature_importance = np.mean(abs_shap_values, axis=0)

# Use the column names of the feature matrix the selected model was explained on
if isinstance(selected_model, (HistGradientBoostingClassifier, BalancedRandomForestClassifier)):
    shap_feature_columns = X_test_OHE.columns.tolist()
else:
    shap_feature_columns = X_test_withmissing.columns.tolist()

# Look each column up in data_dictionary, falling back to the raw column name
feature_importance_df = pd.DataFrame({
    'Feature': [data_dictionary.get(feature, feature) for feature in shap_feature_columns],
    'Importance': feature_importance,
})

# Sort the features by importance in descending order
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

# Print the top 10 most important features
top_10_features = feature_importance_df.head(10)
print(top_10_features)

# Reverse the order so the most important feature ends up at the top of the
# horizontal bar chart, and renumber the rows 0..n-1 to use as bar positions
top_10_features = top_10_features.iloc[::-1].reset_index(drop=True)

# Plot the top 10 most important features
plt.figure(figsize=(10, 6))

plt.barh(top_10_features.index, top_10_features['Importance'])
plt.yticks(top_10_features.index, top_10_features['Feature'])

plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 10 Most Important Features')

# The autolayout rcParam is only read when a figure is created, so it affects
# later figures but not this one; it is kept for the cells that follow.
plt.rcParams['figure.autolayout'] = True
# Fit the (possibly long) feature labels inside the current figure before saving
plt.tight_layout()

# Save the figure as a PNG file
plt.savefig('feature_importance_shap_plot.png', dpi=300)

plt.show()


### SHAP summary plot

Note: the plot cannot colour-code categorical features, so they are plotted in grey (not to be mistaken for missing values).

In [None]:
# %matplotlib notebook

import matplotlib.pyplot as plt
import shap

# Use the feature matrix that matches the selected model
if isinstance(selected_model, (HistGradientBoostingClassifier, BalancedRandomForestClassifier)):
    X_summary = X_test_OHE
else:
    X_summary = X_test_withmissing

# Label each feature with its data_dictionary name (falling back to the raw
# column name) followed by its mean absolute SHAP value, rounded to 2 decimals
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
feature_names_with_shapvalues = [
    data_dictionary.get(feature, feature) + ": " + str(round(value, 2))
    for feature, value in zip(X_summary.columns, mean_abs_shap)
]

# Draw the summary plot for the 10 highest-ranked features; show=False keeps
# the figure open so it can be saved below
shap.summary_plot(
    shap_values,
    X_summary,
    feature_names=feature_names_with_shapvalues,
    show=False,
    alpha=0.8,
    max_display=10,
)

# Save the figure as a PNG file
plt.savefig('shap_summary_top10_plot.png', dpi=300)
# NOTE(review): this is set after the figure has been created and saved, so it
# presumably only influences figures created later -- confirm it is still needed
plt.rcParams['figure.autolayout'] = True

# Display the plot
plt.show()

