In [1]:
!pip install shap



In [None]:
!pip install plotly

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default='notebook'
py.init_notebook_mode(connected=True)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, 
                            classification_report,
                            roc_auc_score, roc_curve, auc, precision_recall_curve,
                            confusion_matrix)

from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')



pd.set_option('display.max_rows', 250)

In [None]:
# Raw string: "\p" and "\P" are invalid escape sequences in a normal string
# literal (a warning on recent Python versions). The resulting path is unchanged.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
raw_data = pd.read_csv(r"D:\panda\Patient_DL1.csv")
raw_data

In [None]:
raw_data.shape

In [None]:
raw_data.info(verbose=True)

In [None]:
raw_data.describe()

In [None]:
raw_data.isnull().sum(axis=0).sort_values(ascending=False)

In [None]:
print("Number of rows with missing values:", raw_data.isnull().any(axis=1).sum())

In [None]:
# Features with more than this many missing values are dropped outright.
missing_threshold = 25000

# Identifier / administrative columns that are removed together with the sparse features.
id_and_admin_columns = [
    'encounter_id',
    'hospital_admit_source',
    'icu_admit_source',
    'icu_id',
    'icu_stay_type',
    'patient_id',
    'hospital_id',
    'readmission_status',
]

# Per-column missing counts, computed once and ordered from most to least missing.
missing_per_column = raw_data.isnull().sum(axis=0).sort_values(ascending=False)
large_missing = missing_per_column[missing_per_column > missing_threshold]

print("\nTotal features with more than", missing_threshold, "missing values:", len(large_missing))

raw_data = raw_data.drop(columns=large_missing.index.tolist() + id_and_admin_columns)
raw_data

In [None]:
raw_data.nunique()

In [None]:
# Keep only patients for whom bmi, weight and height are all present.
has_body_measurements = raw_data[['bmi', 'weight', 'height']].notna().all(axis=1)
raw_data = raw_data[has_body_measurements]
raw_data

In [None]:
# Histogram over age, one colour per gender, restricted to rows where age,
# gender, hospital_death and bmi are all present.
# Because a y column is supplied, each bar aggregates hospital_death within its
# age bin (plotly's default histfunc for a given y is 'sum', i.e. number of deaths).
# marginal="box" adds a box plot per gender above the main panel ("violin" and
# "rug" are the alternatives); hovering shows the four selected columns.
hist_frame = raw_data[['age', 'gender', 'hospital_death', 'bmi']].dropna()
fig = px.histogram(
    hist_frame,
    x="age",
    y="hospital_death",
    color="gender",
    marginal="box",
    hover_data=hist_frame.columns,
)
fig.show()

In [None]:
from plotly.subplots import make_subplots

# Mean of hospital_death (i.e. the observed death rate) per age, computed
# separately for female and male patients. reset_index() turns the 'age'
# group key back into a regular column.
age_death_F = (
    raw_data.loc[raw_data['gender'] == 'F', ['age', 'hospital_death']]
    .groupby('age').mean().reset_index()
)
age_death_M = (
    raw_data.loc[raw_data['gender'] == 'M', ['age', 'hospital_death']]
    .groupby('age').mean().reset_index()
)

# One line per gender on a shared set of axes.
fig = make_subplots()
for trace_name, rates in (("Female patients", age_death_F),
                          ("Male patients", age_death_M)):
    fig.add_trace(
        go.Scatter(x=rates['age'], y=rates['hospital_death'], name=trace_name))

# Bold tags are now properly closed (the original used a second "<b>" instead of "</b>").
fig.update_layout(
    title_text="<b>Average hospital death probability of patients</b>")
fig.update_xaxes(title_text="<b>patient age</b>")
# secondary_y=False selects the primary y-axis.
fig.update_yaxes(title_text="<b>Average Hospital Death</b>", secondary_y=False)
fig.show()

In [None]:
# Work on an explicit copy, rounded to whole numbers so that patients can be
# grouped by integer weight / BMI. The original assigned into a slice of
# raw_data (chained assignment, SettingWithCopyWarning).
weight_df = raw_data[['weight', 'hospital_death', 'bmi']].copy()
weight_df['weight'] = weight_df['weight'].round(0)
weight_df['bmi'] = weight_df['bmi'].round(0)

# Mean hospital_death (death rate) per rounded weight and per rounded BMI.
weight_death = weight_df[['weight', 'hospital_death']].groupby('weight').mean().reset_index()
bmi_death = weight_df[['bmi', 'hospital_death']].groupby('bmi').mean().reset_index()

fig = make_subplots(rows=1, cols=1, shared_yaxes=True)
fig.add_trace(
    go.Scatter(x=weight_death['weight'], y=weight_death['hospital_death'], name="Weight"),
)
fig.add_trace(
    go.Scatter(x=bmi_death['bmi'], y=bmi_death['hospital_death'], name="BMI"),
)
# Bold tags are closed here; the original left them open.
fig.update_layout(
    title_text="<b>impacts of BMI and weight over patients</b>"
)
# Both traces share one x axis, so label it for both quantities.
fig.update_xaxes(title_text="<b>Weight / BMI (rounded)</b>")
fig.update_yaxes(title_text="<b>Average Hospital Death</b>")
fig.show()

In [None]:
# Explicit copy so the relabelling below does not write into a slice of raw_data.
ICU_type = raw_data[['icu_type', 'age', 'hospital_death']].copy()

# Consolidate similar ICU types under a common label.
# The original dict listed 'CTICU' twice (the later entry silently won) and
# mapped to 'CCT-CTICU', which looks like a typo for the 'CCU-CTICU' label used
# in its first entry; with the typo the merged rows formed a separate category
# instead of joining 'CCU-CTICU'.
ICU_type['icu_type'] = ICU_type['icu_type'].replace({
    'CTICU': 'CCU-CTICU',
    'Cardiac ICU': 'CCU-CTICU',
    'CSICU': 'SICU',
})

# For every (icu_type, age) pair: mean hospital_death (death rate) and the
# number of patients behind that mean, computed in a single aggregation.
ICU_df = (
    ICU_type.groupby(['icu_type', 'age'])['hospital_death']
    .agg(hospital_death='mean', count='count')
    .reset_index()
)

# Bubble chart: x = age, y = death rate, bubble size = patient count (capped at
# size_max), colour / hover label = ICU type.
fig = px.scatter(ICU_df, x="age", y="hospital_death", size="count", color="icu_type",
                 hover_name="icu_type", log_x=False, size_max=60)
# The y axis shows the average of hospital_death, so the title refers to death
# rate rather than survival; bold tags are closed.
fig.update_layout(
    title_text="<b>Hospital death rate at different types of ICU</b>"
)
fig.update_yaxes(title_text="<b>Average Hospital Death</b>")
fig.update_xaxes(title_text="<b>Age</b>")
fig.show()

In [None]:
# Hospital death rate by age for each APACHE III body system, shown as an
# interactive scatter plot with a dropdown that switches between body systems.

# One row per (body system, age) with
#   'size' - number of patients in the group
#   'mean' - mean of hospital_death in the group (observed death rate)
# Aggregating the single 'hospital_death' column yields flat 'size' / 'mean'
# columns directly, instead of the MultiIndex columns produced by calling
# .agg([...]) on the whole frame.
apache3 = (
    raw_data[['age', 'apache_3j_bodysystem', 'hospital_death']]
    .groupby(['apache_3j_bodysystem', 'age'])['hospital_death']
    .agg(['size', 'mean'])
    .reset_index()
)

# Body systems in order of first appearance; one trace and one dropdown entry each.
systems = list(apache3['apache_3j_bodysystem'].unique())

data = []
list_updatemenus = []
for n, s in enumerate(systems):
    # Dropdown entry: make only trace n visible and set the title to the system name.
    visible = [False] * len(systems)
    visible[n] = True
    list_updatemenus.append(dict(label=str(s),
                                 method='update',
                                 args=[{'visible': visible},
                                       {'title': '<b>' + str(s) + '</b>'}]))

    # Trace for this system: marker size is the group's share of the system's
    # patients (scaled by 1000), marker colour is the death rate.
    mask = (apache3['apache_3j_bodysystem'].values == s)
    group_sizes = apache3.loc[mask, 'size']
    data.append(dict(visible=(n == 0),  # only the first system is shown initially
                     x=apache3.loc[mask, 'age'],
                     y=apache3.loc[mask, 'mean'],
                     mode='markers',
                     marker={'size': group_sizes / group_sizes.sum() * 1000,
                             'color': apache3.loc[mask, 'mean'],
                             'showscale': True}))

# Fixed axis ranges so the view does not jump when switching systems.
# The y axis is the mean of hospital_death, so the initial title names the death
# rate (the original said "Survival Rate"); bold tags are closed.
layout = dict(updatemenus=[dict(buttons=list_updatemenus)],
              xaxis=dict(title='<b>Age</b>',
                         range=[apache3['age'].min() - 10, apache3['age'].max() + 10]),
              yaxis=dict(title='<b>Average Hospital Death</b>',
                         range=[apache3['mean'].min() - 0.1, apache3['mean'].max() + 0.1]),
              title='<b>Hospital Death Rate</b>')
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='update_dropdown')

In [None]:
# Columns that are later cast to nullable integers ('Int64') and imputed with
# their most frequent value.
numerical_cat = [
 'elective_surgery',
 'apache_post_operative',
 'arf_apache',
 'gcs_unable_apache',
 'intubated_apache',
 'ventilated_apache',
 'aids',
 'cirrhosis',
 'diabetes_mellitus',
 'hepatic_failure',
 'immunosuppression',
 'leukemia',
 'lymphoma',
 'solid_tumor_with_metastasis']

# Columns that are later one-hot encoded with pd.get_dummies.
categorical = ['ethnicity',
 'gender',
 'icu_type',
 'apache_3j_bodysystem',
 'apache_2_bodysystem']

In [None]:
# This line of code identifies the columns in the DataFrame 'raw_data' that have only two unique values.
raw_data.nunique()[raw_data.nunique() == 2].index.tolist()

In [None]:
raw_data.select_dtypes(include='O').columns.values.tolist()

In [None]:
# Everything that is a flag column, a categorical column or the target is
# treated as "not numeric"; the remaining columns are the continuous features.
not_numeric = numerical_cat + categorical + ['hospital_death']
numeric_only = raw_data.drop(columns=not_numeric).columns.tolist()
numeric_only

In [None]:
# Cast each flag column to nullable integer, then fill its gaps with the
# column's most frequent value.
for col in numerical_cat:
    as_nullable_int = raw_data[col].astype('Int64')
    raw_data[col] = as_nullable_int.fillna(as_nullable_int.mode()[0])

In [None]:
# Missing-value count per numeric column, sorted ascending and computed once.
numeric_missing = raw_data[numeric_only].isna().sum(axis=0).sort_values()

# split_one: columns with fewer than 11000 missing values.
# split_two: columns with 11000 or more missing values (mean-imputed later).
# Using >= makes the two lists a complete partition; with the original strict
# '<' / '>' a column with exactly 11000 missing values landed in neither list.
split_one = numeric_missing[numeric_missing < 11000].index.tolist()
split_two = numeric_missing[numeric_missing >= 11000].index.tolist()

In [None]:
split_two

In [None]:
# Replace missing values in every split_two column with that column's mean.
# NOTE(review): the means are computed on the full data set, before the
# train/test split - confirm this is acceptable.
raw_data = raw_data.fillna({col: raw_data[col].mean() for col in split_two})

# Rows that still contain any missing value are discarded.
process_data = raw_data.dropna(axis=0)

In [None]:
process_data[categorical].nunique()

# using one-hot encoder because of large range of unique values in categories

In [None]:
# One-hot encode the categorical features: each listed column is replaced by one
# indicator column per category.
#
# prefix is left at its default, so every indicator is named
# "<original column>_<category>". The original used the single prefix 'isin' for
# all columns, so identical category names coming from different features (for
# example the two APACHE body-system columns) produced identically named
# columns, which a later step then silently removed as "duplicates".
#
# drop_first=False (which is also the pandas default) keeps an indicator for
# every category. dtype='uint8' keeps the indicators numeric on pandas versions
# whose default indicator dtype is bool.
icu_data = pd.get_dummies(process_data,
    prefix_sep='_',
    columns=categorical,
    drop_first=False,
    dtype='uint8')

# Replace the index inherited from process_data with a fresh 0..n-1 range.
icu_data = icu_data.reset_index(drop=True)
icu_data

In [None]:
# Lower-case every column name.
icu_data.columns = icu_data.columns.str.lower()

# If several columns now share a name, keep only the first occurrence of each.
is_repeated_name = icu_data.columns.duplicated()
icu_data = icu_data.loc[:, ~is_repeated_name]

In [None]:
# dtype of 'arf_apache', used as the reference for "columns of this dtype".
t = icu_data['arf_apache'].dtype

# Convert every column that is either uint8 or has the same dtype as
# 'arf_apache' to plain int; all other columns are left untouched.
for col in tqdm(icu_data.columns.tolist()):
    col_dtype = icu_data[col].values.dtype
    if not (col_dtype == 'uint8' or t == col_dtype):
        continue
    icu_data[col] = icu_data[col].astype(int)

In [None]:
icu_data.dtypes

In [None]:
# Target is hospital_death; every other column is a feature.
y = icu_data['hospital_death']
X = icu_data.drop(columns=['hospital_death'])

# 70/30 split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.30,
    random_state=11,
)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
def modelling(X_train, y_train, X_test, y_test, **kwargs):
    """Fit the selected baseline classifiers and score them on the test set.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices. The XGBoost model is fitted on the numeric columns
        only (``_get_numeric_data()``); the other models use all columns.
    y_train, y_test : array-like
        Binary target values.
    **kwargs
        Model switches. A model is trained when its key is present and truthy:
        ``xgb`` (XGBClassifier), ``rf`` (RandomForestClassifier with 200
        trees), ``extree`` (ExtraTreesClassifier).

    Returns
    -------
    dict
        ``{model_key: [accuracy, roc_auc]}`` for every model that was trained;
        empty when no switch is set.
    """
    def _fit_and_score(model, train_features, test_features):
        # Fit one model and return [accuracy, ROC AUC] on the test features.
        model.fit(train_features, np.ravel(y_train, order='C'))
        predicted_labels = model.predict(test_features)
        # ROC AUC is a ranking metric and needs continuous scores; computing it
        # from hard 0/1 labels (as before) collapses the curve to one threshold.
        positive_scores = model.predict_proba(test_features)[:, 1]
        return [accuracy_score(y_test, predicted_labels),
                roc_auc_score(y_test, positive_scores)]

    scores = {}

    if kwargs.get('xgb'):
        scores['xgb'] = _fit_and_score(XGBClassifier(),
                                       X_train._get_numeric_data(),
                                       X_test._get_numeric_data())

    if kwargs.get('rf'):
        scores['rf'] = _fit_and_score(RandomForestClassifier(n_estimators=200),
                                      X_train, X_test)

    if kwargs.get('extree'):
        scores['extree'] = _fit_and_score(ExtraTreesClassifier(),
                                          X_train, X_test)

    return scores

In [None]:
modelling(X_train,y_train, X_test, y_test, xgb=True, rf=True, extree=True)

In [None]:
def model_performance(model, y_test, y_hat, y_score=None):
    """Display a 2x2 Plotly performance report for a binary classifier.

    Panels: confusion matrix, bar chart of accuracy / precision / recall / F1,
    ROC curve (with the chance diagonal) and precision-recall curve.

    Parameters
    ----------
    model : object
        The trained model; only ``str(model)`` is used, in the report title.
    y_test : array-like
        True labels, encoded as 0 / 1.
    y_hat : array-like
        Predicted hard labels (0 / 1). Used for the confusion matrix and the
        metrics derived from it.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, the ROC AUC, ROC curve and
        precision-recall curve are computed from it. When omitted they are
        computed from ``y_hat`` as before, which gives single-threshold curves.

    Returns
    -------
    None
        The figure is rendered with ``py.iplot``.
    """
    from plotly.subplots import make_subplots

    def _ratio(numerator, denominator):
        # Safe division: an undefined metric (empty denominator) is reported as 0.
        return numerator / denominator if denominator else 0.0

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
    conf_matrix = confusion_matrix(y_test, y_hat, labels=[0, 1])
    trace1 = go.Heatmap(z=conf_matrix, x=["0 (pred)", "1 (pred)"],
                        y=["0 (true)", "1 (true)"], xgap=2, ygap=2,
                        colorscale='Viridis', showscale=False)

    # Confusion-matrix cells (rows = true class, columns = predicted class).
    tn, fp = conf_matrix[0, 0], conf_matrix[0, 1]
    fn, tp = conf_matrix[1, 0], conf_matrix[1, 1]

    # Accuracy  = (tp + tn) / all
    # Precision = tp / (tp + fp)
    # Recall    = tp / (tp + fn)
    # F1        = harmonic mean of precision and recall
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision_value = _ratio(tp, tp + fp)
    recall_value = _ratio(tp, tp + fn)
    f1_value = _ratio(2 * precision_value * recall_value,
                      precision_value + recall_value)

    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1_score']
    metric_values = np.array([accuracy, precision_value, recall_value, f1_value],
                             dtype=float)
    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue']

    # Horizontal bars labelled with the value rounded to 4 decimals.
    # np.round replaces np.round_, which no longer exists in NumPy 2.
    trace2 = go.Bar(x=metric_values, y=metric_names,
                    text=np.round(metric_values, 4),
                    textposition='auto', orientation='h', opacity=0.8,
                    marker=dict(color=colors,
                                line=dict(color='#000000', width=1.5)))

    # Scores used for the threshold-based curves.
    curve_scores = y_hat if y_score is None else y_score

    # ROC curve: false positive rate on x, true positive rate on y.
    model_roc_auc = round(roc_auc_score(y_test, curve_scores), 3)
    fpr, tpr, _ = roc_curve(y_test, curve_scores)
    trace3 = go.Scatter(x=fpr, y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color='rgb(22, 96, 167)', width=2),
                        fill='tozeroy')
    # Diagonal = performance of a random classifier.
    trace4 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color='black', width=1.5, dash='dot'))

    # Precision-recall curve. The trace name is a plain label; the original
    # concatenated the whole precision array into the name.
    precision, recall, _ = precision_recall_curve(y_test, curve_scores)
    trace5 = go.Scatter(x=recall, y=precision,
                        name="Precision-Recall",
                        line=dict(color='lightcoral', width=2),
                        fill='tozeroy')

    # 2x2 grid; plotly.subplots.make_subplots supersedes plotly.tools.make_subplots.
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=('Confusion Matrix',
                                        'Metrics',
                                        'ROC curve' + " " + '(' + str(model_roc_auc) + ')',
                                        'Precision - Recall curve'))

    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=1, col=2)
    fig.add_trace(trace3, row=2, col=1)
    fig.add_trace(trace4, row=2, col=1)
    fig.add_trace(trace5, row=2, col=2)

    # Title font size is set through title.font (layout.titlefont is deprecated).
    fig.update_layout(showlegend=False,
                      title=dict(text='<b>Model performance report</b><br>' + str(model),
                                 font=dict(size=14)),
                      autosize=False, height=1500, width=830,
                      plot_bgcolor='rgba(240,240,240, 0.95)',
                      paper_bgcolor='rgba(240,240,240, 0.95)',
                      margin=dict(b=195))

    py.iplot(fig)

## Parameter Tuning


### Parameter tuning, also known as hyperparameter optimization, is crucial in machine learning for several reasons:

###  1. Performance Optimization: 
#####     - Machine learning models have hyperparameters that control their learning process and complexity. 
#####     - Proper tuning finds the optimal combination of these hyperparameters, leading to significant improvements in model performance (accuracy, precision, recall, etc.).

###  2. Generalization: 
#####    - Overly complex models can overfit the training data, performing well on seen data but poorly on new, unseen data. 
#####   - Parameter tuning helps find the right balance between model complexity and generalization, ensuring the model performs well on both training and unseen data.

###  3. Efficiency: 
#####     - Some hyperparameters affect the computational cost of training and prediction. 
#####     - Tuning can help find efficient settings that reduce training time and resource usage without sacrificing performance.

###  4. Robustness: 
#####    - Different datasets and problem domains may require different hyperparameter settings. 
#####     - Tuning ensures the model is robust and adaptable to various scenarios.

#####  In the provided code, GridSearchCV is used for parameter tuning:

##### - It systematically explores a defined hyperparameter grid (params).
#####  - It evaluates model performance using cross-validation (gkf) and the chosen metric (AUC).
#####  - It identifies the best hyperparameter combination (gsearch.best_params_) that maximizes performance.

#####  By investing time in parameter tuning, you can unlock the full potential of your machine learning models, achieving better accuracy, generalization, and efficiency.


In [None]:
# 3-fold cross-validation, shuffled with a fixed seed. The splitter object
# itself is passed to GridSearchCV; the original passed the one-shot generator
# returned by .split(), which cannot be reused once consumed.
gkf = KFold(n_splits=3, shuffle=True, random_state=42)

# Early stopping needs an evaluation set. It is carved out of the training data
# so that the held-out test set never influences tuning (the original monitored
# X_test / y_test here, which leaks test information into model selection).
X_tune, X_val, y_tune, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Arguments forwarded to XGBClassifier.fit for every candidate:
#   eval_set - data monitored for early stopping
#   verbose  - print the evaluation metric every 100 boosting rounds
fit_params_of_xgb = {
    "eval_set": [(X_val, y_val)],
    'verbose': 100,
}

# Hyperparameter grid. Only n_estimators varies (100, 200, 300, 400); every
# other parameter is pinned to a single value.
params = {
    'booster': ["gbtree"],
    'learning_rate': [0.1],
    'n_estimators': range(100, 500, 100),
    'min_child_weight': [1],
    'gamma': [0],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'max_depth': [5],
    "scale_pos_weight": [1]
}

# eval_metric and early_stopping_rounds are estimator parameters; recent XGBoost
# releases no longer accept them as fit() arguments.
# NOTE(review): setting them in the constructor requires xgboost >= 1.6.
xgb_estimator = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    early_stopping_rounds=100,
)

gsearch = GridSearchCV(
    estimator=xgb_estimator,  # model to tune
    param_grid=params,        # grid to search over
    scoring='roc_auc',        # candidates are ranked by cross-validated ROC AUC
    n_jobs=-1,                # use all available CPU cores
    cv=gkf                    # KFold splitter defined above
)

xgb_model = gsearch.fit(X=X_tune, y=y_tune, **fit_params_of_xgb)
(gsearch.best_params_, gsearch.best_score_)

In [None]:
# Hand-picked hyperparameters for the final XGBoost model.
# NOTE(review): these values are hard-coded and are not taken from
# gsearch.best_params_ - confirm that is intended.
xgb_tuned_params = dict(
    n_estimators=3000,
    objective='binary:logistic',
    booster="gbtree",
    learning_rate=0.01,
    scale_pos_weight=1,
    max_depth=4,
    min_child_weight=6,
    gamma=0,
    subsample=0.4,
    colsample_bytree=0.8,
    reg_alpha=0.08,
    n_jobs=-1,
)
xgb_tuned = XGBClassifier(**xgb_tuned_params)

# Fit on the numeric training columns and predict hard labels for the test set.
train_features = X_train._get_numeric_data()
train_target = np.ravel(y_train, order='C')
xgb_tuned.fit(train_features, train_target)

test_features = X_test._get_numeric_data()
y__hat = xgb_tuned.predict(test_features)

In [None]:
model_performance(xgb_tuned,y_test, y__hat)

In [None]:
y__hat[21:30]

In [None]:
# Random subset of the test set used for the SHAP explanations.
# A fixed random_state makes the sample (and every plot built on it)
# reproducible; capping n avoids a ValueError when the test set has fewer than
# 2500 rows.
X_test_sample = X_test.sample(n=min(2500, len(X_test)), random_state=11)
X_test_sample

###### SHAP is a game theoretic approach to explain the output of any machine learning model.
###### SHAP can be applied to any machine learning model, regardless of its complexity or structure.
###### It provides individual explanations for each prediction, highlighting the contribution of each feature to that specific prediction.
###### By aggregating local explanations, SHAP can provide insights into the overall behavior of the model and identify important features.
###### SHAP values can be used to determine the relative importance of different features in the model's predictions.
###### SHAP offers various visualization tools to help interpret the model's behavior and understand the impact of individual features.


In [None]:
import shap

In [None]:
# Load SHAP's JavaScript visualisation code into the notebook so that the
# interactive force plots in the cells below can render.
shap.initjs()

In [None]:
# Compute SHAP values for the sampled test rows.
# A TreeExplainer is built around the trained model `xgb_tuned` and applied to
# `X_test_sample`. The result holds one SHAP value per (row, feature) pair:
# the contribution of that feature to the model's output for that row.
# The plotting cells that follow all read from `shap_values`.
shap_values = (
    shap.TreeExplainer(xgb_tuned)
    .shap_values(X_test_sample)
)

In [None]:
# Global feature importance as a bar chart, built from the SHAP values of the
# sampled test rows.
shap.summary_plot(shap_values, X_test_sample, plot_type="bar")

In [None]:
# Default summary plot: per-feature distribution of SHAP values across the
# sampled rows, drawn together with the corresponding feature values.
shap.summary_plot(shap_values, X_test_sample)

In [None]:
# Force plots: explain individual predictions.
# Jupyter only auto-renders the LAST expression of a cell, so the first plot
# would be silently discarded if it were left as a bare expression. Both plots
# are therefore passed to display() (provided by the IPython kernel).
force_explainer = shap.TreeExplainer(xgb_tuned)

# Explanation for the first sampled row.
display(
    shap.force_plot(
        force_explainer.expected_value,
        shap_values[0, :],
        X_test_sample.iloc[0, :],
    )
)

# Stacked explanation for the first 257 sampled rows.
display(
    shap.force_plot(
        force_explainer.expected_value,
        shap_values[0:257, :],
        X_test_sample.iloc[0:257, :],
    )
)

In [None]:
# Predicted class and force plot for sampled row 50.
# y__hat holds predictions for X_test in its original row order, whereas
# X_test_sample is a random draw from X_test, so y__hat[50] belongs to a
# different row than the one explained here. Predict on the sampled row itself
# (numeric columns only, matching how the model was fitted).
print(xgb_tuned.predict(X_test_sample.iloc[[50]]._get_numeric_data())[0])
shap.force_plot(shap.TreeExplainer(xgb_tuned).expected_value, shap_values[50], X_test_sample.iloc[50])

In [None]:
# Predicted class and force plot for sampled row 21.
# y__hat holds predictions for X_test in its original row order, whereas
# X_test_sample is a random draw from X_test, so y__hat[21] belongs to a
# different row than the one explained here. Predict on the sampled row itself
# (numeric columns only, matching how the model was fitted).
print(xgb_tuned.predict(X_test_sample.iloc[[21]]._get_numeric_data())[0])
shap.force_plot(shap.TreeExplainer(xgb_tuned).expected_value, shap_values[21], X_test_sample.iloc[21])

In [None]:
# Predicted class and force plot for sampled row 23.
# y__hat holds predictions for X_test in its original row order, whereas
# X_test_sample is a random draw from X_test, so y__hat[23] belongs to a
# different row than the one explained here. Predict on the sampled row itself
# (numeric columns only, matching how the model was fitted).
print(xgb_tuned.predict(X_test_sample.iloc[[23]]._get_numeric_data())[0])
shap.force_plot(shap.TreeExplainer(xgb_tuned).expected_value, shap_values[23], X_test_sample.iloc[23])

In [None]:
# Predicted class and force plot for sampled row 60.
# y__hat holds predictions for X_test in its original row order, whereas
# X_test_sample is a random draw from X_test, so y__hat[60] belongs to a
# different row than the one explained here. Predict on the sampled row itself
# (numeric columns only, matching how the model was fitted).
print(xgb_tuned.predict(X_test_sample.iloc[[60]]._get_numeric_data())[0])
shap.force_plot(shap.TreeExplainer(xgb_tuned).expected_value, shap_values[60], X_test_sample.iloc[60])

In [None]:
# Dependence plot: how the SHAP value of one feature varies with that feature's
# value across the sampled rows. Here the feature is 'apache_4a_icu_death_prob'.
shap.dependence_plot('apache_4a_icu_death_prob', shap_values, X_test_sample)

In [None]:
# List the columns of the SHAP sample whose dtype is not numeric.
non_numeric_cols = (
    X_test_sample
    .select_dtypes(exclude=['number'])
    .columns
)
print(non_numeric_cols)

# If one of these columns actually holds numbers stored as strings, it can be
# converted with pd.to_numeric, for example:
# X_test_sample['column_name'] = pd.to_numeric(X_test_sample['column_name'])

In [None]:
# Coerce object-typed columns of the SHAP sample to a numeric dtype.
# errors='coerce' replaces every value that cannot be parsed as a number with
# NaN instead of raising, so the number of newly created NaNs is reported to
# make that data loss visible.
for col in X_test_sample.select_dtypes(include=['object']).columns:
    missing_before = X_test_sample[col].isna().sum()
    try:
        X_test_sample[col] = pd.to_numeric(X_test_sample[col], errors='coerce')
    except (TypeError, ValueError):
        # Narrow handler: a bare `except:` would also swallow KeyboardInterrupt
        # and hide unrelated bugs.
        print(f"Could not convert column {col} to numeric.")
        continue
    newly_missing = X_test_sample[col].isna().sum() - missing_before
    if newly_missing:
        print(f"Column {col}: {newly_missing} value(s) could not be parsed and were set to NaN.")

# NOTE(review): no imputation is performed here. The original cell ended with a
# no-op self-assignment under a comment promising a median fill; NaNs are left
# as they are. Add an explicit fillna step if imputation is actually wanted.

In [None]:
# Decision plot for a slice of the sampled test rows.
# A decision plot draws one line per instance: it starts at the explainer's
# base (expected) value and shifts as each feature's SHAP value is added, which
# shows which features moved a prediction, in which direction, and by how much.
# NOTE(review): this cell rebinds `shap_values` to the 20-row slice computed
# below, replacing the full-sample array from the earlier cell. Re-run that
# cell before re-running the summary, force or dependence plots.
explainer = shap.TreeExplainer(xgb_tuned)

# SHAP values for the sampled rows at positions 110 through 129 only.
shap_values = explainer.shap_values(X_test_sample.iloc[110:130])

shap.decision_plot(
    explainer.expected_value,                       # base value where every path starts
    shap_values,                                    # per-feature contributions for the slice
    feature_names=X_test_sample.columns.tolist(),   # labels shown on the feature axis
)