In [None]:
import pandas as pd 
import numpy as np 
import xgboost as xgb
import matplotlib.pyplot as plt 
#matplotlib.style.use('ggplot') 
from numpy import mean 
from numpy import sort 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import GridSearchCV, KFold
from sklearn import preprocessing 
from xgboost import XGBClassifier 
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from xgboost import plot_importance
from xgboost import cv 
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectFromModel 
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from matplotlib import pyplot
import shap

In [None]:
# Target dtype per column, applied when the CSV is loaded.
# The mapping is assembled from two ordered lists so the int/float grouping
# is visible at a glance; key order matches the lists (ints first, then floats).
_INT_COLUMNS = [
    'AGE_NEONATE', 'AWEEKEND', 'DIED', 'ELECTIVE', 'FEMALE', 'HOSPBRTH',
    'NEOMAT', 'ORPROC', 'HCUP_ED', 'Cancer', 'Cardio_Circulatory', 'Pneumo',
    'Gastro', 'Renal', 'Genital', 'Pregnancy', 'Bones_Joints_Chronic',
    'Congenital', 'Myoskeletal_Acute', 'Emergency', 'NIS_STRATUM',
    'ZIPINC_QRTL', 'PAY1', 'PL_NCHS', 'RACE', 'DISPUNIFORM', 'TRAN_IN',
    'TRAN_OUT', 'AMONTH', 'HOSP_DIVISION', 'HOSP_NIS',
]
_FLOAT_COLUMNS = ['AGE', 'LOS', 'NCHRONIC', 'NDX', 'NECODE', 'NPR', 'TOTCHG']

dtype = {
    **dict.fromkeys(_INT_COLUMNS, int),
    **dict.fromkeys(_FLOAT_COLUMNS, float),
}

In [None]:
# Load the semicolon-delimited extract; a field holding a single space is
# read as missing.
df = pd.read_csv('NIS 2013XGBoost_withoutNaN.csv', delimiter=';', na_values=' ')

# Normalise header whitespace BEFORE any column is referenced by name, so the
# drop and the dtype casts below cannot miss a padded header.
df.columns = df.columns.str.strip()

# Build the cleaned frame in one chain rather than mutating in place:
#   1. discard the KEY_NIS column,
#   2. drop every row that still contains a missing value,
#   3. cast each column listed in `dtype` to its target type (a column missing
#      from the frame raises KeyError, as the per-column loop did).
df = (
    df.drop(columns='KEY_NIS')
    .dropna()
    .astype(dtype)
)

In [None]:
# Remove leading/trailing whitespace from every column label.
df.columns = [label.strip() for label in df.columns]

In [None]:
#df.fillna('0', inplace=True)


In [None]:
# Show the column labels of the loaded frame as a quick sanity check.
print(df.columns)


In [None]:
# Remove rows whose DIED value is missing (in place).
# NOTE(review): the load cell already calls df.dropna() across all columns and
# casts DIED to int, so this looks like a no-op at this point — confirm before
# removing. The commented-out lines are earlier alternatives left disabled.
#df['DIED'] = pd.to_numeric(df['DIED'], errors='coerce').fillna(0).astype('Int64')
df.dropna(subset=['DIED'], inplace=True)
#df['DIED'] = df['DIED'].astype('int64')

In [None]:
# Print the dtype of every column without truncation.
# The display limits are lifted only inside this `with` block: setting them
# globally would make every later cell dump its full output as well (for
# example the TOTCHG value counts, one row per distinct value).
print("Columns and Data Types:")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

In [None]:
# Count how many rows share each distinct TOTCHG value; the bare name on the
# last line makes the notebook display the resulting Series.
value_distribution = df['TOTCHG'].value_counts()
value_distribution

In [None]:
# Binary target: 1 when a row's TOTCHG lies strictly above the 75th-percentile
# value of TOTCHG, 0 otherwise.
fourth_quartile = df['TOTCHG'].quantile(q=0.75)
is_above_cutoff = df['TOTCHG'].gt(fourth_quartile)
df['y'] = is_above_cutoff.astype(int)

# Display the resulting class balance.
df['y'].value_counts()

In [None]:
# Drop TOTCHG (the target `y` was derived from it) and DIED, then keep only
# columns that are neither object nor categorical.
df = (
    df.drop(columns=['TOTCHG', 'DIED'])
    .select_dtypes(exclude=['object', 'category'])
)

# Separate the target from the feature matrix.
y = df['y']
X = df.drop(columns='y')

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)


In [None]:
# Display the dtype of the training target (y_train is a single column taken
# from the frame, so it carries exactly one dtype).
y_train.dtype

In [None]:
# Display the dtype of every training feature column.
X_train.dtypes

In [None]:
# Hyperparameter grid for the search. Every entry lists a single value, so the
# grid contains exactly one candidate configuration.
paramGrid = dict(
    learning_rate=[0.05],
    n_estimators=[400],
    max_depth=[6],
    subsample=[0.8],
    colsample_bylevel=[0.8],
    colsample_bytree=[0.8],
    gamma=[0.2],
)

In [None]:
# Base estimator handed to the grid search; seeded for repeatable training.
model = XGBClassifier(objective="binary:logistic", use_label_encoder=False, random_state=42)

# 5-fold, shuffled, stratified cross-validation.
#  - random_state makes the shuffled fold assignment (and therefore the CV
#    scores) reproducible from run to run; without it each run reshuffles.
#  - Stratification keeps the positive/negative ratio of the target the same
#    in every fold.
# NOTE: the name `cv` rebinds the `cv` function imported from xgboost in the
# imports cell; it is kept because the grid-search cell refers to `cv`.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Cross-validated search over `paramGrid`, ranking candidates by ROC-AUC.
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=paramGrid,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
)

# Run the search on the training split; the value returned by .fit() is kept
# under the name `fit` for the cells that follow.
fit = gridsearch.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
best_cv_score = fit.best_score_
best_cv_params = fit.best_params_
print("Best ROC AUC score:", best_cv_score)
print("Best parameters:", best_cv_params)

In [None]:
# Evaluate the best estimator found by the search on the held-out test split.
best_model = fit.best_estimator_

# Hard class labels feed the threshold-dependent metrics below.
y_pred = best_model.predict(X_test)
# ROC-AUC is a ranking metric and must be computed from continuous scores, not
# from 0/1 labels; column 1 of predict_proba is the score for the positive
# class. This also makes the test figure comparable with the 'roc_auc' score
# used during cross-validation.
y_proba = best_model.predict_proba(X_test)[:, 1]

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [None]:
# Print each test-set metric on its own line, rounded to four decimals.
metric_report = {
    'F1 Score': f1,
    'ROC-AUC': roc_auc,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
}
for metric_name, metric_value in metric_report.items():
    print(f'{metric_name}: {metric_value:.4f}')

In [None]:
# Draw XGBoost's feature-importance chart for the selected model and render it.
# `plot_importance` and `pyplot` are the names brought in by the imports cell.
plot_importance(best_model)
pyplot.show()

In [None]:
# Build a SHAP tree explainer around the selected model.
explainer = shap.TreeExplainer(best_model)

In [None]:
# Compute SHAP values for the held-out test rows.
# NOTE(review): presumably one attribution per (row, feature) — confirm the
# returned shape for the installed shap version.
shap_values = explainer.shap_values(X_test)

# Summary plot of the SHAP values against the corresponding test features.
shap.summary_plot(shap_values, X_test)


In [None]:
# One SHAP dependence plot per feature column, with no interaction colouring
# (interaction_index=None).
for feature_name in X.columns:
    shap.dependence_plot(
        feature_name,
        shap_values,
        X_test,
        interaction_index=None,
    )