In [1]:
# Initial imports.
from collections import Counter
from pathlib import Path
from urllib.parse import quote_plus

import matplotlib.pyplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from scipy import stats
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from config import db_password

In [2]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [3]:
# Build the PostgreSQL connection URL for the local Patient_database.
# The password is percent-encoded so that characters such as "@", ":" or "/"
# are not parsed as URL delimiters.
# NOTE(review): assumes `db_password` in config.py is stored unescaped — confirm.
db_string = f"postgresql://postgres:{quote_plus(db_password)}@127.0.0.1:5432/Patient_database"

In [4]:
# Create the SQLAlchemy engine (not a Session) from the connection URL.
engine = create_engine(db_string)

In [5]:
# Reflect the existing database schema into an automap base class.
Base = automap_base()
# Reflect the tables through the engine.
# NOTE(review): `reflect=True` is deprecated as of SQLAlchemy 1.4 in favor of
# `Base.prepare(autoload_with=engine)` — confirm the installed version before changing.
Base.prepare(engine, reflect=True)

In [6]:
# Read every row and column of the `patient` table into a DataFrame.
patient_query = 'Select * From patient'
test_df = pd.read_sql_query(patient_query, con=engine)

In [7]:
# Preview the first ten rows of the frame read from the database.

test_df.head(10)


Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,icu_admit_source,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154.0,25312.0,118.0,False,68,22.73,False,2,1,1,...,False,False,True,False,False,False,False,False,9,0
1,114252.0,59342.0,81.0,False,77,27.42,False,2,0,1,...,False,False,True,False,False,False,False,False,8,6
2,119783.0,50777.0,118.0,False,25,31.95,False,2,0,0,...,False,False,False,False,False,False,False,False,5,3
3,79267.0,46918.0,118.0,False,81,22.64,True,2,0,2,...,False,False,False,False,False,False,False,False,0,0
4,33181.0,74489.0,83.0,False,67,27.56,False,2,1,0,...,False,False,True,False,False,False,False,False,7,4
5,82208.0,49526.0,83.0,False,59,57.45,False,2,0,0,...,False,False,True,False,False,False,False,False,8,6
6,42871.0,90749.0,118.0,False,50,25.71,False,6,1,0,...,False,False,False,False,False,False,False,False,0,0
7,105427.0,125898.0,77.0,False,72,28.257052,True,3,0,2,...,False,False,False,False,True,False,False,False,8,6
8,108388.0,98174.0,118.0,False,81,38.189067,True,2,1,2,...,False,False,False,False,False,False,False,False,8,6
9,22471.0,112115.0,118.0,False,46,25.845717,False,3,1,0,...,False,False,False,False,False,False,False,False,8,6


In [8]:
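## Legacy CSV loading path, kept for reference only; the data is now read from PostgreSQL into `test_df`.
##file_path = Path("C:/Users/emili/Class/Patient_Survival_Prediction/Machine_Learning/Resources/database_patient.csv")
##df_database_patient = pd.read_csv(file_path)
##df_database_patient.head()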

In [9]:
# Remove the three ID columns from the frame loaded from PostgreSQL.
# The CSV load that used to define `df_database_patient` is commented out, so
# the frame is derived from `test_df`; building a new frame from `test_df`
# also keeps this cell safe to re-run.
# NOTE(review): the preview of `test_df` lists an `Unnamed: 0` column — if that
# is a real column (e.g. a leftover CSV index) rather than the display index,
# it should be dropped here as well. Confirm against the table schema.
df_database_patient = test_df.drop(columns=['patient_id', 'hospital_id', 'encounter_id'])

In [None]:
# Feature matrix: every column except the `hospital_death` target.
X = df_database_patient.drop(columns='hospital_death')

# Preview the first rows of the features.
X.head()

In [None]:
# Summary statistics of the features. `describe` must be called; without the
# parentheses the cell only shows the bound-method repr.
X.describe()

In [None]:
# Target vector: the `hospital_death` column.
y = df_database_patient.loc[:, 'hospital_death']

# Class distribution of the target.
y.value_counts()

In [None]:
# Split into training and testing sets, stratified on the target and with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [None]:
# Print the class counts of the training targets, then of the testing targets.
for split_labels in (y_train, y_test):
    print(Counter(split_labels))

In [None]:
# Instantiate the balanced random forest classifier with a fixed seed.
brfc = BalancedRandomForestClassifier(
    n_estimators=500,
    random_state=1,
)

# Fit on the training split.
brfc.fit(X_train, y_train)

In [None]:
# Predict on the test split and score the predictions with balanced accuracy.
y_pred = brfc.predict(X_test)
RandomForest_accuracy_database = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

RandomForest_accuracy_database

In [None]:
# Compute the confusion matrix for the test predictions.
cm = confusion_matrix(y_test, y_pred)

# Wrap it in a DataFrame with labelled rows (actual) and columns (predicted).
actual_labels = ["Actual Survival", "Actual Death"]
predicted_labels = ["Predicted Survival", "Predicted Death"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

In [None]:
# Build the imbalanced classification report for the test predictions and print it.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

In [None]:
# Pair each importance score with its column name, sort descending, and print
# one "name: (score)" line per feature.
features_rank = sorted(zip(brfc.feature_importances_, X.columns), reverse=True)
for importance, column in features_rank:
    print(f"{column}: ({importance})")

In [None]:
# Horizontal bar chart of feature importances, sorted ascending so the
# smallest importance is drawn at the bottom.
feature_importance = brfc.feature_importances_
sorted_idx = np.argsort(feature_importance)

# `fit` holds the Figure handle (name kept as in the original cell).
fit = plt.figure(figsize=(50,35))
bar_positions = range(len(sorted_idx))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, np.array(X_test.columns)[sorted_idx])
plt.title('Feature Importance All Vitals')