<a href="https://colab.research.google.com/github/TusharSinghal2004/medical_app/blob/main/pulse_patrol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the diagnoses_icd and admissions CSV files
diagnoses = pd.read_csv('DIAGNOSES_ICD.csv')
admissions = pd.read_csv('ADMISSIONS.csv')

# Filter for heart failure using ICD-9 code starting with '428'
heart_failure_diag = diagnoses[diagnoses['icd9_code'].astype(str).str.startswith('428')]

# Merge with admissions to get more patient info
heart_failure_records = pd.merge(heart_failure_diag, admissions, on='hadm_id', how='inner')

# Display a sample of heart failure records
print(heart_failure_records.head())

   row_id_x  subject_id_x  hadm_id  seq_num icd9_code  row_id_y  subject_id_y  \
0    112349         10006   142345        6      4280     12258         10006   
1    112513         10027   199395        1      4280     12278         10027   
2    112544         10029   132349       14      4280     12280         10029   
3    112566         10032   140372        4     42830     12282         10032   
4    112567         10032   140372        5      4280     12282         10032   

             admittime            dischtime deathtime  ... insurance language  \
0  2164-10-23 21:09:00  2164-11-01 17:15:00       NaN  ...  Medicare      NaN   
1  2190-07-13 07:15:00  2190-07-25 14:00:00       NaN  ...  Medicare      NaN   
2  2139-09-22 10:58:00  2139-10-02 14:29:00       NaN  ...  Medicare      NaN   
3  2138-04-02 19:52:00  2138-04-15 14:35:00       NaN  ...  Medicare      NaN   
4  2138-04-02 19:52:00  2138-04-15 14:35:00       NaN  ...  Medicare      NaN   

            religion marit

data cleaining and data validation

In [2]:
import pandas as pd
#Convert Date Columns to Datetime
# Assuming your extracted dataframe is heart_failure_records from your merge code
heart_failure_records['admittime'] = pd.to_datetime(heart_failure_records['admittime'])
heart_failure_records['dischtime'] = pd.to_datetime(heart_failure_records['dischtime'])


In [3]:


# Assuming you already have heart_failure_records from your merge
# Use subject_id_x as the unified patient ID and drop subject_id_y
heart_failure_records['subject_id'] = heart_failure_records['subject_id_x']
heart_failure_records.drop(columns=['subject_id_x', 'row_id_y'], inplace=True, errors='ignore')

# Rename columns if needed (for clarity)
heart_failure_records.rename(columns={'hadm_id': 'hospital_admission_id'}, inplace=True)

# Display a sample of the merged records
print(heart_failure_records.head())


   row_id_x  hospital_admission_id  seq_num icd9_code  subject_id_y  \
0    112349                 142345        6      4280         10006   
1    112513                 199395        1      4280         10027   
2    112544                 132349       14      4280         10029   
3    112566                 140372        4     42830         10032   
4    112567                 140372        5      4280         10032   

            admittime           dischtime deathtime admission_type  \
0 2164-10-23 21:09:00 2164-11-01 17:15:00       NaN      EMERGENCY   
1 2190-07-13 07:15:00 2190-07-25 14:00:00       NaN       ELECTIVE   
2 2139-09-22 10:58:00 2139-10-02 14:29:00       NaN      EMERGENCY   
3 2138-04-02 19:52:00 2138-04-15 14:35:00       NaN      EMERGENCY   
4 2138-04-02 19:52:00 2138-04-15 14:35:00       NaN      EMERGENCY   

          admission_location  ... language           religion marital_status  \
0       EMERGENCY ROOM ADMIT  ...      NaN           CATHOLIC      SEPAR

In [4]:
# Filter out records where dischtime is before admittime
# Validate Date Consistency
heart_failure_records = heart_failure_records[heart_failure_records['dischtime'] >= heart_failure_records['admittime']]
print("After date validation:", heart_failure_records.shape)


After date validation: (61, 22)


handling multiple admissions per patient

In [5]:
# Sort records by patient and admission time
heart_failure_records = heart_failure_records.sort_values(by=['subject_id', 'admittime'])


In [6]:
#computing 30 day readmisson plan
# Initialize the readmission flag
heart_failure_records['readmission'] = 0

# Group by subject_id
grouped = heart_failure_records.groupby('subject_id')

def mark_readmission(df):
    df = df.sort_values(by='admittime').reset_index(drop=True)
    # Loop over all but the last admission
    for i in range(len(df) - 1):
        discharge = df.loc[i, 'dischtime']
        next_admission = df.loc[i + 1, 'admittime']
        # Mark as readmission if next admission occurs within 30 days
        if (next_admission - discharge).days <= 30:
            df.loc[i, 'readmission'] = 1
    return df

heart_failure_records = grouped.apply(mark_readmission).reset_index(drop=True)

# Check sample output
print(heart_failure_records[['subject_id', 'admittime', 'dischtime', 'readmission']].head(10))


   subject_id           admittime           dischtime  readmission
0       10006 2164-10-23 21:09:00 2164-11-01 17:15:00            0
1       10027 2190-07-13 07:15:00 2190-07-25 14:00:00            0
2       10029 2139-09-22 10:58:00 2139-10-02 14:29:00            0
3       10032 2138-04-02 19:52:00 2138-04-15 14:35:00            1
4       10032 2138-04-02 19:52:00 2138-04-15 14:35:00            0
5       10038 2144-02-09 17:53:00 2144-02-21 13:30:00            1
6       10038 2144-02-09 17:53:00 2144-02-21 13:30:00            0
7       10040 2147-02-23 11:43:00 2147-02-27 16:19:00            0
8       10042 2147-02-06 12:38:00 2147-02-17 19:00:00            0
9       10043 2185-04-14 00:23:00 2185-04-26 18:20:00            1


  heart_failure_records = grouped.apply(mark_readmission).reset_index(drop=True)


handling missing values

In [7]:
# Check for missing values in date and ICD-9 code columns
missing_summary = heart_failure_records[['admittime', 'dischtime', 'icd9_code']].isnull().sum()
print("Missing values in key columns:\n", missing_summary)

# Drop rows with missing critical values
heart_failure_records = heart_failure_records.dropna(subset=['admittime', 'dischtime', 'icd9_code'])


Missing values in key columns:
 admittime    0
dischtime    0
icd9_code    0
dtype: int64


In [8]:
#data consistency and integrity
# Check for unique patients and admission counts
unique_patients = heart_failure_records['subject_id'].nunique()
print("Unique patients:", unique_patients)

# Optionally, inspect distribution of readmission flag
print(heart_failure_records['readmission'].value_counts())


Unique patients: 35
readmission
0    37
1    24
Name: count, dtype: int64


feature engineering

In [9]:
import pandas as pd

# 1. Compute Length of Stay (LOS)
heart_failure_records['length_of_stay'] = (heart_failure_records['dischtime'] - heart_failure_records['admittime']).dt.days

# 2. Convert ED times to datetime and calculate ED Wait Time (in hours)
heart_failure_records['edouttime'] = pd.to_datetime(heart_failure_records['edouttime'], errors='coerce')
heart_failure_records['edregtime'] = pd.to_datetime(heart_failure_records['edregtime'], errors='coerce')
heart_failure_records['ed_wait_time'] = (heart_failure_records['edouttime'] - heart_failure_records['edregtime']).dt.total_seconds() / 3600.0

# 3. Extract temporal features from admission time
heart_failure_records['admission_dayofweek'] = heart_failure_records['admittime'].dt.dayofweek
heart_failure_records['admission_hour'] = heart_failure_records['admittime'].dt.hour

# 4. Calculate the number of prior admissions per patient
heart_failure_records['prior_admissions'] = heart_failure_records.groupby('subject_id').cumcount()

# 5. Encode categorical features if they exist
categorical_features = ['admission_type', 'admission_location', 'insurance', 'religion', 'marital_status', 'ethnicity']

# Check which of these features exist in the DataFrame
available_cat_features = [col for col in categorical_features if col in heart_failure_records.columns]
if available_cat_features:
    heart_failure_records = pd.get_dummies(heart_failure_records, columns=available_cat_features, drop_first=True)
else:
    print("None of the specified categorical features are found in the DataFrame.")

# Additionally, encode 'gender' if it exists
if 'gender' in heart_failure_records.columns:
    heart_failure_records = pd.get_dummies(heart_failure_records, columns=['gender'], drop_first=True)

# 6. Handle missing values in engineered features
heart_failure_records['ed_wait_time'] = heart_failure_records['ed_wait_time'].fillna(0)

# Inspect the engineered features
print("Engineered Features Sample:")
print(heart_failure_records[['length_of_stay', 'ed_wait_time', 'admission_dayofweek', 'admission_hour', 'prior_admissions']].head())

# ---------------------------
# Define Features and Target Variable
# ---------------------------

# Define a list of candidate numerical features; include 'age' only if it exists
candidate_num_cols = ['age', 'length_of_stay', 'ed_wait_time', 'admission_dayofweek', 'admission_hour', 'prior_admissions']
num_cols = [col for col in candidate_num_cols if col in heart_failure_records.columns]

# After dummy encoding, find all columns with prefixes indicating encoded categorical features.
dummy_features = [col for col in heart_failure_records.columns if
                  col.startswith('gender_') or col.startswith('admission_type_') or
                  col.startswith('admission_location_') or col.startswith('insurance_') or
                  col.startswith('religion_') or col.startswith('marital_status_') or
                  col.startswith('ethnicity_')]

# Combine numerical and dummy features
features = num_cols + dummy_features

# Define the target variable (assuming 'readmission' has been computed earlier)
target = 'readmission'

# Display the features used and a preview of the data with the target variable
print("\nFeatures used:", features)
print("\nData Sample with Features and Target:")
print(heart_failure_records[features + [target]].head())


Engineered Features Sample:
   length_of_stay  ed_wait_time  admission_dayofweek  admission_hour  \
0               8      6.283333                    1              21   
1              12      0.000000                    1               7   
2              10      5.783333                    1              10   
3              12      5.733333                    2              19   
4              12      5.733333                    2              19   

   prior_admissions  
0                 0  
1                 0  
2                 0  
3                 0  
4                 1  

Features used: ['length_of_stay', 'ed_wait_time', 'admission_dayofweek', 'admission_hour', 'prior_admissions', 'admission_type_EMERGENCY', 'admission_type_URGENT', 'admission_location_PHYS REFERRAL/NORMAL DELI', 'admission_location_TRANSFER FROM HOSP/EXTRAM', 'insurance_Medicare', 'insurance_Private', 'religion_CHRISTIAN SCIENTIST', 'religion_JEWISH', 'religion_NOT SPECIFIED', 'religion_PROTESTANT QUAKE

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
heart_failure_records[num_cols] = scaler.fit_transform(heart_failure_records[num_cols])


In [11]:
from sklearn.model_selection import train_test_split

X = heart_failure_records[features]
y = heart_failure_records[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train set size:", X_train.shape, "Test set size:", X_test.shape)


Train set size: (48, 25) Test set size: (13, 25)


In [12]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
!pip install optuna
import optuna
from sklearn.model_selection import cross_val_score

X = heart_failure_records[features]
y = heart_failure_records[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.2),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5)
    }
    model = xgb.XGBClassifier(eval_metric='logloss', random_state=42, **param)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_trial.params
print("Best Hyperparameters:", best_params)
print("Best CV ROC-AUC:", study.best_trial.value)

best_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42, **best_params)
best_model.fit(X_train, y_train)




[I 2025-03-05 17:35:43,360] A new study created in memory with name: no-name-16a48abf-9f46-4c27-a952-3205e530093a


Train size: (48, 25) Test size: (13, 25)


[I 2025-03-05 17:35:46,460] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 122, 'max_depth': 3, 'learning_rate': 0.07427176317568338, 'subsample': 0.8976953662703333, 'colsample_bytree': 0.9986151440418978, 'gamma': 0.05916885138591721, 'min_child_weight': 4}. Best is trial 0 with value: 0.5.
[I 2025-03-05 17:35:46,678] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 114, 'max_depth': 6, 'learning_rate': 0.0193788583319014, 'subsample': 0.8387270715711438, 'colsample_bytree': 0.8066259091375411, 'gamma': 0.08709477830398962, 'min_child_weight': 5}. Best is trial 0 with value: 0.5.
[I 2025-03-05 17:35:46,885] Trial 2 finished with value: 0.6583333333333333 and parameters: {'n_estimators': 153, 'max_depth': 4, 'learning_rate': 0.051115888059043796, 'subsample': 0.8183927966853917, 'colsample_bytree': 0.9957748635718865, 'gamma': 0.08384129945641794, 'min_child_weight': 2}. Best is trial 2 with value: 0.6583333333333333.
[I 2025-03-05 17:35:47,151] Tri

Best Hyperparameters: {'n_estimators': 128, 'max_depth': 4, 'learning_rate': 0.014458963998181132, 'subsample': 0.825657797739609, 'colsample_bytree': 0.8117564356588899, 'gamma': 0.07777583390542606, 'min_child_weight': 1}
Best CV ROC-AUC: 0.8444444444444444


In [13]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, roc_curve
import numpy as np

y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred_default = best_model.predict(X_test)

auc = roc_auc_score(y_test, y_pred_proba)
f1_default = f1_score(y_test, y_pred_default)
cm_default = confusion_matrix(y_test, y_pred_default)

print("XGBoost AUC:", auc)
print("XGBoost F1 Score:", f1_default)
print("Confusion Matrix:\n", cm_default)

# Determine optimal threshold using Youden's J statistic
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
youden_j = tpr - fpr
best_threshold = thresholds[np.argmax(youden_j)]
print("Best threshold based on Youden's J:", best_threshold)

# Apply the tuned threshold
y_pred_tuned = (y_pred_proba >= best_threshold).astype(int)
f1_tuned = f1_score(y_test, y_pred_tuned)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)

print("New XGBoost F1 Score:", f1_tuned)
print("New Confusion Matrix:\n", cm_tuned)


XGBoost AUC: 0.8571428571428572
XGBoost F1 Score: 0.6
Confusion Matrix:
 [[6 1]
 [3 3]]
Best threshold based on Youden's J: 0.31153336
New XGBoost F1 Score: 0.8571428571428571
New Confusion Matrix:
 [[5 2]
 [0 6]]


In [14]:
print("\nFeatures used:", features)



Features used: ['length_of_stay', 'ed_wait_time', 'admission_dayofweek', 'admission_hour', 'prior_admissions', 'admission_type_EMERGENCY', 'admission_type_URGENT', 'admission_location_PHYS REFERRAL/NORMAL DELI', 'admission_location_TRANSFER FROM HOSP/EXTRAM', 'insurance_Medicare', 'insurance_Private', 'religion_CHRISTIAN SCIENTIST', 'religion_JEWISH', 'religion_NOT SPECIFIED', 'religion_PROTESTANT QUAKER', 'religion_ROMANIAN EAST. ORTH', 'religion_UNOBTAINABLE', 'marital_status_MARRIED', 'marital_status_SEPARATED', 'marital_status_SINGLE', 'marital_status_UNKNOWN (DEFAULT)', 'marital_status_WIDOWED', 'ethnicity_HISPANIC OR LATINO', 'ethnicity_UNKNOWN/NOT SPECIFIED', 'ethnicity_WHITE']


Simulate Real-Time Data Feed

In [15]:
# Sort test data by admission time (if available) to simulate chronological order
test_data = heart_failure_records.loc[X_test.index].sort_values(by='admittime')

# Split test data into 5 batches
batches = np.array_split(test_data, 5)
for i, batch in enumerate(batches):
    X_batch = batch[features]
    batch_predictions = best_model.predict_proba(X_batch)[:, 1]
    batch['predicted_risk'] = batch_predictions
    print(f"\nBatch {i+1} Predictions:")
    print(batch[['subject_id', 'admittime', 'predicted_risk']].head())



Batch 1 Predictions:
    subject_id           admittime  predicted_risk
33       40456 2118-10-06 16:25:00        0.164861
55       43735 2128-11-04 16:05:00        0.747637
13       10056 2129-05-02 00:12:00        0.194172

Batch 2 Predictions:
    subject_id           admittime  predicted_risk
12       10056 2129-05-02 00:12:00        0.331982
46       42033 2131-07-26 17:13:00        0.165099
5        10038 2144-02-09 17:53:00        0.557396

Batch 3 Predictions:
    subject_id           admittime  predicted_risk
40       40655 2144-07-18 19:32:00        0.248458
49       42346 2160-12-16 13:47:00        0.752330
31       40304 2163-11-21 18:34:00        0.222353

Batch 4 Predictions:
    subject_id           admittime  predicted_risk
0        10006 2164-10-23 21:09:00        0.622466
59       43827 2176-07-14 13:24:00        0.311533

Batch 5 Predictions:
    subject_id           admittime  predicted_risk
34       40503 2186-07-06 19:59:00        0.450695
16       10083 2192-11-

  return bound(*args, **kwds)


In [16]:
 # After training your model, assign it to 'model'
model = best_model


In [None]:
!pip install pyngrok
from flask import Flask, request, jsonify
import pickle
import numpy as np
from pyngrok import ngrok

app = Flask(__name__)

# Assume 'model' is already trained and available in memory
# If not, load it from a file if you have saved it.

@app.route('/')
def home():
    return "API is running!"

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    features = np.array(data['features']).reshape(1, -1)
    prediction = model.predict(features)[0]
    prediction_proba = model.predict_proba(features)[0, 1]
    return jsonify({'readmission_risk': int(prediction), 'risk_probability': float(prediction_proba)})

# Set your ngrok authtoken (replace with your actual token)
ngrok.set_auth_token("2tuAJ5MSqphd90cT1U9XXBy3ghb_6dQETMfChq2aotjeu3gNC")

# Expose the Flask app via ngrok on port 5000
public_url = ngrok.connect(5000).public_url
print("API public URL:", public_url)

app.run(port=5000, debug=False, use_reloader=False)


API public URL: https://811f-34-125-163-89.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:40:43] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:42:41] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:42:41] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:42:44] "[33mGET /predi HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:43:05] "[31m[1mGET /predict HTTP/1.1[0m" 405 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:43:06] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:46:36] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:47:26] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 17:47:26] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Mar/2025 18:06:04] "GET / HTTP/1.1" 200 -


In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# Plot feature importance for the best XGBoost model
xgb.plot_importance(best_model, max_num_features=10)
plt.title("Feature Importance - XGBoost")
plt.show()


In [None]:
import shap

# Initialize the SHAP explainer with the best model
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Summary plot: shows feature importance and effects
shap.summary_plot(shap_values, X_test, plot_type="bar")


In [None]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, roc_curve
import numpy as np

y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred_default = best_model.predict(X_test)

auc = roc_auc_score(y_test, y_pred_proba)
f1_default = f1_score(y_test, y_pred_default)
cm_default = confusion_matrix(y_test, y_pred_default)

print("XGBoost AUC:", auc)
print("XGBoost F1 Score:", f1_default)
print("Confusion Matrix:\n", cm_default)

# Determine optimal threshold using Youden's J statistic
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
youden_j = tpr - fpr
best_threshold = thresholds[np.argmax(youden_j)]
print("Best threshold based on Youden's J:", best_threshold)

# Apply the tuned threshold
y_pred_tuned = (y_pred_proba >= best_threshold).astype(int)
f1_tuned = f1_score(y_test, y_pred_tuned)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)

print("New XGBoost F1 Score:", f1_tuned)
print("New Confusion Matrix:\n", cm_tuned)
