In [1]:
import torch
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("CUDA is available. Using GPU.")
else:
    device = torch.device('cpu')
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU.


In [3]:
all_data = pd.read_csv('all_data_final.csv', index_col=0)

## Preprocessing

In [4]:
all_data['HOSP_DISCH_TIME'] = pd.to_datetime(all_data['HOSP_DISCH_TIME'])
all_data['HOSP_ADMSN_TIME'] = pd.to_datetime(all_data['HOSP_ADMSN_TIME'])
all_data['TIME_IN_HOSP'] = (all_data['HOSP_DISCH_TIME'] - all_data['HOSP_ADMSN_TIME']).dt.total_seconds() / 3600
all_data['MORTALITY'] = all_data['DISCH_DISP'].isin(['Expired', 'Coroner']).map({True: 'yes', False: 'no'})
all_data.rename(columns={'BIRTH_DATE':'AGE',
                         'HOURS':'HOURS_MAP_FLAG',
                         'Element_abbr':'POST_OP_COMPLICATION'}
                , inplace=True)
all_data

Unnamed: 0,MRN,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,HOURS_MAP_FLAG,LOG_ID,ICU_ADMIN_FLAG,SURGERY_DATE,DISCH_DISP_C,DISCH_DISP,AGE,SEX,PRIMARY_PROCEDURE_NM,ASA_RATING_C,ASA_RATING,diagnosis_code,dx_name,POST_OP_COMPLICATION,TIME_IN_HOSP,MORTALITY
0,00018b4d9acb258b,2020-07-25 13:33:00,2020-08-25 16:15:00,0.000000,189a1d9df810fe26,Yes,8/3/20 0:00,6.0,Skilled Nursing Facility,80,Male,"BIOPSY, MUSCLE",4.0,Incapacitating Disease,307.9|nan|786.05|786.09|790.4,"Agitation|Impaired functional mobility, balanc...",AN Post-op Complications,746.700000,no
1,000db5bb4b440912,2021-10-21 07:53:00,2021-10-28 15:33:00,0.000000,21d28a9add40b0c4,Yes,10/23/21 0:00,20.0,Home Healthcare IP Admit Related,77,Male,CABG (CORONARY ARTERY BYPASS GRAFT),4.0,Incapacitating Disease,794.39,Abnormal stress test,AN Post-op Complications,175.666667,no
2,000f8e3ae0b5aa91,2020-05-27 05:58:00,2020-05-27 16:00:00,0.000000,80afb4c2735257cd,Yes,5/27/20 0:00,3.0,Expired,36,Male,"LAPAROSCOPY, EXPLORATORY",5.0,Moribund,289.59,Closed splenic rupture,AN Post-op Complications,10.033333,yes
3,001353f9e953c8cd,2020-09-18 09:20:00,2020-09-25 16:29:00,0.000000,fc47d3637476c87e,Yes,9/20/20 0:00,15.0,Home Routine,78,Female,CABG (CORONARY ARTERY BYPASS GRAFT),4.0,Incapacitating Disease,794.39|414.00,Abnormal stress echo|Coronary artery disease,AN Post-op Complications,175.150000,no
4,0043f49c66d1a29a,2020-04-12 11:04:00,2020-05-01 15:11:00,0.000000,558d26139a0cdfc0,Yes,4/22/20 0:00,15.0,Home Routine,64,Male,"HIPEC - CHEMOTHERAPY, INTRAPERITONEAL, HYPERTH...",3.0,Severe Systemic Disease,,"Malignant neoplasm of stomach, unspecified loc...",AN Post-op Complications,460.116667,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154,ff6f438387a2f822,2018-11-06 15:54:00,2018-11-22 15:40:00,0.535438,2158919443f9f462,Yes,11/13/18 0:00,15.0,Home Routine,60,Male,"REPLACEMENT, AORTIC VALVE",4.0,Incapacitating Disease,428.0|428.9,Acute on chronic congestive heart failure (CMS...,AN Post-op Complications,383.766667,no
2155,ffa83cc8c6472255,2022-06-25 09:44:00,2022-07-28 23:35:00,0.000000,73be2a55e2002335,Yes,6/26/22 0:00,30.0,Long Term Care Facility,57,Male,"CRANIOTOMY, FOR ANEURYSM OR AVM CLIPPING, WITH...",3.0,Severe Systemic Disease,435.9|437.3|nan,Cerebral vasospasm|Aneurysm of middle cerebral...,AN Post-op Complications,805.850000,no
2156,ffd4085849ae2443,2019-04-30 16:17:00,2019-05-11 16:44:00,0.000000,5f6e3e156c0aa0d3,Yes,5/8/19 0:00,6.0,Skilled Nursing Facility,63,Male,"ENDARTERECTOMY, CAROTID",3.0,Severe Systemic Disease,437.0|434.91|272.4|433.10|443.9|444.22|790.29|...,Intracranial atherosclerosis|Acute ischemic ri...,AN Post-op Complications,264.450000,no
2157,ffe4b7d7fa95554d,2021-06-10 05:11:00,2021-07-15 20:40:00,0.366460,e8a53b58e0a341a1,Yes,6/10/21 0:00,3.0,Expired,85,Female,HEMIGLOSSECTOMY,3.0,Severe Systemic Disease,427.31|nan|511.9|518.81|276.4|141.9|284.19|584.9,Paroxysmal atrial fibrillation (CMS-HCC)|Acute...,AN Post-op Complications,855.483333,yes


In [17]:
all_data['DISCH_DISP'].value_counts()

DISCH_DISP
Home Routine                                 481
Home Healthcare IP Admit Related             448
Skilled Nursing Facility                     348
Expired                                      254
Hospice Facility                             178
Long Term Care Facility                      110
Acute Care Facility (not this hospital)       99
Rehab Facility (not this hospital)            68
Rehab Facility (this hospital)                58
Sub-Acute Care Facility                       38
Hospice Home                                  25
Federal Hospital                              15
Against Medical Advice                         7
Board and Care                                 5
Home Healthcare Outside 3 Days                 4
Other Healthcare Not Defined in this List      4
Recuperative Care                              3
Jail/Prison                                    3
Psychiatric Facility (this hospital)           3
Acute Care Facility (this hospital)            3
Psychiatr

In [15]:
all_data['Element_abbr'].value_counts(dropna=False)

Element_abbr
AN Post-op Complications                                              1967
AN Post-op Complications|Cardiovascular                                 38
AN Post-op Complications|Other                                          37
AN Post-op Complications|Respiratory                                    21
Cardiovascular|AN Post-op Complications                                 14
AN Post-op Complications|Airway                                          8
AN Post-op Complications|Metabolic                                       7
AN Post-op Complications|Neurological                                    6
Respiratory|AN Post-op Complications                                     5
AN Post-op Complications|Cardiovascular|Respiratory                      5
Airway|AN Post-op Complications                                          5
AN Post-op Complications|Medication                                      4
Other|AN Post-op Complications                                           4
AN Post-op C

In [18]:
all_data['ASA_RATING'].value_counts()

ASA_RATING
Incapacitating Disease                            1007
Severe Systemic Disease                            613
Moribund                                           147
Mild Systemic Disease                               51
Brain Dead                                           9
Healthy                                              4
Severe Systemic Disease|Incapacitating Disease       2
Incapacitating Disease|Moribund                      1
Mild Systemic Disease|Incapacitating Disease         1
Name: count, dtype: int64

In [29]:
all_data['dx_name'].value_counts()

dx_name
CAD (coronary artery disease)|Coronary artery disease, angina presence unspecified, unspecified vessel or lesion type, unspecified whether native or transplanted heart                      16
Coronary artery disease                                                                                                                                                                      16
NSTEMI (non-ST elevated myocardial infarction) (CMS-HCC)                                                                                                                                     15
Coronary artery disease, angina presence unspecified, unspecified vessel or lesion type, unspecified whether native or transplanted heart|CAD (coronary artery disease)                      14
ESRD (end stage renal disease) (CMS-HCC)                                                                                                                                                      7
                                

In [5]:
le = LabelEncoder()
all_data['SEX'] = le.fit_transform(all_data['SEX'])
all_data['POST_OP_COMPLICATION'] = le.fit_transform(all_data['POST_OP_COMPLICATION'])
all_data['MORTALITY'] = le.fit_transform(all_data['MORTALITY'])
all_data

Unnamed: 0,MRN,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,HOURS_MAP_FLAG,LOG_ID,ICU_ADMIN_FLAG,SURGERY_DATE,DISCH_DISP_C,DISCH_DISP,AGE,SEX,PRIMARY_PROCEDURE_NM,ASA_RATING_C,ASA_RATING,diagnosis_code,dx_name,POST_OP_COMPLICATION,TIME_IN_HOSP,MORTALITY
0,00018b4d9acb258b,2020-07-25 13:33:00,2020-08-25 16:15:00,0.000000,189a1d9df810fe26,Yes,8/3/20 0:00,6.0,Skilled Nursing Facility,80,1,"BIOPSY, MUSCLE",4.0,Incapacitating Disease,307.9|nan|786.05|786.09|790.4,"Agitation|Impaired functional mobility, balanc...",0,746.700000,0
1,000db5bb4b440912,2021-10-21 07:53:00,2021-10-28 15:33:00,0.000000,21d28a9add40b0c4,Yes,10/23/21 0:00,20.0,Home Healthcare IP Admit Related,77,1,CABG (CORONARY ARTERY BYPASS GRAFT),4.0,Incapacitating Disease,794.39,Abnormal stress test,0,175.666667,0
2,000f8e3ae0b5aa91,2020-05-27 05:58:00,2020-05-27 16:00:00,0.000000,80afb4c2735257cd,Yes,5/27/20 0:00,3.0,Expired,36,1,"LAPAROSCOPY, EXPLORATORY",5.0,Moribund,289.59,Closed splenic rupture,0,10.033333,1
3,001353f9e953c8cd,2020-09-18 09:20:00,2020-09-25 16:29:00,0.000000,fc47d3637476c87e,Yes,9/20/20 0:00,15.0,Home Routine,78,0,CABG (CORONARY ARTERY BYPASS GRAFT),4.0,Incapacitating Disease,794.39|414.00,Abnormal stress echo|Coronary artery disease,0,175.150000,0
4,0043f49c66d1a29a,2020-04-12 11:04:00,2020-05-01 15:11:00,0.000000,558d26139a0cdfc0,Yes,4/22/20 0:00,15.0,Home Routine,64,1,"HIPEC - CHEMOTHERAPY, INTRAPERITONEAL, HYPERTH...",3.0,Severe Systemic Disease,,"Malignant neoplasm of stomach, unspecified loc...",0,460.116667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154,ff6f438387a2f822,2018-11-06 15:54:00,2018-11-22 15:40:00,0.535438,2158919443f9f462,Yes,11/13/18 0:00,15.0,Home Routine,60,1,"REPLACEMENT, AORTIC VALVE",4.0,Incapacitating Disease,428.0|428.9,Acute on chronic congestive heart failure (CMS...,0,383.766667,0
2155,ffa83cc8c6472255,2022-06-25 09:44:00,2022-07-28 23:35:00,0.000000,73be2a55e2002335,Yes,6/26/22 0:00,30.0,Long Term Care Facility,57,1,"CRANIOTOMY, FOR ANEURYSM OR AVM CLIPPING, WITH...",3.0,Severe Systemic Disease,435.9|437.3|nan,Cerebral vasospasm|Aneurysm of middle cerebral...,0,805.850000,0
2156,ffd4085849ae2443,2019-04-30 16:17:00,2019-05-11 16:44:00,0.000000,5f6e3e156c0aa0d3,Yes,5/8/19 0:00,6.0,Skilled Nursing Facility,63,1,"ENDARTERECTOMY, CAROTID",3.0,Severe Systemic Disease,437.0|434.91|272.4|433.10|443.9|444.22|790.29|...,Intracranial atherosclerosis|Acute ischemic ri...,0,264.450000,0
2157,ffe4b7d7fa95554d,2021-06-10 05:11:00,2021-07-15 20:40:00,0.366460,e8a53b58e0a341a1,Yes,6/10/21 0:00,3.0,Expired,85,0,HEMIGLOSSECTOMY,3.0,Severe Systemic Disease,427.31|nan|511.9|518.81|276.4|141.9|284.19|584.9,Paroxysmal atrial fibrillation (CMS-HCC)|Acute...,0,855.483333,1


## Model

In [6]:
def split_and_average(value):
    if isinstance(value, str):
        float_list = list(map(float, value.split('|'))) 
        return sum(float_list) / len(float_list) 
    elif pd.isnull(value):
        return value
    else:
        return value 
    
def filter_rows(value):
    if isinstance(value, str):  # Check if the value is a string
        num_list = value.split('|')
        # Only keep the row if the num_list has exactly 1 entry
        return len(num_list) == 1
    return True

In [10]:
predictors = ['HOURS_MAP_FLAG', 'DISCH_DISP_C', 'AGE', 'SEX', 'ASA_RATING_C', 'TIME_IN_HOSP']
outcomes = ['POST_OP_COMPLICATION', 'MORTALITY']
X = all_data[predictors]
X['DISCH_DISP_C'] = X['DISCH_DISP_C'].apply(split_and_average)
X['ASA_RATING_C'] = X['ASA_RATING_C'].apply(split_and_average)
X['AGE'] = X['AGE'].apply(split_and_average)
#X.applymap(split_and_average)
y = all_data[outcomes]
#y.applymap(split_and_average)

X_clean = X.dropna()

# Similarly, drop the corresponding rows in the outcome (y) if necessary
y_clean = y.loc[X_clean.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['DISCH_DISP_C'] = X['DISCH_DISP_C'].apply(split_and_average)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ASA_RATING_C'] = X['ASA_RATING_C'].apply(split_and_average)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['AGE'] = X['AGE'].apply(split_and_average)


In [27]:
# Lasso regression
lasso = Lasso()
param_grid = {'alpha': np.logspace(-4, 0, 50)}

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Grid search
grid_search = GridSearchCV(lasso, param_grid, cv=kf, scoring='neg_mean_squared_error')

grid_search.fit(X_clean, y_clean)
best_alpha = grid_search.best_params_['alpha']
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_clean)
mse = mean_squared_error(y_clean, predictions)
print(f"Mean Squared Error: {mse}\n")

for i, output in enumerate(y_clean.columns):
    print(f"Coefficients for {output}:")
    for feature, coef in zip(X_clean.columns, best_model.coef_[i]):
        print(f"{feature}: {coef}")
    print('\n')

Mean Squared Error: 21.124378622943556

Coefficients for POST_OP_COMPLICATION:
HOURS_MAP_FLAG: 0.0
DISCH_DISP_C: -0.005744473764814079
AGE: -0.018055655252363655
SEX: 0.0
ASA_RATING_C: -0.0
TIME_IN_HOSP: 4.821423654231934e-06


Coefficients for MORTALITY:
HOURS_MAP_FLAG: -0.0
DISCH_DISP_C: -0.003928903183292066
AGE: 0.0
SEX: -0.0
ASA_RATING_C: 0.0
TIME_IN_HOSP: -1.9987232047582037e-05




In [22]:
X_clean.columns

Index(['HOURS_MAP_FLAG', 'DISCH_DISP_C', 'AGE', 'SEX', 'ASA_RATING_C',
       'TIME_IN_HOSP'],
      dtype='object')