In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.model_selection import cross_val_score

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


In [3]:
path = '/home/daisy/FDA_Dataset/inpatient_all_final_1.csv'
df1 = pd.read_csv(path).iloc[:,1:]
df1.drop(columns = ['Veteran flag',
                    'Event date','Marital status', 'Marital status encoded',
                    'State','Ruca category'], inplace=True)


path = '/home/daisy/FDA_Dataset/inpatient_all_final_2.csv'
df2 = pd.read_csv(path).iloc[:,1:]
df2.drop(columns = ['Veteran flag',
                    'Event date','Marital status', 'Marital status encoded',
                    'State','Ruca category'], inplace=True)

In [4]:
df1.shape

(84536, 76)

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84536 entries, 0 to 84535
Data columns (total 76 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Internalpatientid             84536 non-null  int64  
 1   num_stays                     84536 non-null  int64  
 2   stay_length                   84536 non-null  float64
 3   num_unique_units              84536 non-null  int64  
 4   num_transfers                 84536 non-null  int64  
 5   num_cvd_readmission           84536 non-null  int64  
 6   Readmission                   84536 non-null  int64  
 7   Died                          84536 non-null  int64  
 8   AO                            84536 non-null  int64  
 9   CVD                           84536 non-null  int64  
 10  unique_admitting_specialty    84536 non-null  int64  
 11  unique_discharging_specialty  84536 non-null  int64  
 12  DOMICILIARY                   84536 non-null  int64  
 13  M

### Train test split


In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84536 entries, 0 to 84535
Data columns (total 76 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Internalpatientid             84536 non-null  int64  
 1   num_stays                     84536 non-null  int64  
 2   stay_length                   84536 non-null  float64
 3   num_unique_units              84536 non-null  int64  
 4   num_transfers                 84536 non-null  int64  
 5   num_cvd_readmission           84536 non-null  int64  
 6   Readmission                   84536 non-null  int64  
 7   Died                          84536 non-null  int64  
 8   AO                            84536 non-null  int64  
 9   CVD                           84536 non-null  int64  
 10  unique_admitting_specialty    84536 non-null  int64  
 11  unique_discharging_specialty  84536 non-null  int64  
 12  DOMICILIARY                   84536 non-null  int64  
 13  M

In [7]:
X_admission = df1.drop(columns = ['Readmission'])
Y_admission = df1[['Readmission']]

X_mortality = df1.drop(columns = ['Died'])
Y_mortality = df1[['Died']]

In [8]:
X_train_ad, X_test_ad, y_train_ad, y_test_ad = train_test_split(X_admission, Y_admission, test_size=0.20, random_state=42)
X_train_mor, X_test_mor, y_train_mor, y_test_mor = train_test_split(X_mortality, Y_mortality, test_size=0.20, random_state=42)


#### Filling missing values

In [9]:
df1.columns[df1.isna().any()].tolist()

['total_procedure',
 'num_surgery_pro',
 'num_immunization',
 'Num med per admission mean',
 'Num med per admission min',
 'Num med per admission max',
 'Total medications',
 'mean age at specailty',
 'period mean',
 'period std',
 'specialty medical count',
 'specialty support count',
 'specialty count',
 'Age 20-40 hypotension',
 'Age 40-60 hypotension',
 'Age 60-80 hypotension',
 'Age 80-100 hypotension',
 'Age 100-120 hypotension',
 'Age 20-40 hypertension',
 'Age 40-60 hypertension',
 'Age 60-80 hypertension',
 'Age 80-100 hypertension',
 'Age 100-120 hypertension',
 'Age 20-40 healthy',
 'Age 40-60 healthy',
 'Age 60-80 healthy',
 'Age 80-100 healthy',
 'Age 100-120 healthy',
 'lab_count',
 'lab_freq',
 'lab_age_mean',
 'lab_age_std']

In [10]:
# Filling continuous features with mean values
numeric_cols = df1.columns[df1.isna().any()].tolist()

X_train_ad[numeric_cols] = X_train_ad[numeric_cols].fillna(X_train_ad[numeric_cols].mean())
X_train_mor[numeric_cols] =  X_train_mor[numeric_cols].fillna(X_train_mor[numeric_cols].mean())

X_test_ad[numeric_cols] = X_test_ad[numeric_cols].fillna(X_test_ad[numeric_cols].mean())
X_test_mor[numeric_cols] = X_test_mor[numeric_cols].fillna(X_test_mor[numeric_cols].mean())

In [11]:
X_train_ad.isna().any().sum()

0

#### Descision Tree

In [12]:
lr = DecisionTreeClassifier()
# Training
lr.fit(X_train_ad, y_train_ad)

# Prediction
lr_prediction = lr.predict(X_test_ad)

In [13]:
print(classification_report(y_test_ad, lr_prediction, target_names= ['Not Readmitted', 'Readmitted']))


                precision    recall  f1-score   support

Not Readmitted       1.00      1.00      1.00      3316
    Readmitted       1.00      1.00      1.00     13592

      accuracy                           1.00     16908
     macro avg       1.00      1.00      1.00     16908
  weighted avg       1.00      1.00      1.00     16908



In [14]:
lr = DecisionTreeClassifier()
# Training
lr.fit(X_train_mor, y_train_mor)

# Prediction
lr_prediction = lr.predict(X_test_mor)

In [15]:
print(classification_report(y_test_mor, lr_prediction, target_names= ['Not Died', 'Died']))


              precision    recall  f1-score   support

    Not Died       0.86      0.85      0.86     13744
        Died       0.39      0.42      0.41      3164

    accuracy                           0.77     16908
   macro avg       0.63      0.64      0.63     16908
weighted avg       0.78      0.77      0.77     16908



In [16]:
y_train_ad['Readmission'].unique()

array([1, 0])

In [17]:
import statsmodels.api as sm
logit_model=sm.Logit(y_train_mor,X_train_mor)
result=logit_model.fit()
print(result.summary2()) 

Optimization terminated successfully.
         Current function value: 0.371450
         Iterations 11
                                       Results: Logit
Model:                       Logit                     Pseudo R-squared:          0.224     
Dependent Variable:          Died                      AIC:                       50378.8745
Date:                        2023-07-18 12:41          BIC:                       51008.2771
No. Observations:            67628                     Log-Likelihood:            -25120.   
Df Model:                    68                        LL-Null:                   -32361.   
Df Residuals:                67559                     LLR p-value:               0.0000    
Converged:                   1.0000                    Scale:                     1.0000    
No. Iterations:              11.0000                                                        
--------------------------------------------------------------------------------------------
      