## Importing Libraries

In [1]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler

import joblib
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

## Loading Primary Dataset

In [2]:
# Load the admissions CSV.
admission_df = pd.read_csv('assets/admissions_202208161605.csv')

# Missing marital status is imputed with the fixed value 'MARRIED'.
admission_df['marital_status'] = admission_df['marital_status'].fillna('MARRIED')

# Parse every timestamp column used below.
for time_col in ['edregtime', 'edouttime', 'admittime', 'dischtime']:
    admission_df[time_col] = pd.to_datetime(admission_df[time_col])

# Minutes between ED registration and ED exit; 0 when either timestamp is missing.
ed_stay = admission_df['edouttime'] - admission_df['edregtime']
admission_df['emergency_time'] = (ed_stay.dt.total_seconds() / 60).fillna(0)

# Order each patient's admissions chronologically so shift(-1) yields the *next* one.
admission_df = admission_df.sort_values(by=['subject_id', 'admittime'])

# next_admittime / next_dischtime: timestamps of the same patient's following admission
# (missing for a patient's last admission).
for source_col in ['admittime', 'dischtime']:
    admission_df[f'next_{source_col}'] = admission_df.groupby('subject_id')[source_col].shift(-1)

# Days from this discharge to the patient's next admission.
gap_to_next = admission_df['next_admittime'] - admission_df['dischtime']
admission_df['time_diff'] = gap_to_next.dt.total_seconds() / 86400

# Target flag: 1 when the next admission starts at most 30 days after discharge, else 0.
# A missing time_diff compares as False and therefore maps to 0.
admission_df['time_diff_flag'] = (admission_df['time_diff'] <= 30).astype('int64')

In [3]:
# Keep the join keys, the admission time, the demographic fields,
# the ED duration and the target flag.
selected_columns = [
    'subject_id', 'hadm_id', 'admittime',
    'religion', 'marital_status', 'ethnicity',
    'emergency_time', 'time_diff_flag',
]
final_admission_df = admission_df.loc[:, selected_columns]

# Preview the first rows of the selection
final_admission_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,religion,marital_status,ethnicity,emergency_time,time_diff_flag
212,2,163353,2138-07-17 19:04:00,NOT SPECIFIED,MARRIED,ASIAN,0.0,0
213,3,145834,2101-10-20 19:08:00,CATHOLIC,MARRIED,WHITE,135.0,0
214,4,185777,2191-03-16 00:28:00,PROTESTANT QUAKER,SINGLE,WHITE,720.0,0
215,5,178980,2103-02-02 04:31:00,BUDDHIST,MARRIED,ASIAN,0.0,0
216,6,107064,2175-05-30 07:15:00,NOT SPECIFIED,MARRIED,WHITE,0.0,0


## Joining with Additional Columns

In [4]:
procedures_df = pd.read_csv('assets/procedures_icd_202208161605.csv')

# ICD-9 codes that define the cohort kept for modelling.
# NOTE(review): these look like heart-failure diagnosis codes (428.x and related) being
# matched against a procedures file — confirm this is the intended source table.
# NOTE(review): the codes are integers, so this assumes read_csv parsed `icd9_code`
# as a numeric column; string codes would never match — TODO confirm.
COHORT_ICD9_CODES = [
    39891, 40201, 40211, 40291, 40401, 40403, 40411, 40413, 40491, 40493,
    4280, 4281, 42820, 42821, 42822, 42823, 42830, 42831, 42832, 42833,
    42840, 42841, 42842, 42843, 4289,
]

code_matches = procedures_df['icd9_code'].isin(COHORT_ICD9_CODES)
filtered_procedures_df = procedures_df[code_matches]

# Distinct admissions that carry at least one of the cohort codes.
hadm_ids = filtered_procedures_df['hadm_id'].unique()

# Restrict the modelling frame to those admissions.
in_cohort = final_admission_df['hadm_id'].isin(hadm_ids)
final_admission_df = final_admission_df[in_cohort]

In [5]:
# Load patients, parse the date of birth and keep only the columns needed for the join.
patients_df = (
    pd.read_csv('assets/patients_202208161605.csv')
    .assign(dob=lambda frame: pd.to_datetime(frame['dob']))
    .loc[:, ['subject_id', 'gender', 'dob']]
)

# Attach gender and date of birth to every admission row.
final_admission_df = final_admission_df.merge(patients_df, on='subject_id', how='left')

# Age is the difference of calendar years (admission year minus birth year),
# so it can be off by one relative to the exact age on the admission date.
# NOTE(review): the preview below shows an age of 300 — presumably an artefact of
# date shifting for very old patients in the source data; confirm and consider capping.
admit_year = final_admission_df['admittime'].dt.year
birth_year = final_admission_df['dob'].dt.year
final_admission_df['age'] = admit_year - birth_year

In [6]:
# NOTE: everything in this cell is commented out, so running it has no effect.
# The disabled code would read the DRG codes file, fill missing drg_severity with 0,
# left-join drg_type / drg_code / drg_severity onto final_admission_df by hadm_id,
# and sort the result by hadm_id.
# drugs_df = pd.read_csv('assets/drgcodes_202208161605.csv')

# drugs_df = drugs_df[['hadm_id','drg_type','drg_code','drg_severity']]
# drugs_df['drg_severity'] = drugs_df['drg_severity'].apply(lambda x: 0 if pd.isna(x) else x)

# final_admission_df = pd.merge(final_admission_df, drugs_df, on='hadm_id', how='left')
# final_admission_df.sort_values(by=['hadm_id'], inplace=True)

# final_admission_df

In [7]:
# The join keys and raw timestamps were only needed to build the features above;
# remove them so that only model inputs and the target flag remain.
helper_columns = ['subject_id', 'hadm_id', 'admittime', 'dob']
final_admission_df = final_admission_df.drop(columns=helper_columns)
final_admission_df.head()

Unnamed: 0,religion,marital_status,ethnicity,emergency_time,time_diff_flag,gender,age
0,NOT SPECIFIED,SINGLE,WHITE,38.0,0,M,44
1,BUDDHIST,MARRIED,ASIAN,0.0,0,M,66
2,UNOBTAINABLE,MARRIED,UNKNOWN/NOT SPECIFIED,0.0,0,F,300
3,UNOBTAINABLE,MARRIED,WHITE,80.0,0,F,74
4,CATHOLIC,SINGLE,WHITE,0.0,0,M,58


## Generating Report

In [8]:
# Build a profiling report of the modelling frame and render it inline in the notebook.
profile = ProfileReport(
    final_admission_df,
    title="Admission Data Profiling Report",
    explorative=True,
)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## Data Transformation

In [9]:
# Min-max scale the two numeric features into [0, 1].
# The same scaler object is refit for each column, so after this loop it only
# holds the fit for the last column ('age').
scaler = MinMaxScaler()
for numeric_col in ['emergency_time', 'age']:
    final_admission_df[numeric_col] = scaler.fit_transform(final_admission_df[[numeric_col]])

# One-hot encode the categorical features (one indicator column per observed category).
categorical_cols = ['religion', 'marital_status', 'ethnicity', 'gender']
final_admission_df = pd.get_dummies(final_admission_df, columns=categorical_cols)
final_admission_df.head()

Unnamed: 0,emergency_time,time_diff_flag,age,religion_BUDDHIST,religion_CATHOLIC,religion_EPISCOPALIAN,religion_GREEK ORTHODOX,religion_JEWISH,religion_NOT SPECIFIED,religion_OTHER,...,marital_status_WIDOWED,ethnicity_ASIAN,ethnicity_ASIAN - CHINESE,ethnicity_BLACK/AFRICAN AMERICAN,ethnicity_HISPANIC OR LATINO,ethnicity_UNABLE TO OBTAIN,ethnicity_UNKNOWN/NOT SPECIFIED,ethnicity_WHITE,gender_F,gender_M
0,0.075397,0,0.075812,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,True
1,0.0,0,0.155235,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
2,0.0,0,1.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
3,0.15873,0,0.184116,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
4,0.0,0,0.126354,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True


## Training and Saving the Model

In [10]:
# Target is the 30-day flag; features are every other column.
y = final_admission_df['time_diff_flag']
X = final_admission_df.drop(columns='time_diff_flag')

clf = RandomForestClassifier(random_state=42)

# 5-fold shuffled cross-validation with a fixed seed, scored by accuracy.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=kf, scoring='accuracy')

mean_accuracy = cv_scores.mean()
accuracy_std = cv_scores.std()
print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", accuracy_std)

# Fit the final model on all rows (cross-validation above is for evaluation only)
clf.fit(X, y)

# Persist the fitted model; the prediction cell reloads it via `model_filename`
model_filename = 'model.joblib'
joblib.dump(clf, model_filename)

Mean Accuracy: 0.8555555555555555
Standard Deviation of Accuracy: 0.08810417515085392


['model.joblib']

## Predicting from User Input

In [11]:
# Raw (unscaled, un-encoded) feature values for a single admission to score.
user_input = dict(
    religion='BUDDHIST',
    marital_status='MARRIED',
    ethnicity='ASIAN',
    emergency_time=0.0,
    gender='M',
    age=66,
)

# One-row frame so the same pandas preprocessing can be applied as for training data.
user_df = pd.DataFrame([user_input])

In [12]:
# Feature columns in the order used for training (numeric features first, then the
# one-hot indicator columns produced by get_dummies on the training frame).
required_columns = ['emergency_time', 'age', 'religion_BUDDHIST',
                    'religion_CATHOLIC', 'religion_EPISCOPALIAN', 'religion_GREEK ORTHODOX',
                    'religion_JEWISH', 'religion_NOT SPECIFIED', 'religion_OTHER',
                    'religion_PROTESTANT QUAKER', 'religion_UNOBTAINABLE',
                    'marital_status_DIVORCED', 'marital_status_MARRIED',
                    'marital_status_SINGLE', 'marital_status_WIDOWED', 'ethnicity_ASIAN',
                    'ethnicity_ASIAN - CHINESE', 'ethnicity_BLACK/AFRICAN AMERICAN',
                    'ethnicity_HISPANIC OR LATINO', 'ethnicity_UNABLE TO OBTAIN',
                    'ethnicity_UNKNOWN/NOT SPECIFIED', 'ethnicity_WHITE', 'gender_F',
                    'gender_M']

# The user row must be scaled with the min/max of the TRAINING data. Fitting a new
# MinMaxScaler on the single user row would map every value to 0.0 regardless of input.
# The training frame was scaled in place, so the raw training values are rebuilt here
# from the frames still in scope, using the same definitions as the feature cells:
#   emergency_time -> admission_df rows whose hadm_id is in the cohort (hadm_ids)
#   age            -> admission year minus birth year
train_features_raw = (
    admission_df
    .loc[admission_df['hadm_id'].isin(hadm_ids), ['subject_id', 'admittime', 'emergency_time']]
    .merge(patients_df[['subject_id', 'dob']], on='subject_id', how='left')
)
train_features_raw['age'] = (
    train_features_raw['admittime'].dt.year - train_features_raw['dob'].dt.year
)

for numeric_col in ['emergency_time', 'age']:
    # Fit on training values only, then apply that fixed mapping to the user row.
    feature_scaler = MinMaxScaler().fit(train_features_raw[[numeric_col]])
    user_df[numeric_col] = feature_scaler.transform(user_df[[numeric_col]]).ravel()

user_df = pd.get_dummies(
    user_df, columns=['religion', 'marital_status', 'ethnicity', 'gender'])

# Align to the training layout: indicator columns the user row did not produce are
# added as False, and the columns are put in training order. Indicator columns for
# categories that are not in required_columns are dropped by this step.
user_df = user_df.reindex(columns=required_columns, fill_value=False)

In [13]:
# Reload the model saved by the training cell.
# joblib.load unpickles the file, so only load model files produced by a trusted source.
loaded_model = joblib.load(model_filename)

# Keep the row as a one-row DataFrame instead of a bare array: the model was fitted on
# a DataFrame, so passing named columns lets scikit-learn check that the feature names
# and their order match training (and avoids its missing-feature-names warning).
sample_input = user_df.iloc[[0]]

prediction = loaded_model.predict(sample_input)
print("Prediction for the sample input:", prediction[0])

Prediction for the sample input: 0


