In [1]:
#The problem statement goes like this: 
#Create a machine learning model to predict the likelihood of a patient being admitted to the Emergency Room (ER) 
#within the next "N" days, using historical medical and patient data. 
#Keep the code as generic as possible
#Dependent Variable : er_visit  
#Expected Outcome: 
#1. Machine Learning 
#2. Explanations for predictions 
#3. Engineered Features [novel, creative and  expert level]
#4. Documentation on data preparation and feature engineering steps. Please have the code snippet description in detail  followed by the code snippet.
#5. Methodology
# The data set used here is er_model_data.csv.

In [21]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [22]:
# Step 1: Data Preparation and Exploration
# Load the dataset
data = pd.read_csv('er_model_data.csv')

In [23]:
# Explore the data
print(data.head())
print(data.info())



   Unnamed: 0  patient_nbr             race  gender      age weight  \
0           0      8222157        Caucasian  Female   [0-10)    NaN   
1           1     55629189        Caucasian  Female  [10-20)    NaN   
2           2     86047875  AfricanAmerican  Female  [20-30)    NaN   
3           3     82442376        Caucasian    Male  [30-40)    NaN   
4           4     42519267        Caucasian    Male  [40-50)    NaN   

   time_in_hospital payer_code         medical_specialty  num_lab_procedures  \
0                 1        NaN  Pediatrics-Endocrinology                  41   
1                 3        NaN                       NaN                  59   
2                 2        NaN                       NaN                  11   
3                 2        NaN                       NaN                  44   
4                 1        NaN                       NaN                  51   

   ...  examide  citoglipton  insulin  glyburide-metformin  \
0  ...       No           No  

In [13]:
# Handle missing values and data preprocessing
# For simplicity, let's fill missing values in numeric columns with their means
numeric_cols = data.select_dtypes(include=['number']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())


In [25]:
# Handle missing values and data preprocessing
# For columns with missing values, fill with appropriate values (e.g., for categorical, use mode)
data['race'].fillna(data['race'].mode()[0], inplace=True)
data['weight'].fillna('Unknown', inplace=True)
data['payer_code'].fillna(data['payer_code'].mode()[0], inplace=True)
data['medical_specialty'].fillna('Unknown', inplace=True)
data.fillna('Unknown', inplace=True)  # Fill remaining missing values with 'Unknown'


In [26]:
# Encode categorical features
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [27]:
# Step 2: Feature Engineering
# No feature engineering in this example, but you can add custom features based on domain knowledge

# Step 3: Machine Learning Model Building
# Define the target variable and features
X = data.drop('er_visit', axis=1)
y = data['er_visit']

In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [30]:
# Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [31]:
# Make predictions
y_pred = clf.predict(X_test)


In [32]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [33]:
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.7953719239373602
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78      6778
           1       0.79      0.83      0.81      7526

    accuracy                           0.80     14304
   macro avg       0.80      0.79      0.79     14304
weighted avg       0.80      0.80      0.79     14304



In [34]:
# Step 4: Feature Importance
# Access feature importances from the trained model
feature_importances = clf.feature_importances_


In [35]:
# Print feature importance scores for each feature
for feature, importance in zip(X.columns, feature_importances):
    print(f'{feature}: {importance}')


Unnamed: 0: 0.09137537926171745
patient_nbr: 0.08793565037641944
race: 0.0161784585012174
gender: 0.011191109513464215
age: 0.03180819101344124
weight: 0.006596718953585204
time_in_hospital: 0.041648358146953786
payer_code: 0.023313229851581532
medical_specialty: 0.09191585469889559
num_lab_procedures: 0.1109553492450161
num_procedures: 0.06840349501166837
num_medications: 0.060459173018790544
number_outpatient: 0.012150871074280744
number_emergency: 0.007831018168734556
number_inpatient: 0.016964740512447948
diag_1: 0.07824745860543028
diag_2: 0.05467120114847738
diag_3: 0.05209295083805356
number_diagnoses: 0.03021955921202175
max_glu_serum: 0.017181561286493056
A1Cresult: 0.014250894802896773
metformin: 0.009987050861105708
repaglinide: 0.0018926877799047108
nateglinide: 0.0010045224161878057
chlorpropamide: 0.00011129352383393874
glimepiride: 0.005176451659425608
acetohexamide: 0.0
glipizide: 0.008557619624711755
glyburide: 0.007858637506543
tolbutamide: 7.622748751512195e-05
piogl