In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import os



In [3]:
# Load the metadata
HOME_PATH = 'D:\\Softwarica\\Thesis File\\data\\raw\\'
df_helper = pd.read_csv(HOME_PATH + 'ED_metadata.csv', header=0)

width = df_helper['width'].tolist()
col_names = df_helper['column_name'].tolist()
variable_types = df_helper['variable_type'].tolist()
print(df_helper.head(n=5))


   width column_name  variable_type
0      2      VMONTH    CATEGORICAL
1      1       VDAYR    CATEGORICAL
2      4     ARRTIME  NONPREDICTIVE
3      4    WAITTIME     CONTINUOUS
4      4         LOV  NONPREDICTIVE


In [4]:
# Read the fixed-width formatted file into DataFrame
df_ed = pd.read_fwf(HOME_PATH + 'ED2013', widths=width, header=None, dtype='str')
df_ed.columns = col_names


In [6]:
print(df_ed.head(n=5))

  VMONTH VDAYR ARRTIME WAITTIME   LOV  AGE AGER AGEDAYS RESIDNCE SEX  ...  \
0     01     3    0647     0033  0058  046    4     -07       01   2  ...   
1     01     3    1841     0109  0150  056    4     -07       01   2  ...   
2     01     3    1333     0084  0198  037    3     -07       01   2  ...   
3     01     3    1401     0159  0276  007    1     -07       01   1  ...   
4     01     4    1947     0114  0248  053    4     -07       01   1  ...   

  RX12V3C1 RX12V3C2 RX12V3C3 RX12V3C4 SETTYPE  YEAR   CSTRATM   CPSUM   PATWT  \
0      NaN      NaN      NaN      NaN       3  2013  20113201  100020  002945   
1      NaN      NaN      NaN      NaN       3  2013  20113201  100020  002945   
2      NaN      NaN      NaN      NaN       3  2013  20113201  100020  002945   
3      NaN      NaN      NaN      NaN       3  2013  20113201  100020  002945   
4      NaN      NaN      NaN      NaN       3  2013  20113201  100020  002945   

  EDWT  
0  NaN  
1  NaN  
2  NaN  
3  NaN  
4  Na

In [5]:
print(df_ed.shape)


(24777, 579)


In [6]:
# Convert categorical variables to numerical
le = LabelEncoder()
for col in df_ed.select_dtypes(include=['object']).columns:
    df_ed[col] = le.fit_transform(df_ed[col].astype(str))


In [7]:
# Convert 'TEMPF' column to numeric
df_ed.loc[:,'TEMPF'] = df_ed.loc[:,'TEMPF'].apply(pd.to_numeric, errors='coerce')

# Convert other categorical variables to numerical
le = LabelEncoder()
for col in df_ed.select_dtypes(include=['object']).columns:
    df_ed[col] = le.fit_transform(df_ed[col].astype(str))

# Handle missing values by filling them with the mean of the column
df_ed = df_ed.fillna(df_ed.mean())

# Ensure the response columns are numeric
response_cols = ['MED1', 'MED2', 'MED3', 'MED4', 'MED5', 'MED6', 'MED7', 'MED8', 'MED9', 'MED10', 'MED11', 'MED12']
df_ed.loc[:, response_cols] = df_ed.loc[:, response_cols].apply(pd.to_numeric, errors='coerce')


In [8]:
# Scale the numerical variables
scaler = StandardScaler()
numeric_cols = df_ed.select_dtypes(include=['int64', 'float64']).columns
df_ed[numeric_cols] = scaler.fit_transform(df_ed[numeric_cols])


In [9]:
# Verify the predictor columns exist
predictor_cols = ['AGE', 'SEX', 'ETHUN', 'RACEUN', 'TEMPF', 'PULSE', 'RESPR', 'BPSYS', 'BPDIAS', 'IMMEDR', 'PAINSCALE', 'INJURY', 'CAUSE1', 'DIAG1', 'PRDIAG1', 'CANCER', 'CEBVD', 'COPD', 'EDDIAL', 'CHF', 'DEMENTIA', 'DIABETES', 'MIHX', 'DVT', 'EDHIV', 'ABG', 'BAC', 'BLOODCX', 'BNP', 'BUNCREAT', 'CARDENZ', 'CBC', 'DDIMER', 'ELECTROL', 'GLUCOSE', 'LACTATE', 'LFT', 'PTTINR', 'OTHERBLD', 'CARDMON', 'EKG', 'HIVTEST', 'FLUTEST', 'PREGTEST', 'TOXSCREN', 'URINE', 'WOUNDCX', 'URINECX', 'OTHIMAGE', 'XRAY', 'IVCONTRAST', 'CATSCAN', 'CTAB', 'CTCHEST', 'CTHEAD', 'CTOTHER', 'MRI', 'ULTRASND', 'OTHIMAGE']
print("Predictor columns present:", [col for col in predictor_cols if col in df_ed.columns])

# Filter the DataFrame to only include the existing predictor columns
existing_predictor_cols = [col for col in predictor_cols if col in df_ed.columns]
X = df_ed[existing_predictor_cols]
y = df_ed[response_cols]


Predictor columns present: ['AGE', 'SEX', 'ETHUN', 'RACEUN', 'TEMPF', 'PULSE', 'RESPR', 'BPSYS', 'BPDIAS', 'IMMEDR', 'PAINSCALE', 'INJURY', 'CAUSE1', 'DIAG1', 'PRDIAG1', 'CANCER', 'CEBVD', 'COPD', 'EDDIAL', 'CHF', 'DEMENTIA', 'DIABETES', 'MIHX', 'DVT', 'EDHIV', 'ABG', 'BAC', 'BLOODCX', 'BNP', 'BUNCREAT', 'CARDENZ', 'CBC', 'DDIMER', 'ELECTROL', 'GLUCOSE', 'LACTATE', 'LFT', 'PTTINR', 'OTHERBLD', 'CARDMON', 'EKG', 'HIVTEST', 'FLUTEST', 'PREGTEST', 'TOXSCREN', 'URINE', 'WOUNDCX', 'URINECX', 'OTHIMAGE', 'XRAY', 'IVCONTRAST', 'CATSCAN', 'CTAB', 'CTCHEST', 'CTHEAD', 'CTOTHER', 'MRI', 'ULTRASND', 'OTHIMAGE']


In [10]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

