# XGBoost Model
This file contains code to run an XGBoost model on the dataset. 

All feature columns were used, and the categorical columns were one-hot encoded. 

The main purpose of the file is to see how another type of model works on the dataset to confirm that the third neural network produced the best results. 

In [None]:
# Load the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the CSV dataset into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="hospital_los_21.csv")

In [None]:
# Choose the prediction target and the input columns fed to the model
target = "length_of_stay"
features = [
    "age_group",
    "gender",
    "type_of_admission",
    "ccsr_diagnosis_description",
    "ccsr_procedure_description",
    "apr_drg_description",
    "apr_severity_of_illness_description",
    "apr_risk_of_mortality",
    "apr_medical_surgical_description",
    "emergency_department_indicator",
    "apr_mdc_description",
]

# X holds the selected input columns, y the target column
X, y = df[features], df[target]

In [None]:
# Build the one-hot encoder applied to every feature column.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros,
# and sparse_output=False makes the encoder return a dense array.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Route all listed feature columns through the encoder; any column not
# listed in `features` is dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, features)],
    remainder='drop',
)

# Learn the category vocabulary from X and encode X in a single pass
X_encoded = preprocessor.fit_transform(X)

In [None]:
# Split the training and testing data and train the model
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Carve a validation split out of the training data for early stopping.
# XGBoost uses the LAST entry of eval_set to decide when to stop, so passing
# the test split there would let the test data pick the number of boosting
# rounds and bias any test-set metric computed afterwards. X_test / y_test
# stay untouched until final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,          # upper bound; early stopping usually ends sooner
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,              # row sampling per tree
    colsample_bytree=0.8,       # column sampling per tree
    random_state=42,
    early_stopping_rounds=10,   # stop after 10 rounds without validation improvement
    eval_metric='rmse'
)
xgb_model.fit(
    X_fit, y_fit,
    # validation_0 = data the model is fitted on (logged only),
    # validation_1 = validation split (drives early stopping)
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    verbose=True
)