# Classic Machine Learning Models
This file contains code to run classic machine learning models (Linear Regression, Random Forest, Support Vector Regression, Gradient Boosting, Bayesian Ridge, and Gaussian Process Regressor) on the data. However, only seven columns with a small number of unique values were used, to keep the models as simple as possible. These columns are "age_group", "gender", "type_of_admission", "apr_severity_of_illness_description", "apr_risk_of_mortality", "apr_medical_surgical_description", and "emergency_department_indicator".

All of the columns were categorical and were one-hot encoded. A summary is printed showing the Mean Squared Error and R-squared score of each model.

The main purpose of the file is to see the results of the classic ML models on the data, in order to set a baseline for future models.

In [None]:
# Load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import BayesianRidge
from sklearn.gaussian_process import GaussianProcessRegressor

In [None]:
# Load the hospital CSV into a DataFrame.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv(filepath_or_buffer="hospital_cleaned.csv")

In [None]:
# Column the regressors are trained to predict
target = "length_of_stay"

# Predictor columns; every one of them is one-hot encoded before modelling
features = [
    "age_group",
    "gender",
    "type_of_admission",
    "apr_severity_of_illness_description",
    "apr_risk_of_mortality",
    "apr_medical_surgical_description",
    "emergency_department_indicator",
]

# Keep only the modelling columns, then discard any row with a missing value
# in one of them so the estimators never see NaN.
df_model = df[[*features, target]].dropna()

In [None]:
# 1. Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.loc[:, features],
    df_model.loc[:, target],
    test_size=0.2,
    random_state=42,
)

In [None]:
# 2. One-hot encode all categorical cols
# handle_unknown="ignore" lets categories that were not seen while fitting
# pass through transform without raising; sparse_output=False requests a
# dense array from the encoder.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            features,
        ),
    ]
)

In [None]:
# 3. Define models
# Maps a display name to an unfitted regressor; insertion order is the order
# in which the models are later trained and reported. Estimators that accept
# a seed are pinned to 42 for reproducibility.
# NOTE(review): SVR and GaussianProcessRegressor training cost grows much
# faster than linearly with the number of rows -- confirm the dataset is
# small enough for them to finish.
models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    SVR=SVR(),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
    BayesianRidge=BayesianRidge(),
    GPR=GaussianProcessRegressor(),
)

In [None]:
# 4. Fit & evaluate
# Train every candidate inside its own pipeline and score it on the held-out
# split. Each entry of `results` is a (name, MSE, R2) tuple.
results = []
for name, model in models.items():
    # The encoder is a pipeline step, so it is fitted on the training rows
    # only and then reused as-is to transform the test rows.
    pipe = Pipeline(steps=[("preproc", preprocessor), ("reg", model)])
    preds = pipe.fit(X_train, y_train).predict(X_test)

    # Error and goodness of fit on data the model has not seen
    mse = mean_squared_error(y_true=y_test, y_pred=preds)
    r2 = r2_score(y_true=y_test, y_pred=preds)

    results.append((name, mse, r2))
    print(f"{name:17s} → MSE: {mse:.3f},  R²: {r2:.3f}")

In [None]:
# 5. Summarize results
# Tabulate the collected (name, MSE, R2) tuples, one row per model,
# using the model name as the row label.
res_df = pd.DataFrame(
    data=results,
    columns=["model", "MSE", "R2"],
).set_index(keys="model")
print("\nSummary:\n", res_df)