# ML Models for Length-of-Stay

In [None]:
import pandas as pd
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

# Pull a 1000-row sample of the PT-evaluation table through the notebook's
# `spark` session and convert it straight to a pandas DataFrame.
df = spark.sql(
    "SELECT * FROM PT_evals_lakehouse.pt_eval_standardized LIMIT 1000"
).toPandas()

# Select the MLflow experiment used by this notebook.
experiment_name = "experiment-pt-need-rehab"
mlflow.set_experiment(experiment_name)


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# RANSAC-wrapped linear regression for length of stay ('los'), tuned with a
# 5-fold grid search and evaluated on a held-out 20% split.
#
# Features are every column of `df` except the last three; the target is the
# 'los' column.
# NOTE(review): the frame name was missing in the original cell; `df` (the
# frame loaded at the top of the notebook) is assumed - confirm.
X, y = df.iloc[:, 0:-3], df.loc[:, 'los']

# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only and no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

standardizer = StandardScaler()
X_train = standardizer.fit_transform(X_train)
X_test = standardizer.transform(X_test)

# Data-driven inlier threshold: 1.4826 * median absolute deviation of the
# training target.
median_y = np.median(y_train)
mad_y = np.median(np.abs(y_train - median_y))
residual_threshold = 1.4826 * mad_y

# Only add the data-driven threshold when it is strictly positive (a zero MAD
# would give a threshold that admits no inliers), and use a set so it is not
# searched twice when it coincides with one of the fixed candidates.
threshold_candidates = {1.0, 1.4826, 2.0}
if residual_threshold > 0:
    threshold_candidates.add(float(residual_threshold))

param_grid = {
    'max_trials': [50, 100, 200],
    'min_samples': [0.5, 0.75, 0.95],
    'residual_threshold': sorted(threshold_candidates),
}

# Base estimator to tune; random_state pins RANSAC's random subsampling so the
# search is reproducible.
ransac = RANSACRegressor(LinearRegression(), random_state=0)

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=ransac, param_grid=param_grid, cv=5, scoring='r2')

# Fit the grid search
grid_search.fit(X_train, y_train)

best_ransac = grid_search.best_estimator_

# Make predictions with the best estimator
y_pred = best_ransac.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Baseline random forest for length of stay ('los') on unscaled features,
# reporting MAE and R^2 on both the training and the held-out 20% split.
#
# Features are every column of `df` except the last three.
# NOTE(review): the frame name was missing in the original cell; `df` (the
# frame loaded at the top of the notebook) is assumed - confirm.
X, y = df.iloc[:, 0:-3], df.loc[:, 'los']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

rfr = RandomForestRegressor(
    n_estimators=1000,
    random_state=1,
    n_jobs=-1,  # use all available cores
)
rfr.fit(X_train, y_train)

y_train_pred = rfr.predict(X_train)
y_test_pred = rfr.predict(X_test)

# Train and test scores are printed side by side so a train/test gap
# (overfitting) is visible.
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f' MAE train: {mae_train:.2f}')
print(f' MAE test: {mae_test:.2f}')

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f'R^2 train: {r2_train:.2f}')
print(f'R^2 test: {r2_test:.2f}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the random forest regressor.
# Expects X_train, X_test, y_train and y_test to already exist in the session
# from an earlier cell.
#
# The grid below has 4 * 6 * 4 * 4 * 3 = 1152 combinations; with cv=5 that is
# 5760 forest fits plus the final refit, so this cell can take a long time.
param_grid = {
    'n_estimators': [100, 250, 500, 750],
    'max_depth': [None, 3, 4, 5, 8, 10],
    'min_samples_split': [10, 15, 20, 30],
    'min_samples_leaf': [2, 4, 6, 8],
    'max_features': ['sqrt', 'log2', None],
}

grid_search_rfr = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,  # run candidate fits in parallel on all available cores
)

# Fit the grid search
grid_search_rfr.fit(X_train, y_train)

# Get the best estimator
best_rfr = grid_search_rfr.best_estimator_

# Predict on training and test sets
y_train_pred = best_rfr.predict(X_train)
y_test_pred = best_rfr.predict(X_test)

# Evaluate the model
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

best_params = grid_search_rfr.best_params_
print("Best Parameters:", best_params)
print(f' MAE train: {mae_train:.2f}')
print(f' MAE test: {mae_test:.2f}')

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f'R^2 train: {r2_train:.2f}')
print(f'R^2 test: {r2_test:.2f}')