# MLFlow Logistic Regression Model - need for rehabilitation

In [15]:
import pandas as pd
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

df = spark.sql("SELECT * FROM PT_evals_lakehouse.pt_eval_generated LIMIT 1000")
df = df.toPandas()

experiment_name = "experiment-pt-need-for-rehab"
mlflow.set_experiment(experiment_name)

mlflow.autolog(log_models=False)

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 17, Finished, Available, Finished)

2024/08/08 16:40:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


### Logistic Regression
We will now compare Logistic Regression to our Random Forest model. I will continue to use the reduced DataFrame as it contains the most meaningful data. 

I will also switch over to the sci-kit learn library to perform this analysis. 

In order to accomplish logistic regression, we will need to encode our:
* nominal features(sex, fusion, provider, prior_loc, ad), 
* ordinal features (brace, pain, sup_sit, sit_stand, amb_assist, stairs_assist) \


and standardize the rest of the numerical features (age, ste, hr, num_falls, amb_distance, num_stairs).

In [16]:
## Ordinal feature encoding of pain into two groups: mild+moderate and severe
df['pain_severe'] = df['pain'].apply(lambda x: 1 if x == 'severe' else 0)
df['pain_min_mod'] = df['pain_severe'].apply(lambda x: 0 if x == 1 else 1)
del df['pain']

## Ordinal brace encoding
df['brace_none'] = df['brace'].apply(lambda x: 1 if x =='none' else 0)
df['brace_lso'] = df['brace'].apply(lambda x: 1 if x =='lso' else 0)
df['brace_tlso'] = df['brace'].apply(lambda x: 1 if x =='tlso' else 0)
del df['brace']

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 18, Finished, Available, Finished)

### Mobility

As it stands, there is a significant amount of colinear data contained in sup_sit, sit_stand, amb_assist, and stairs_assist. An individual will not have ambulated if they did not stand. They will not stand if they have not performed supine to sit. It may be best to converge supine to sit and sit to stand into a single binary column which demonstrates whether the patient was able to stand at all. Amb_assist and stairs_assist may also be excluded as we might surmise that if the patient performed a long ambulation or performed several steps, they did not require as much assistance. 

In [17]:
df['stand'] = df['sit_stand'].apply(lambda x: 1 if x in (0,1,2,3) else 0)
del df['sup_sit']
del df['sit_stand']
del df['stairs_assist']
del df['amb_assist']

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 19, Finished, Available, Finished)

In [18]:
## Nominal encoding of sex, fusion, provider, prior_loc, ad
prior_loc_dict = {
    '1sh': 0,
    '2sh': 1,
    'apt': 2,
    'ilf': 3,
    'alf': 4,
    'snf': 5,
    'ipr': 6,
    'ltc': 7
}

ad_dict = {
    'none': 0,
    'cane': 1,
    'walker': 2
}

inverse_prior_loc_mapping = {v: k for k,v in prior_loc_dict.items()}
inverse_ad_mapping = {v: k for k,v in ad_dict.items()}

df['prior_loc'] = df['prior_loc'].map(inverse_prior_loc_mapping)
df['ad'] = df['ad'].map(inverse_ad_mapping)


col_encoded = pd.get_dummies(df[['sex', 'fusion', 'provider', 'prior_loc', 'ad']], dtype = 'int64')

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 20, Finished, Available, Finished)

In [19]:
df = pd.concat([df, col_encoded], ignore_index = False, axis = 1)

del df['sex']
del df['fusion']
del df['provider']
del df['prior_loc']
del df['ad']
del df['Prior_location']
del df['tx_prior_loc']
del df['tx_need_rehab']

## Reorganized columns so that labels are at the end for convenience
df = (
    df[
    [col for col in df.columns if col not in ['los', 'dc_loc', 'need_rehab']]
    + ['los'] + ['dc_loc'] + ['need_rehab']]
)

# IMPORTANT - This was used to create the DataFrame to be used in regressions to predict Length-of-Stay
# spark_df = spark.createDataFrame(df)
# spark_df.write.mode('append').saveAsTable('pt_eval_standardized')

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 21, Finished, Available, Finished)

### Standardization

I will split the data into training and testing samples which are stratified on the need_rehab label. Considering the dataset is relatively small and may contain significant outliers, we will use the RobustScaler class from Scikit-Learn to standardize our data.

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

X, y = df.iloc[:,0:-3], df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

standardizer = RobustScaler()

X_train_std = standardizer.fit_transform(X_train)
X_test_std = standardizer.fit_transform(X_test)

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 22, Finished, Available, Finished)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'penalty': ['l2']
}
log_reg = LogisticRegression(solver='newton-cholesky', multi_class = 'ovr' )
gridsearch_lr = GridSearchCV(estimator = log_reg, param_grid=param_grid, cv=5, scoring='accuracy')

gridsearch_lr.fit(X_train_std, y_train)

input_schema = Schema([
    ColSpec("integer", "age"),
    ColSpec("integer", "ste"),
    ColSpec("integer", "hr"),
    ColSpec("integer", "num_falls"),
    ColSpec("integer", "amb_distance"),
    ColSpec("integer", "num_stairs"),
    ColSpec("integer", "pain_severe"),
    ColSpec("integer", "pain_min_mod"),
    ColSpec("integer", "brace_none"),
    ColSpec("integer", "brace_lso"),
    ColSpec("integer", "brace_tlso"),
    ColSpec("integer", "brace_aspen"),
    ColSpec("integer", "stand"),
    ColSpec("integer", "sex_female"),
    ColSpec("integer", "sex_male"),
    ColSpec("integer", "fusion_cervical"),
    ColSpec("integer", "fusion_lumbar"),
    ColSpec("integer", "provider_Kuzak"),
    ColSpec("integer", "provider_Myers"),
    ColSpec("integer", "provider_Nolan"),
    ColSpec("integer", "provider_Smith"),
    ColSpec("integer", "provider_Woo"),
    ColSpec("integer", "prior_loc_1sh"),
    ColSpec("integer", "prior_loc_2sh"),
    ColSpec("integer", "prior_loc_alf"),
    ColSpec("integer", "prior_loc_apt"),
    ColSpec("integer", "prior_loc_ilf"),
    ColSpec("integer", "prior_loc_ipr"),
    ColSpec("integer", "prior_loc_ltc"),
    ColSpec("integer", "prior_loc_snf"),
    ColSpec("integer", "ad_cane"),
    ColSpec("integer", "ad_none"),
    ColSpec("integer", "ad_walker")
])

output_schema = Schema([ColSpec("integer")])
signature = ModelSignature(inputs=input_schema, outputs = output_schema)

mlflow.sklearn.log_model(gridsearch_lr, "logistic regression model", signature = signature)
    
best_params = gridsearch_lr.best_params_
print("Best Parameters:", best_params)

best_model = gridsearch_lr.best_estimator_
y_pred = best_model.predict(X_test_std)


accuracy = accuracy_score(y_test, y_pred)
print('Training Accuracy:', gridsearch_lr.score(X_train_std, y_train))
print("Test Set Accuracy:", accuracy)



StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 23, Finished, Available, Finished)

2024/08/08 16:40:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best Parameters: {'C': 1, 'penalty': 'l2'}
Training Accuracy: 0.7975
Test Set Accuracy: 0.76


In [22]:
from sklearn.metrics import confusion_matrix

# Create confusion matrix
cm_lr = confusion_matrix(y_true = y_test, y_pred = y_pred)

StatementMeta(, aab0ec56-16ce-4483-a8be-6dc82c225c2b, 24, Finished, Available, Finished)