In [0]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import issparse
# Folder that holds the pickled pipeline and feature matrices loaded below.
# Kept in one place so the location only has to be edited once.
ARTIFACT_DIR = "/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline"

# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only load artifacts that come from a trusted source.
pipeline = joblib.load(f"{ARTIFACT_DIR}/stedi_feature_pipeline.pkl")
X_train_transformed = joblib.load(f"{ARTIFACT_DIR}/X_train_transformed.pkl")
X_test_transformed = joblib.load(f"{ARTIFACT_DIR}/X_test_transformed.pkl")
def to_float_matrix(arr: np.ndarray) -> np.ndarray:
    """
    Convert a saved feature container into a dense float NumPy array.

    Saved feature arrays can come back in several forms after transformation;
    each is normalised as follows:

    * SciPy sparse matrix -> densified and cast to float.
    * 0-d object array wrapping a single payload (sparse or array-like)
      -> unwrapped with ``.item()``, then densified / cast to float.
    * object-dtype array whose elements are rows or row-blocks (sparse or
      array-like) -> every element is densified, cast to float and the
      pieces are stacked vertically. Blocks may have different heights as
      long as their column counts agree.
    * anything else -> copied as a float array.

    Parameters
    ----------
    arr : np.ndarray, SciPy sparse matrix, or other array-like
        The loaded feature container.

    Returns
    -------
    np.ndarray
        A float array. The result is 2-D whenever a sparse payload or an
        object array is involved; a plain numeric input keeps its own
        dimensionality.

    Raises
    ------
    ValueError
        If an element cannot be converted to float, if the stacked pieces
        have mismatched column counts, or if an object array is empty.
    """
    def _dense(piece):
        # Sparse payloads are densified first; everything ends up as float.
        if issparse(piece):
            return piece.toarray().astype(float, copy=False)
        return np.array(piece, dtype=float)

    # Sparse containers are checked first: they are not ndarrays and must not
    # be passed through np.asarray (which would wrap them in a 0-d object array).
    if issparse(arr):
        return _dense(arr)

    # Accept lists / DataFrames as well as ndarrays.
    arr = np.asarray(arr)

    if arr.ndim == 0:
        # A 0-d array is just a box around one payload.
        return _dense(arr.item())

    if arr.dtype == object:
        # Stack the pieces directly. Building an intermediate ndarray from the
        # list first would fail on blocks of unequal height.
        return np.vstack([_dense(piece) for piece in arr])

    return arr.astype(float)
# Turn both loaded feature containers into dense float arrays.
X_train, X_test = (
    to_float_matrix(features)
    for features in (X_train_transformed, X_test_transformed)
)

# Load the labels and flatten each to a 1-D array.
y_train = np.ravel(
    joblib.load("/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline/y_train.pkl")
)
y_test = np.ravel(
    joblib.load("/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline/y_test.pkl")
)

# Show the shapes of the four arrays as a quick sanity check.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search space: 4 values of C x 1 penalty x 2 solvers = 8 candidates.
log_reg_params = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l2"],
    solver=["lbfgs", "liblinear"],
)

# 3-fold cross-validated grid search scored on accuracy.
log_reg_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=300),
    param_grid=log_reg_params,
    scoring="accuracy",
    cv=3,
)
log_reg_grid.fit(X_train, y_train)

# Keep the winning configuration and its mean cross-validated accuracy.
log_reg_best_params, log_reg_best_score = (
    log_reg_grid.best_params_,
    log_reg_grid.best_score_,
)

(log_reg_best_params, log_reg_best_score)

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Search space: 3 x 4 x 2 x 2 = 48 candidates, each fitted on 3 folds.
rf_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# random_state is fixed so the forest's bootstrap / feature sampling is
# repeatable. Without it the CV scores, the best parameters and therefore the
# winning model can change from one run of the notebook to the next.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1  # use all available cores for the candidate fits
)

rf_grid.fit(X_train, y_train)

# Keep the winning configuration and its mean cross-validated accuracy.
rf_best_params = rf_grid.best_params_
rf_best_score = rf_grid.best_score_

rf_best_params, rf_best_score

In [0]:
# Side-by-side view of each tuned model's best cross-validated accuracy.
results = dict([
    ("Logistic Regression (tuned)", log_reg_best_score),
    ("Random Forest (tuned)", rf_best_score),
])
results


In [0]:
# Keep whichever tuned model has the higher best_score_.
# The random forest must be strictly better; on a tie the logistic
# regression is kept.
best_model, best_model_name = (
    (rf_grid.best_estimator_, "Random Forest")
    if rf_best_score > log_reg_best_score
    else (log_reg_grid.best_estimator_, "Logistic Regression")
)

(best_model_name, best_model)

In [0]:
# Persist the estimator chosen by the score comparison (`best_model`).
# The model is intentionally NOT re-assigned here: hard-coding the random
# forest would silently save the wrong model whenever logistic regression wins.
joblib.dump(best_model, "/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline/stedi_best_model.pkl")

In [0]:
import os

# List the pipeline folder's contents to confirm the saved files are there.
artifact_folder = '/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline/'
os.listdir(artifact_folder)

   
- Which model performed best?
    Random Forest performed best.
- How do you know? (accuracy? precision? recall?)
    It had the highest mean accuracy in 3-fold cross-validation (the grid search's `best_score_`). Precision and recall were not measured.
- What hyperparameters improved performance?
    Increasing n_estimators and tuning max_depth improved Random Forest.
- Any surprising results?
    Logistic Regression was less accurate than expected, likely due to noisy sensor data.
- What would you test next if you had more time? (for example, additional parameters, different models, or more data)
    I would test more hyperparameters, try other models like Gradient Boosting, and use more data.
- How could hyperparameter tuning accidentally make a model unfair or biased?
    It can make a model biased if tuning only optimizes for accuracy and ignores performance for different groups, causing the model to favor the majority group.
- Why is transparency important? How does the gospel teach us about honest evaluation?
    Transparency is always important: we need to make clear how our results were obtained, because if it isn't clear how we arrived at an optimized result, we might end up making a mistake on another set of data. The gospel teaches us to be true to ourselves and to what we believe, and we need to evaluate ourselves honestly to continue on this path, looking at the things we need to improve.