In [0]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import issparse
# Folder that holds the saved pipeline and the transformed feature artifacts.
# Defined once so the loads below cannot drift apart if the location changes.
CURATED_DIR = "/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline"

# NOTE(security): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts from a location you control and trust.
pipeline = joblib.load(f"{CURATED_DIR}/stedi_feature_pipeline.pkl")
X_train_transformed = joblib.load(f"{CURATED_DIR}/X_train_transformed.pkl")
X_test_transformed = joblib.load(f"{CURATED_DIR}/X_test_transformed.pkl")
def to_float_matrix(arr: np.ndarray) -> np.ndarray:
    """
    Convert a saved feature container into a dense float NumPy matrix.

    Saved feature arrays may come back in inconsistent forms, while ML models
    need a numeric 2-D array for training and prediction. Supported inputs:

    * a SciPy sparse matrix: densified;
    * a 0-d array wrapping a single object (possibly sparse): unwrapped,
      then converted;
    * an object-dtype array: every element is densified/converted and the
      pieces are stacked row-wise;
    * any other array-like (numeric ndarray, nested list, ...): converted
      by NumPy.

    Parameters
    ----------
    arr : array-like or SciPy sparse matrix
        The feature container to convert.

    Returns
    -------
    np.ndarray
        Array of dtype float. Scalar and 1-D results are reshaped to a single
        column ``(n, 1)``, which matches what the object-dtype path produces
        for a 1-D array of scalars. Results with more than two dimensions are
        returned with their shape unchanged.

    Raises
    ------
    ValueError
        If a value cannot be converted to float, or if the elements of an
        object array cannot be stacked (including an empty object array).
    """
    # Sparse matrices are not ndarrays, so detect them before relying on
    # ndarray attributes such as ``dtype``.
    if issparse(arr):
        dense = arr.toarray()
    else:
        if not isinstance(arr, np.ndarray):
            # Lists, DataFrames and similar containers become ndarrays here.
            arr = np.asarray(arr)

        if arr.ndim == 0:
            # A 0-d array is a thin wrapper around one Python object.
            inner = arr.item()
            dense = inner.toarray() if issparse(inner) else inner
        elif arr.dtype == object:
            # Stack the pieces directly: vstack promotes 1-D rows to (1, n),
            # so rows stored as (n,) and as (1, n) can be mixed.
            dense = np.vstack([
                x.toarray() if issparse(x) else np.array(x, dtype=float)
                for x in arr
            ])
        else:
            # Copy, so the caller's array is never aliased by the result.
            dense = np.array(arr, dtype=float)

    # One final cast covers every branch (e.g. integer-valued sparse input).
    out = np.asarray(dense, dtype=float)
    if out.ndim < 2:
        out = out.reshape(-1, 1)
    return out
# Run both feature splits through the same helper so they share one layout.
X_train, X_test = (
    to_float_matrix(split) for split in (X_train_transformed, X_test_transformed)
)

# Labels are flattened to 1-D as soon as they are read from disk.
y_train = np.ravel(
    joblib.load("/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline/y_train.pkl")
)
y_test = np.ravel(
    joblib.load("/Workspace/Users/pedrofp@ensign.edu/stedi_curated_pipeline/y_test.pkl")
)

# Cell output: the four shapes, as a quick check of the loaded data.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [0]:
from sklearn.linear_model import LogisticRegression

# Linear baseline. fit() returns the estimator itself, so construction and
# training are chained into a single statement.
log_reg = LogisticRegression(max_iter=300).fit(X_train, y_train)

# Score on the held-out split; shown as the cell output.
log_reg_score = log_reg.score(X_test, y_test)
log_reg_score

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed: the forest draws random bootstrap samples and random feature
# subsets, so an unseeded model reports a different accuracy on every run and
# the comparison with the logistic-regression baseline is not reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Score on the held-out split; shown as the cell output.
rf_score = rf.score(X_test, y_test)
rf_score

In [0]:
# Pair each baseline's label with its test-set score for side-by-side display.
results = dict(
    zip(
        ("Logistic Regression baseline", "Random Forest baseline"),
        (log_reg_score, rf_score),
    )
)
results

## Model Comparison Discussion
* Which baseline model performed better?
    - Logistic Regression
* Which model seems more stable for noisy sensor data?
    - Random Forest, because it averages many trees and handles noise better.
* What questions do you have about why the numbers differ?
    - Are features, preprocessing, and data balance the same for both models?
* Why is it important to test your model before using it in real life?
    - To ensure it works on new data and avoids harmful mistakes.
* If a model is wrong, who could be affected?
    - Anyone relying on its predictions: users, customers, organizations.
* Why does fairness matter in both data science and discipleship?
    - It prevents harm, builds trust, and reflects respect. We do our best to pick the best option, and that is always better for everyone.
    

## Model Training Summary

* I trained two machine-learning algorithms: Logistic Regression and Random Forest.
* I set the `max_iter` hyperparameter to 300 for Logistic Regression and used default settings for Random Forest.
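* Random Forest performed best, achieving higher accuracy on the test set. **TODO:** reconcile this with the Model Comparison Discussion, which names Logistic Regression as the better baseline; check both statements against the scores in `results`.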
* I selected Random Forest because it is more robust to noisy sensor data and generalizes better for step detection.

In [0]:
import os
import joblib
from datetime import datetime
# Timestamp-based id so that every run writes into its own folder.
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
base_dir = f"/Workspace/Users/pedrofp@ensign.edu/stedi_models/{run_id}"
# The id only has one-second resolution, so refuse to reuse an existing folder
# (raises FileExistsError) instead of silently overwriting an earlier run.
os.makedirs(base_dir, exist_ok=False)

# Save trained models
joblib.dump(log_reg, f"{base_dir}/log_reg.joblib")
joblib.dump(rf, f"{base_dir}/random_forest.joblib")

# Save accuracy information (metadata); float() stores the scores as plain
# Python floats.
metadata = {
    "run_id": run_id,
    "logistic_regression_accuracy": float(log_reg_score),
    "random_forest_accuracy": float(rf_score),
}
joblib.dump(metadata, f"{base_dir}/metadata.joblib")

# Cell output: the folder the artifacts were written to.
base_dir

In [0]:
import shutil
# Build the archive stem directly. Deriving it by stripping ".zip" from the
# full path with str.replace would remove every ".zip" occurrence, not only
# the trailing suffix.
archive_stem = f"/Workspace/Users/pedrofp@ensign.edu/stedi_models/{run_id}"

# make_archive appends ".zip" to the stem and returns the path it wrote.
zip_path = shutil.make_archive(archive_stem, "zip", base_dir)
zip_path