File System Sanity Check

In [0]:
import os

# Data directory inside the Databricks repo checkout.
DATA_DIR = "/Workspace/Repos/win185@ensign.edu/Databricks/data"

# Sanity check: listing the directory raises right away if the path is wrong.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# displayed output stable between runs.
sorted(os.listdir(DATA_DIR))


1. Import Libraries and Load Your Feature Pipeline

In [0]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import issparse
import os

# Locations of the saved artifacts inside the repo checkout
REPO_ROOT = "/Workspace/Repos/win185@ensign.edu/Databricks"
ARTIFACTS_DIR = os.path.join(REPO_ROOT, "artifacts")

# Feature pipeline artifact.
# NOTE(review): joblib.load unpickles the file, and allow_pickle=True below
# does the same for the .npy arrays -- only load artifacts this project wrote.
pipeline_path = os.path.join(ARTIFACTS_DIR, "stedi_feature_pipeline.pkl")
pipeline = joblib.load(pipeline_path)

# Transformed feature arrays (train first, then test)
X_train, X_test = (
    np.load(os.path.join(ARTIFACTS_DIR, fname), allow_pickle=True)
    for fname in ("X_train_transformed.npy", "X_test_transformed.npy")
)

# Normalize
def to_float_matrix(arr):
    """Return ``arr`` as a dense NumPy array with float dtype.

    Handles the shapes a feature matrix can take after ``np.save`` /
    ``np.load(..., allow_pickle=True)``:

    * a SciPy sparse matrix passed in directly,
    * a 0-d object array wrapping a single object (e.g. a sparse matrix),
    * an object array whose elements are rows/blocks (sparse or array-like),
      which are stacked vertically,
    * a plain numeric array or any array-like (list of lists, ...).

    Parameters
    ----------
    arr : array-like or scipy.sparse matrix
        The data to convert.

    Returns
    -------
    numpy.ndarray
        Dense array with ``dtype=float``. Every branch casts, so integer
        sparse inputs also come back as float.
    """
    # Sparse input: densify, then cast (toarray() keeps the sparse dtype).
    if issparse(arr):
        return arr.toarray().astype(float)

    # Accept lists/tuples as well as ndarrays; a no-op for ndarrays.
    arr = np.asarray(arr)

    if arr.ndim == 0:
        # 0-d array: unwrap the single stored object before converting.
        inner = arr.item()
        if issparse(inner):
            inner = inner.toarray()
        return np.array(inner, dtype=float)

    if arr.dtype == object:
        # Object array: convert each element on its own, then stack row-wise.
        return np.vstack([
            x.toarray().astype(float) if issparse(x) else np.array(x, dtype=float)
            for x in arr
        ])

    return np.array(arr, dtype=float)

# Convert both feature sets to dense float arrays
X_train = to_float_matrix(X_train)
X_test = to_float_matrix(X_test)

# Load labels and flatten them to 1-D arrays
y_train = np.ravel(pd.read_pickle(os.path.join(ARTIFACTS_DIR, "y_train.pkl")))
y_test = np.ravel(pd.read_pickle(os.path.join(ARTIFACTS_DIR, "y_test.pkl")))

# Fail early, with a readable message, if the artifacts do not line up;
# otherwise the mismatch only surfaces later inside model.fit()/score().
assert X_train.ndim == 2 and X_test.ndim == 2, (
    f"expected 2-D feature matrices, got {X_train.shape} and {X_test.shape}"
)
assert X_train.shape[1] == X_test.shape[1], (
    f"train/test feature counts differ: {X_train.shape[1]} vs {X_test.shape[1]}"
)
assert X_train.shape[0] == y_train.shape[0], (
    f"X_train has {X_train.shape[0]} rows but y_train has {y_train.shape[0]} labels"
)
assert X_test.shape[0] == y_test.shape[0], (
    f"X_test has {X_test.shape[0]} rows but y_test has {y_test.shape[0]} labels"
)

2. Train Linear Regression and Random Forest Regressor (Baseline Models)

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Train Linear Regression and score it on the held-out set (R²).
# NOTE(review): LinearRegression is a regressor; if y_train holds class labels,
# LogisticRegression may be the intended baseline -- confirm.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_reg_score = lin_reg.score(X_test, y_test)

# Train Random Forest Regressor.
# A fixed random_state makes the forest (and therefore its R²) reproducible
# across Restart & Run All; without it the score changes on every run.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)
rf_reg_score = rf_reg.score(X_test, y_test)

# Optional: View results
results = {
    "Linear Regression (R²)": lin_reg_score,
    "Random Forest Regressor (R²)": rf_reg_score
}
results

Save Models and Metadata

In [0]:
import os
import joblib
from datetime import datetime

# Every run is written to its own timestamped folder under <repo>/models
MODELS_DIR = os.path.join(REPO_ROOT, "models")
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
run_path = os.path.join(MODELS_DIR, run_id)
os.makedirs(run_path, exist_ok=True)

# Persist both fitted regressors
for model_file, fitted_model in (
    ("linear_regression.joblib", lin_reg),
    ("random_forest_regressor.joblib", rf_reg),
):
    joblib.dump(fitted_model, os.path.join(run_path, model_file))

# Persist the run id and the two test-set scores next to the models
metadata = dict(
    run_id=run_id,
    linear_regression_r2=float(lin_reg_score),
    random_forest_regressor_r2=float(rf_reg_score),
)
joblib.dump(metadata, os.path.join(run_path, "metadata.joblib"))

# Show where this run was written
run_path

3. Train Random Forest Classifier (Baseline Model)

In [0]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest, and therefore the reported score,
# reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Score on the held-out test set
rf_score = rf.score(X_test, y_test)
rf_score

4. Compare Baseline Models

In [0]:
# Collect every baseline's test-set score in one place.
# The two regressors report R² while the classifier reports accuracy, so the
# metric is spelled out in each key -- the numbers are not directly comparable.
results = {
    "Linear Regression (R²)": lin_reg_score,
    "Random Forest Regressor (R²)": rf_reg_score,
    "Random Forest Classifier (accuracy)": rf_score,
}
results

5. Baseline Model Analysis

Looking back at the comparison, the stronger baseline was not just the one with flashier numbers but the one that behaved itself when conditions got messy. I remember running early experiments late at night, watching one option spike beautifully one moment and collapse the next, while the other trudged forward with steady confidence. That steadiness matters with noisy sensor streams because real environments are never quiet or polite. When results drift apart, curiosity should kick in. Was one approach clinging too tightly to quirks in the data? Did randomness or feature choices tip the scale? Asking those questions turns raw scores into understanding rather than trophies.

Testing before real use is a form of care. I’ve felt that knot in my stomach imagining a flawed prediction rippling outward, nudging decisions that touch actual lives. A bad model can mislabel, exclude, or quietly disadvantage people who never agreed to be part of an experiment. That’s where fairness stops being a buzzword and starts feeling personal. In data work and in discipleship, the same call shows up again and again: pay attention, act responsibly, and remember there are faces behind every output. Getting it right is less about perfection and more about humility, accountability, and choosing not to look away when errors could hurt someone else.


6. Save Your Trained Models

In [0]:
import os
import joblib
from datetime import datetime

# Base locations inside the Databricks repo; each run gets a timestamped folder
REPO_ROOT = "/Workspace/Repos/win185@ensign.edu/Databricks"
MODELS_DIR = os.path.join(REPO_ROOT, "models")
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
run_path = os.path.join(MODELS_DIR, run_id)
os.makedirs(run_path, exist_ok=True)

# Write the two trained regression models
lin_reg_file = os.path.join(run_path, "linear_regression.joblib")
rf_reg_file = os.path.join(run_path, "random_forest_regressor.joblib")
joblib.dump(lin_reg, lin_reg_file)
joblib.dump(rf_reg, rf_reg_file)

# Write the run id together with both R² scores (cast to plain floats)
metadata = {"run_id": run_id}
metadata["linear_regression_r2"] = float(lin_reg_score)
metadata["random_forest_regressor_r2"] = float(rf_reg_score)

metadata_file = os.path.join(run_path, "metadata.joblib")
joblib.dump(metadata, metadata_file)

# Display the folder this run was saved to
run_path

7. Zip Your Model Files into One File

In [0]:
import shutil
import os

# Zip output path inside the same repo's models folder
zip_path = os.path.join(REPO_ROOT, "models", f"stedi_models_{run_id}.zip")

# make_archive appends ".zip" itself, so it needs the path without the
# extension. os.path.splitext strips only the trailing suffix, whereas
# str.replace(".zip", "") would also rewrite any ".zip" earlier in the path.
shutil.make_archive(
    base_name=os.path.splitext(zip_path)[0],
    format="zip",
    root_dir=run_path,  # archive the contents of this run's folder
)

# Confirm zip path
zip_path