The `Feature View` is now saved in Hopsworks and you can retrieve it using `FeatureStore.get_feature_view(name='...', version=1)`.

---

In [35]:
# Bootstrapping: locate the repo root and load its .env

import sys
from pathlib import Path
from dotenv import load_dotenv
import os

# Find the repo root: the nearest directory, starting at the current working
# directory and walking upwards, that contains a .env file.
start_dir = Path().resolve()
root = next(
    (candidate for candidate in (start_dir, *start_dir.parents) if (candidate / ".env").exists()),
    None,
)
if root is None:
    # Without this guard an unsuccessful search would end at the filesystem root,
    # so sys.path and every path derived from root_dir would silently point at "/".
    # Fall back to the working directory and say so.
    print(f"Warning: no .env found in {start_dir} or any parent; using it as root")
    root = start_dir

root_dir = str(root)
print("Root dir:", root_dir)

# Make project-local modules importable from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)

load_dotenv(Path(root_dir) / ".env")

# Fail fast: the Hopsworks cells further down need this key.
assert os.getenv("HOPSWORKS_API_KEY"), "Missing HOPSWORKS_API_KEY in .env"
print("Loaded .env successfully")

Root dir: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project
Loaded .env successfully


In [36]:
import json
import joblib
import hopsworks
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, confusion_matrix

In [37]:
# paths, and training config

# Input datasets are read from <repo root>/data_cache.
DATA_DIR = Path(root_dir) / "data_cache"

# One entry per model to train. The dict key doubles as the artifact sub-folder
# name and as part of the model-registry name used further down.
DATASETS = {
    # (dataset_path, target_col, human_label)
    "energy_modeA": (DATA_DIR / "mcphases_energy_modeA.parquet", "y_energy_cls3", "Energy (Mode A)"),
    "energy_modeB": (DATA_DIR / "mcphases_energy_modeB.parquet", "y_energy_cls3", "Energy (Mode B, lag1)"),
    "mood_modeA":   (DATA_DIR / "mcphases_mood_modeA.parquet", "y_mood_stability_cls3", "Mood stability (Mode A)"),
    "mood_modeB":   (DATA_DIR / "mcphases_mood_modeB.parquet", "y_mood_stability_cls3", "Mood stability (Mode B, lag1)"),
}

# Trained models and their metadata are written under <repo root>/artifacts/models.
# The directory is created eagerly; mkdir is a no-op when it already exists.
ARTIFACTS_DIR = Path(root_dir) / "artifacts" / "models"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Seed and held-out fraction; used as default arguments by group_split and
# train_and_eval_tree_model, and recorded in each model's info.json.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# UI-consistency: user won't type day_in_study, so we drop it from features by default.
# subject_id is dropped as well; it is only used as the grouping key for the split.
DROP_FEATURE_COLS = ["subject_id", "day_in_study"]

In [38]:
#loading and prepping dataset

def load_xy(path: Path, target_col: str, drop_cols=DROP_FEATURE_COLS):
    """Read one parquet dataset and split it into features, target and groups.

    Parameters
    ----------
    path : Path
        Parquet file to read.
    target_col : str
        Name of the label column; it is cast to int and removed from the features.
    drop_cols : list of str
        Extra columns to exclude from the features. Names that are not present
        in the file are ignored.

    Returns
    -------
    tuple
        ``(X, y, groups, df)``: the feature frame with +/-inf replaced by NaN,
        the integer target, the integer ``subject_id`` series used for
        subject-wise splitting, and the unmodified frame as loaded.

    Raises
    ------
    AssertionError
        If ``subject_id`` or ``target_col`` is missing from the file.
    """
    frame = pd.read_parquet(path)

    # Both columns are required downstream; fail early with the file name.
    assert "subject_id" in frame.columns, f"{path.name} missing subject_id"
    assert target_col in frame.columns, f"{path.name} missing target {target_col}"

    labels = frame[target_col].astype(int)
    subject_ids = frame["subject_id"].astype(int)

    # Only drop what actually exists, then always drop the target itself.
    to_remove = [name for name in drop_cols if name in frame.columns]
    to_remove.append(target_col)

    # Infinite values are turned into NaN so they are handled like any other missing value.
    features = frame.drop(columns=to_remove).replace([np.inf, -np.inf], np.nan)

    return features, labels, subject_ids, frame


In [39]:
#subject-wise training/testing split, so that no leakage across people

def group_split(X, y, groups, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Split features, target and groups so that no group spans both sides.

    A single ``GroupShuffleSplit`` draw is used, so every row of a given group
    (here: subject) ends up either in the train part or in the test part.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, g_train, g_test)``, each an
        independent copy of the selected rows.
    """
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_rows, test_rows = next(gss.split(X, y, groups=groups))

    # Slice X, y and groups with the same positional indices, train first.
    parts = []
    for obj in (X, y, groups):
        parts.append(obj.iloc[train_rows].copy())
        parts.append(obj.iloc[test_rows].copy())

    return tuple(parts)

In [40]:
#training and evaluating the model

def train_and_eval_tree_model(X_train, y_train, X_test, y_test, random_state=RANDOM_STATE):
    """Fit a median-imputed random forest and score it on the held-out rows.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used only for scoring.
    random_state
        Seed passed to the random forest.

    Returns
    -------
    tuple
        ``(model, metrics, report)``: the fitted pipeline, a dict with
        ``accuracy``, ``balanced_accuracy``, ``f1_macro``, ``confusion_matrix``
        (nested lists) and ``majority_baseline_accuracy``, and the text of
        scikit-learn's classification report.
    """
    forest = RandomForestClassifier(
        n_estimators=800,
        min_samples_leaf=2,
        max_features="sqrt",
        class_weight="balanced",
        n_jobs=-1,
        random_state=random_state,
    )
    # Missing values are filled with the per-column median before the forest sees them.
    pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("clf", forest),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Reference point: always predict the most frequent class of the training labels.
    most_common_class = int(y_train.value_counts().idxmax())
    baseline_accuracy = float((y_test == most_common_class).mean())

    # Everything is converted to plain Python types so the dict is JSON-serialisable.
    scores = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_test, y_pred)),
        "f1_macro": float(f1_score(y_test, y_pred, average="macro")),
        "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
        "majority_baseline_accuracy": baseline_accuracy,
    }

    return pipeline, scores, classification_report(y_test, y_pred, digits=4)

In [41]:
#training all four models

results = {}         # dataset key -> metrics dict returned by train_and_eval_tree_model
trained_models = {}  # dataset key -> fitted pipeline

# Train, evaluate and persist one model per DATASETS entry.
for key, (path, target_col, label) in DATASETS.items():
    print("\n" + "="*80)
    print(label)
    print("File:", path.name, "| Target:", target_col)

    # raw_df (the unmodified frame) is returned by load_xy but not used in this loop.
    X, y, groups, raw_df = load_xy(path, target_col)

    print("X shape:", X.shape, "| y labeled:", y.shape[0])
    print("Classes:", sorted(y.unique().tolist()))
    print("Feature count:", X.shape[1])

    # Subject-wise split: each subject_id lands entirely in train or in test.
    X_train, X_test, y_train, y_test, g_train, g_test = group_split(X, y, groups)

    print("Train subjects:", g_train.nunique(), "| Test subjects:", g_test.nunique())
    print("Train rows:", len(X_train), "| Test rows:", len(X_test))

    model, metrics, report = train_and_eval_tree_model(
        X_train, y_train, X_test, y_test,
   )

    print("Accuracy:", metrics["accuracy"])
    print("Balanced acc:", metrics["balanced_accuracy"])
    print("F1 macro:", metrics["f1_macro"])
    print("Majority baseline acc:", metrics["majority_baseline_accuracy"])
    print("\nClassification report:\n", report)
    print("\nConfusion matrix:\n", np.array(metrics["confusion_matrix"]))

    # Save artifacts in a per-dataset folder so the models do not clobber each
    # other. Re-running this cell replaces the files already in that folder.
    out_dir = ARTIFACTS_DIR / key
    out_dir.mkdir(parents=True, exist_ok=True)

    joblib.dump(model, out_dir / "model.joblib")

    # Save feature columns (in order)
    feature_cols = list(X.columns)
    (out_dir / "feature_columns.json").write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")

    # Save metrics
    (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")

    # Save the run configuration (source file, target, split settings) next to the model
    (out_dir / "info.json").write_text(json.dumps({
        "dataset_file": path.name,
        "target_col": target_col,
        "label": label,
        "drop_feature_cols": DROP_FEATURE_COLS,
        "random_state": RANDOM_STATE,
        "test_size": TEST_SIZE
    }, indent=2), encoding="utf-8")

    results[key] = metrics
    trained_models[key] = model

print("\nDone. Saved models to:", ARTIFACTS_DIR)


Energy (Mode A)
File: mcphases_energy_modeA.parquet | Target: y_energy_cls3
X shape: (3331, 10) | y labeled: 3331
Classes: [0, 1, 2]
Feature count: 10


Train subjects: 33 | Test subjects: 9
Train rows: 2597 | Test rows: 734









































Accuracy: 0.5858310626702997
Balanced acc: 0.5941123674592651
F1 macro: 0.5870742161064743
Majority baseline acc: 0.41825613079019075

Classification report:
               precision    recall  f1-score   support

           0     0.4675    0.6117    0.5300       188
           1     0.6000    0.5179    0.5559       307
           2     0.6996    0.6527    0.6753       239

    accuracy                         0.5858       734
   macro avg     0.5890    0.5941    0.5871       734
weighted avg     0.5985    0.5858    0.5882       734


Confusion matrix:
 [[115  48  25]
 [106 159  42]
 [ 25  58 156]]

Energy (Mode B, lag1)
File: mcphases_energy_modeB.parquet | Target: y_energy_cls3
X shape: (3085, 13) | y labeled: 3085
Classe

In [26]:
# Dump the metrics of every trained model as JSON.
# NOTE: relies on `results` having been filled by the training-loop cell.
print(json.dumps(results, indent=2))

{
  "energy_modeA": {
    "accuracy": 0.5858310626702997,
    "balanced_accuracy": 0.5941123674592651,
    "f1_macro": 0.5870742161064743,
    "confusion_matrix": [
      [
        115,
        48,
        25
      ],
      [
        106,
        159,
        42
      ],
      [
        25,
        58,
        156
      ]
    ],
    "majority_baseline_accuracy": 0.41825613079019075
  },
  "energy_modeB": {
    "accuracy": 0.6562962962962963,
    "balanced_accuracy": 0.6470354808590103,
    "f1_macro": 0.650019116983309,
    "confusion_matrix": [
      [
        93,
        65,
        12
      ],
      [
        61,
        185,
        34
      ],
      [
        13,
        47,
        165
      ]
    ],
    "majority_baseline_accuracy": 0.4148148148148148
  },
  "mood_modeA": {
    "accuracy": 0.5653950953678474,
    "balanced_accuracy": 0.4919857863674008,
    "f1_macro": 0.4824029711583715,
    "confusion_matrix": [
      [
        22,
        40,
        60
      ],
      [
     

In [33]:
# Connect to Hopsworks and fetch the project's model registry.
# NOTE(review): presumably authenticates with the HOPSWORKS_API_KEY loaded from .env - confirm.
project = hopsworks.login(engine="python")
# `mr` is a module-level handle; register_model below reads it as a global.
mr = project.get_model_registry()
print("Logged in to Hopsworks, got Model Registry.")


def numeric_only(metrics: dict) -> dict:
    """Keep only the scalar numeric entries of ``metrics``, cast to ``float``.

    Hopsworks only accepts scalar numeric values as metrics, so nested values
    such as a confusion matrix, strings and ``None`` are dropped.

    Parameters
    ----------
    metrics : dict
        Metric name -> value, possibly containing non-scalar entries.

    Returns
    -------
    dict
        A new dict with the same key order, containing only the entries whose
        value is a Python or NumPy integer/float; each value is a built-in
        ``float``. Booleans are excluded even though ``bool`` subclasses ``int``.
    """
    # NumPy scalars (e.g. np.int64, np.float32) are accepted too, so metrics
    # that were not round-tripped through JSON are not silently dropped.
    numeric_types = (int, float, np.integer, np.floating)
    return {
        name: float(value)
        for name, value in metrics.items()
        if isinstance(value, numeric_types) and not isinstance(value, bool)
    }

def register_model(model_key: str, model_dir: Path, metrics: dict, description: str) -> None:
    """Create a model-registry entry for one trained model and upload its folder.

    Uses the module-level registry handle ``mr``, which must exist before this
    function is called.

    Parameters
    ----------
    model_key : str
        Short dataset/model key; the registry name is
        ``mcphases_<model_key>_randomforest``.
    model_dir : Path
        Local folder whose contents are uploaded with the model.
    metrics : dict
        Full metrics dict; only the entries kept by ``numeric_only`` are
        attached to the registry entry.
    description : str
        Free-text description stored with the model.
    """
    # Strip non-scalar entries (e.g. the confusion matrix) before sending metrics.
    hw_metrics = numeric_only(metrics)

    model = mr.python.create_model(
        name=f"mcphases_{model_key}_randomforest",
        metrics=hw_metrics,
        description=description,
    )

    # Upload the whole folder (contains model.joblib + feature_columns.json + full metrics.json etc.)
    model.save(str(model_dir))
    print(f"Registered: mcphases_{model_key}_randomforest | metrics={list(hw_metrics.keys())}")

# Register every model listed in DATASETS from the artifacts saved on disk.
for key, (path, target_col, label) in DATASETS.items():
    model_dir = ARTIFACTS_DIR / key
    # Guard against registering before the training cell has written its artifacts.
    assert (model_dir / "model.joblib").exists(), f"Missing model.joblib in {model_dir}"
    assert (model_dir / "metrics.json").exists(), f"Missing metrics.json in {model_dir}"

    # Metrics are read back from disk rather than from the in-memory `results`,
    # so this cell only needs the saved artifacts.
    metrics_full = json.loads((model_dir / "metrics.json").read_text(encoding="utf-8"))

    register_model(
        model_key=key,
        model_dir=model_dir,
        metrics=metrics_full,
        description=f"{label}. RandomForestClassifier. Target={target_col}. Subject-wise split."
    )

2026-01-04 02:25:46,656 INFO: Closing external client and cleaning up certificates.
2026-01-04 02:25:46,662 INFO: Connection closed.
2026-01-04 02:25:46,664 INFO: Initializing external client
2026-01-04 02:25:46,664 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-04 02:25:47,922 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3208
Logged in to Hopsworks, got Model Registry.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_energy_modeA_randomforest/1
Registered: mcphases_energy_modeA_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_energy_modeB_randomforest/1
Registered: mcphases_energy_modeB_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeA…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_mood_modeA_randomforest/1
Registered: mcphases_mood_modeA_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeB…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_mood_modeB_randomforest/1
Registered: mcphases_mood_modeB_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']
