In [41]:
#bootstrapping the environment - repo root, .env, settings

import sys
from pathlib import Path
from dotenv import load_dotenv
import os

# 1) Find repo root (folder that contains .env or .git)
def find_repo_root(start: Path | None = None) -> Path:
    p = (start or Path.cwd()).resolve()
    for _ in range(25):
        if (p / ".env").exists() or (p / ".git").exists() or (p / "pyproject.toml").exists():
            return p
        if p.parent == p:
            break
        p = p.parent
    return (start or Path.cwd()).resolve()

root_dir = find_repo_root()
print("Repo root:", root_dir)

# 2) Ensure imports work
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))
    print("Added to PYTHONPATH:", root_dir)

# 3) Load .env FIRST
load_dotenv(root_dir / ".env")

# 4) Sanity checks
assert os.getenv("HOPSWORKS_API_KEY"), "Missing HOPSWORKS_API_KEY in .env"
assert os.getenv("DATA_PATH"), \
    "Missing data path in .env (set DATA_PATH=... recommended)"

print("Loaded .env successfully")

# 5) Now import settings (after .env is loaded)
from mlfs.mcphases.config import settings
print("DATA_PATH resolved to:", settings.DATA_PATH)

Repo root: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project
Loaded .env successfully
DATA_PATH resolved to: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/data/mcphases/raw


In [42]:
# loadig the backfilled master parquet

import pandas as pd
import numpy as np
from pathlib import Path

MASTER_PATH = Path(root_dir) / "data_cache" / "mcphases_master_daily.parquet"
#/path-to-root-project/data_cache/mcphases_master_daily.parquet

df = pd.read_parquet(MASTER_PATH)

print("Loaded:", MASTER_PATH)
print("Shape:", df.shape)
display(df.head(3))

Loaded: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/data_cache/mcphases_master_daily.parquet
Shape: (5659, 74)


Unnamed: 0,subject_id,study_interval,is_weekend,day_in_study,phase,lh,estrogen,pdg,flow_volume,flow_color,...,height_and_weight__weight_2024,subject_info__birth_year,subject_info__gender,subject_info__ethnicity,subject_info__education,subject_info__sexually_active,subject_info__self_report_menstrual_health_literacy,subject_info__age_of_first_menarche,sleep_duration_minutes,sleep_duration_hours
0,1,2022,True,1,Follicular,2.9,94.2,,Not at all,Not at all,...,,1999,Woman,White,"Some university/ post-secondary, no degree",Yes,,14,617.0,10.283333
1,1,2022,False,2,Follicular,1.2,226.3,,Not at all,Not at all,...,,1999,Woman,White,"Some university/ post-secondary, no degree",Yes,,14,258.5,4.308333
2,1,2022,False,3,Follicular,3.5,276.8,,Not at all,Not at all,...,,1999,Woman,White,"Some university/ post-secondary, no degree",Yes,,14,530.0,8.833333


In [43]:
#defining pks, and sorting

KEYS = ["subject_id", "day_in_study"]
df = df.sort_values(KEYS).reset_index(drop=True)

assert df[KEYS].isna().sum().sum() == 0, "Missing keys"
assert df.duplicated(subset=KEYS).sum() == 0, "Duplicate subject-day rows still exist"
print("Keys OK.")

Keys OK.


In [44]:
# What unique values exist (including weird ones)?
print("Unique moodswing values (top 30):")
display(df["moodswing"].astype("string").value_counts(dropna=False).head(30))

print("Unique fatigue values (top 30):")
display(df["fatigue"].astype("string").value_counts(dropna=False).head(30))

Unique stress values (top 30):


stress
<NA>               2332
Moderate           1048
High                610
Low                 567
Not at all          436
Very Low/Little     369
Very High           290
2                     4
3                     2
1                     1
Name: count, dtype: Int64

Unique fatigue values (top 30):


fatigue
<NA>               2328
Moderate            936
High                685
Low                 552
Very Low/Little     473
Not at all          444
Very High           241
Name: count, dtype: Int64

In [45]:
#definig targets (mood + energy/fatigue) from existing cols
#mappig ordinal target cats to numbers and then to 3 classes

LEVELS_6 = ["Not at all", "Very Low/Little", "Low", "Moderate", "High", "Very High"]
LEVEL_MAP_6 = {lvl: i for i, lvl in enumerate(LEVELS_6)}  # 0..5

MAX6 = len(LEVELS_6) - 1  # 5

def map_ordinal_6(series: pd.Series) -> pd.Series:
    s = series.astype("string").str.strip()
    s = s.str.replace(r"\s+", " ", regex=True)
    return s.map(LEVEL_MAP_6).astype("float")


# fatigue -> energy (invert so higher = better)
df["fatigue_num"] = map_ordinal_6(df["fatigue"])
df["y_energy_num"] = MAX6 - df["fatigue_num"]

# moodswing -> mood stability (invert so higher = better)
df["moodswing_num"] = map_ordinal_6(df["moodswing"])
df["y_mood_stability_num"] = MAX6 - df["moodswing_num"]

def to_3class_from_6(x):
    # 0-1 low, 2-3 mid, 4-5 high
    return pd.cut(x, bins=[-0.1, 1.5, 3.5, 5.1], labels=[0, 1, 2]).astype("float")

df["y_energy_cls3"] = to_3class_from_6(df["y_energy_num"])
df["y_mood_stability_cls3"] = to_3class_from_6(df["y_mood_stability_num"])

#rows that have no ground-truth labels/targets
print("Missing fatigue:", df["fatigue"].isna().mean())
print("Missing y_energy_cls3:", df["y_energy_cls3"].isna().mean())
print("Missing moodswing:", df["moodswing"].isna().mean())
print("Missing y_mood_stability_cls3:", df["y_mood_stability_cls3"].isna().mean())

#not losing any extra rows, mapping is perfect

Missing fatigue: 0.4113801024916063
Missing y_energy_cls3: 0.4113801024916063
Missing moodswing: 0.41332390881781234
Missing y_mood_stability_cls3: 0.41332390881781234


In [46]:
# quantifying how many target labeled rows there are
n_total = len(df)
n_energy = df["y_energy_cls3"].notna().sum()
n_mood = df["y_mood_stability_cls3"].notna().sum()
n_both = df[["y_energy_cls3","y_mood_stability_cls3"]].notna().all(axis=1).sum()

print("Total rows:", n_total)
print("Energy labeled rows:", n_energy, f"({n_energy/n_total:.1%})")
print("Mood stability labeled rows:", n_mood, f"({n_mood/n_total:.1%})")
print("Both labeled rows:", n_both, f"({n_both/n_total:.1%})")

#will be training them separetly so no need for both targets to exist per row 
#DECISION POINT == 2 models (energy, mood stability predictors) per mode (mode A = only wearables as inputs, mode B = including lagged targets), 
    # == 3-class classigication for higher accuracy due to smaller dataset limitations

Total rows: 5659
Energy labeled rows: 3331 (58.9%)
Mood stability labeled rows: 3320 (58.7%)
Both labeled rows: 3320 (58.7%)


In [47]:
#mpping ordinal selfreport feature cols
ORDINAL_FEATURE_COLS = [c for c in ["cramps", "headaches", "sleepissue", "stress", "flow_volume"] if c in df.columns]

for c in ORDINAL_FEATURE_COLS:
    df[c + "_num"] = map_ordinal_6(df[c])

print("Created ordinal numeric features:", [c + "_num" for c in ORDINAL_FEATURE_COLS])

Created ordinal numeric features: ['cramps_num', 'headaches_num', 'sleepissue_num', 'stress_num', 'flow_volume_num']


In [53]:
#universal input features defining

KEYS = ["subject_id", "day_in_study"]

FEATURES_BASE = [
    #calendar
    "phase", "is_weekend",

    # strict wearables-friendly numeric features
    "sleep_duration_minutes",
    "resting_heart_rate__value",
]

#ordinal -> num alr
FEATURES_SELFREPORT = [
    "cramps_num",
    "stress_num",
    "headaches_num",
    "sleepissue_num",
]

FEATURES_MODE_A = [c for c in (FEATURES_BASE + FEATURES_SELFREPORT) if c in df.columns]

# ensure keys not duplicated
FEATURES_MODE_A = [c for c in FEATURES_MODE_A if c not in KEYS]
FEATURES_MODE_A = list(dict.fromkeys(FEATURES_MODE_A))

print("Mode A STRICT features:", FEATURES_MODE_A)

Mode A STRICT features: ['phase', 'is_weekend', 'sleep_duration_minutes', 'resting_heart_rate__value', 'cramps_num', 'stress_num', 'headaches_num', 'sleepissue_num']


In [49]:
#MODE A DATASET (only wearables as features), less accurate

# Energy dataset
df_energy_A = df[KEYS + FEATURES_MODE_A + ["y_energy_cls3"]].dropna(subset=["y_energy_cls3"]).copy()
df_energy_A["y_energy_cls3"] = df_energy_A["y_energy_cls3"].astype(int)

# Mood stability dataset (Mode A)
df_mood_A = df[KEYS + FEATURES_MODE_A + ["y_mood_stability_cls3"]].dropna(subset=["y_mood_stability_cls3"]).copy()
df_mood_A["y_mood_stability_cls3"] = df_mood_A["y_mood_stability_cls3"].astype(int)

print("Mode A energy rows:", len(df_energy_A))
print("Mode A mood rows:", len(df_mood_A))

Mode A energy rows: 3331
Mode A mood rows: 3320


In [63]:
#MODE B DATASET (wearables & lagged targets as features), less accurate

LAGS = [1]

# Energy Mode B: add lagged energy labels as features
SELECT_ENERGY = list(dict.fromkeys(KEYS + FEATURES_MODE_A + ["y_energy_cls3"]))
df_energy_B = df.loc[:, SELECT_ENERGY].copy().sort_values(KEYS)
for lag in LAGS:
    df_energy_B[f"lag{lag}_energy"] = df_energy_B.groupby("subject_id")["y_energy_cls3"].shift(lag)

# keep rows where current y exists and lags exist
df_energy_B = df_energy_B.dropna(subset=["y_energy_cls3"] + [f"lag{l}_energy" for l in LAGS]).copy()
df_energy_B["y_energy_cls3"] = df_energy_B["y_energy_cls3"].astype(int)

# Mood Mode B: add lagged mood stability labels as features
SELECT_MOOD = list(dict.fromkeys(KEYS + FEATURES_MODE_A + ["y_mood_stability_cls3"]))
df_mood_B = df.loc[:, SELECT_MOOD].copy().sort_values(KEYS)
for lag in LAGS:
    df_mood_B[f"lag{lag}_mood"] = df_mood_B.groupby("subject_id")["y_mood_stability_cls3"].shift(lag)

df_mood_B = df_mood_B.dropna(subset=["y_mood_stability_cls3"] + [f"lag{l}_mood" for l in LAGS]).copy()
df_mood_B["y_mood_stability_cls3"] = df_mood_B["y_mood_stability_cls3"].astype(int)

print("Mode B energy rows:", len(df_energy_B))
print("Mode B mood rows:", len(df_mood_B))

Mode B energy rows: 3085
Mode B mood rows: 3084


In [66]:
#one hotting only nominal cols, not ordinal

def one_hot_nominal_only(df_in: pd.DataFrame, nominal_cols: list[str]) -> pd.DataFrame:
    out = df_in.copy()
    cols = [c for c in nominal_cols if c in out.columns]
    if cols:
        out = pd.get_dummies(out, columns=cols, dummy_na=True)
    return out

NOMINAL_COLS = ["phase"] #add more if tjere

energy_A_ml = one_hot_nominal_only(df_energy_A, NOMINAL_COLS)
energy_B_ml = one_hot_nominal_only(df_energy_B, NOMINAL_COLS)
mood_A_ml   = one_hot_nominal_only(df_mood_A, NOMINAL_COLS)
mood_B_ml   = one_hot_nominal_only(df_mood_B, NOMINAL_COLS)


OUT_DIR = Path(root_dir) / "data_cache"
OUT_DIR.mkdir(exist_ok=True)

energy_A_ml.to_parquet(OUT_DIR / "mcphases_energy_modeA.parquet", index=False)
energy_B_ml.to_parquet(OUT_DIR / "mcphases_energy_modeB.parquet", index=False)
mood_A_ml.to_parquet(OUT_DIR / "mcphases_mood_modeA.parquet", index=False)
mood_B_ml.to_parquet(OUT_DIR / "mcphases_mood_modeB.parquet", index=False)

print("Saved 4 datasets to:", OUT_DIR)


Saved 4 datasets to: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/data_cache


In [67]:
energy_B_ml

Unnamed: 0,subject_id,day_in_study,is_weekend,sleep_duration_minutes,resting_heart_rate__value,cramps_num,stress_num,headaches_num,sleepissue_num,y_energy_cls3,lag1_energy,phase_Fertility,phase_Follicular,phase_Luteal,phase_Menstrual,phase_nan
1,1,2,False,258.5,80.407307,1.0,3.0,5.0,5.0,0,0.0,False,True,False,False,False
2,1,3,False,530.0,84.686869,1.0,2.0,4.0,5.0,0,0.0,False,True,False,False,False
3,1,4,False,449.0,83.852219,1.0,2.0,1.0,5.0,0,0.0,True,False,False,False,False
4,1,5,False,,0.000000,1.0,2.0,1.0,4.0,0,0.0,True,False,False,False,False
5,1,6,False,376.0,82.077053,1.0,1.0,1.0,3.0,1,0.0,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5550,50,85,True,483.0,58.608020,0.0,4.0,3.0,5.0,1,0.0,False,False,True,False,False
5551,50,86,False,140.5,58.132595,0.0,3.0,2.0,2.0,0,1.0,False,False,True,False,False
5552,50,87,False,523.0,57.626328,0.0,3.0,1.0,2.0,1,0.0,False,False,True,False,False
5553,50,88,False,229.5,56.868725,1.0,3.0,1.0,4.0,1,1.0,False,False,False,True,False
