<a href="https://colab.research.google.com/github/Niraj-82/patient-activity-forecasting/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import json
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor


In [36]:
def parse_time(ts):
    # timestamps come in ISO format with Z
    return datetime.fromisoformat(ts.replace("Z", "+00:00"))

In [37]:
with open("timeseries-data.json") as f:
    step_raw = json.load(f)

with open("categorical-data.json") as f:
    clinical_raw = json.load(f)

In [38]:
rows = []

for r in step_raw:
    if r.get("metric") != "STEPS":
        continue

    day = parse_time(r["start"]).date()
    rows.append({
        "date": day,
        "steps": int(r.get("count", 0))
    })

df_steps = pd.DataFrame(rows)

daily_steps = (
    df_steps.groupby("date", as_index=False)
            .steps.sum()
            .rename(columns={"steps": "daily_steps"})
)

In [39]:
full_days = pd.date_range(
    start=daily_steps["date"].min(),
    end=daily_steps["date"].max(),
    freq="D"
)

daily = (
    daily_steps.set_index("date")
               .reindex(full_days)
               .fillna(0)
               .rename_axis("date")
               .reset_index()
)


In [40]:
daily["dow"] = daily["date"].dt.weekday
daily["week"] = daily["date"].dt.isocalendar().week.astype(int)

daily["lag_1"] = daily["daily_steps"].shift(1)
daily["lag_7"] = daily["daily_steps"].shift(7)
daily["lag_30"] = daily["daily_steps"].shift(30)

daily[["lag_1", "lag_7", "lag_30"]] = (
    daily[["lag_1", "lag_7", "lag_30"]].fillna(0)
)


In [41]:
birth_year = clinical_raw.get("birthYear")
daily["age"] = daily["date"].dt.year - birth_year if birth_year else np.nan

daily["is_female"] = 1 if clinical_raw.get("gender") == "FEMALE" else 0


In [42]:
therapies = clinical_raw.get("therapies", [])
therapy_ids = list(set(t["therapyId"] for t in therapies))

def therapy_features(d):
    out = {}
    active_count = 0

    for tid in therapy_ids:
        active = [
            t for t in therapies
            if t["therapyId"] == tid
            and parse_time(t["startDate"]).date() <= d
            and (t["endDate"] is None or parse_time(t["endDate"]).date() >= d)
        ]
        flag = 1 if active else 0
        out[f"is_on_therapy_{tid}"] = flag
        active_count += flag

    out["active_therapies"] = active_count
    return pd.Series(out)

therapy_df = daily["date"].dt.date.apply(therapy_features)
daily = pd.concat([daily, therapy_df], axis=1)


In [43]:
side_effects = clinical_raw.get("sideEffects", [])

def side_effect_features(d):
    active = [
        s["intensity"]
        for s in side_effects
        if parse_time(s["startDate"]).date() <= d
        and (s["endDate"] is None or parse_time(s["endDate"]).date() >= d)
    ]

    return pd.Series({
        "active_side_effect_count": len(active),
        "max_side_effect_intensity": max(active) if active else 0
    })

daily[["active_side_effect_count", "max_side_effect_intensity"]] = (
    daily["date"].dt.date.apply(side_effect_features)
)


In [44]:
diagnoses = clinical_raw.get("diagnoses", [])
diag_ids = list(set(d["diagnosisOptionsId"] for d in diagnoses))

def diagnosis_flags(d):
    out = {}
    for did in diag_ids:
        active = [
            x for x in diagnoses
            if x["diagnosisOptionsId"] == did
            and parse_time(x["startDate"]).date() <= d
            and (x["endDate"] is None or parse_time(x["endDate"]).date() >= d)
        ]
        out[f"diagnosis_active_{did}"] = 1 if active else 0
    return pd.Series(out)

diag_df = daily["date"].dt.date.apply(diagnosis_flags)
daily = pd.concat([daily, diag_df], axis=1)


In [45]:
events = clinical_raw.get("events", [])
relapses = [
    parse_time(e["startDate"]).date()
    for e in events if e.get("event") == "RELAPSE"
]

def days_since_relapse(d):
    past = [x for x in relapses if x <= d]
    return (d - max(past)).days if past else -1

daily["days_since_relapse"] = daily["date"].dt.date.apply(days_since_relapse)


In [46]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

y = daily["daily_steps"]
split = int(len(y) * 0.8)

train_ts = y.iloc[:split]
test_ts = y.iloc[split:]

baseline = SARIMAX(
    train_ts,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 7),
    enforce_stationarity=False,
    enforce_invertibility=False
)

baseline_fit = baseline.fit(disp=False)
baseline_preds = baseline_fit.forecast(len(test_ts))

mse = mean_squared_error(test_ts, baseline_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_ts, baseline_preds)

print("Baseline RMSE:", round(rmse, 2))
print("Baseline MAE:", round(mae, 2))


Baseline RMSE: 10121.98
Baseline MAE: 7743.9


In [47]:
features = (
    ["dow", "week", "lag_1", "lag_7", "lag_30",
     "age", "is_female",
     "active_therapies",
     "active_side_effect_count",
     "max_side_effect_intensity",
     "days_since_relapse"]
    + list(diag_df.columns)
    + [c for c in therapy_df.columns if c.startswith("is_on_therapy")]
)

X = daily[features]
y = daily["daily_steps"]

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

model = RandomForestRegressor(
    n_estimators=150,
    max_depth=8,
    random_state=42
)

model.fit(X_train, y_train)
preds = model.predict(X_test)

mse_ml = mean_squared_error(y_test, preds)
rmse_ml = np.sqrt(mse_ml)
mae_ml = mean_absolute_error(y_test, preds)

print("ML RMSE:", round(rmse_ml, 2))
print("ML MAE:", round(mae_ml, 2))


ML RMSE: 7850.26
ML MAE: 5568.53


I tried simpler linear models first, but they struggled with sudden drops during relapse periods.
Random Forest handled nonlinear effects better without being too hard to explain.

In [48]:
pd.Series(
    model.feature_importances_,
    index=features
).sort_values(ascending=False).head(10)


Unnamed: 0,0
lag_1,0.671515
lag_7,0.112028
week,0.081437
lag_30,0.07853
dow,0.031947
age,0.024542
is_female,0.0
active_therapies,0.0
active_side_effect_count,0.0
max_side_effect_intensity,0.0


In [50]:
history = daily.tail(30).copy()
future_rows = []

for i in range(365):
    row = history.iloc[-1].copy()
    row["date"] += pd.Timedelta(days=1)
    row["dow"] = row["date"].weekday()
    row["week"] = row["date"].week

    X_row = pd.DataFrame([row[features]], columns=features)
    pred = max(0, int(model.predict(X_row)[0]))


    row["daily_steps"] = pred
    history = pd.concat([history, row.to_frame().T])

    future_rows.append({
        "Date": row["date"].date(),
        "Predicted_Steps": pred,
        "Trend_Component": int(row["lag_7"]),
        "Exogenous_Impact": int(pred - row["lag_7"])
    })

forecast_df = pd.DataFrame(future_rows)
forecast_df.head()


Unnamed: 0,Date,Predicted_Steps,Trend_Component,Exogenous_Impact
0,2025-10-22,6113,579,5534
1,2025-10-23,6005,579,5426
2,2025-10-24,5686,579,5107
3,2025-10-25,5145,579,4566
4,2025-10-26,4992,579,4413
