In [1]:
import os
import sys
# Absolute path of the parent of the current working directory; it is put on
# sys.path so that packages living there can be imported by later cells.
root_dir = os.path.abspath('../')
# Guarded append: re-running this cell must not keep adding duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

import seaborn as sns
import matplotlib.pyplot as plt

# --- Global style setup ---
sns.set_theme(style="white", context="notebook")

# Sequential green palette ordered dark -> light.
# Seaborn's "Greens" colormap runs light -> dark, so the reversed variant
# ("Greens_r") is requested to get the order this notebook intends; it also
# keeps the first plotted series clearly visible on the white background.
green_palette = sns.color_palette("Greens_r", n_colors=6)  # can adjust n_colors if needed
sns.set_palette(green_palette)

plt.rcParams.update({
    "font.size": 12,
    "axes.labelweight": "semibold",
    "axes.titleweight": "bold",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "legend.fontsize": 11,
    "figure.facecolor": "white",
    "axes.edgecolor": "#333333",
    "axes.linewidth": 0.8,
    # Default line color order: dark -> light green (same palette as above).
    "axes.prop_cycle": plt.cycler("color", green_palette),
})

### Goal

We would like to model

$x_{i,t}=\Lambda f_t + \epsilon_{i,t}$

* $i$ = patient index
* $t$ = time (irregular visits per patient)
* $x_{i,t} \in R^p$ Patient embedding (observed realization of the latent factors)
* $f_t \in R^r$ Shared latent temporal factors (underlying population health states)
* $\Lambda \in R^{p\times r}$ Factor loadings (relationship between embeddings and latent factors)

### Mapping

| Model Element | Interpretation |
|---------------|----------------|
| Observed variables | Daily metrics (Counts by department, age group, diagnosis group) |
| Latent factors | Underlying patient-type intensities that jointly influence those metrics |
| Factor loadings | How strongly each variable loads on each latent patient type |
| Factor dynamics | How those patient types evolve over time (trends, cycles, shocks) |

In [2]:
import pandas as pd

def _load_weekly(relative_path):
    """Read a parquet table under ``root_dir`` and aggregate it to weekly sums.

    Steps:
      1. read ``os.path.join(root_dir, relative_path)`` with ``pd.read_parquet``;
      2. reindex onto every calendar day between the first and last index value;
      3. replace missing values (including the days added in step 2) with 0;
      4. resample with the ``'W'`` frequency and sum each bin.

    Parameters
    ----------
    relative_path : str
        Path of the parquet file, relative to ``root_dir``.

    Returns
    -------
    pandas.DataFrame
        One row per weekly bin containing the summed values.

    Notes
    -----
    Assumes the stored index is datetime-like (needed by ``pd.date_range`` and
    ``resample``) -- TODO confirm against the files.
    """
    daily = pd.read_parquet(os.path.join(root_dir, relative_path))
    full_range = pd.date_range(daily.index.min(), daily.index.max(), freq='D')
    # Days absent from the file become all-NaN rows, which are then zero-filled.
    daily = daily.reindex(full_range).fillna(0)
    return daily.resample('W').sum()


# Dynamic Factor Model - engineered data (weekly)
dfm_data = _load_weekly("data/processed/hana_ent/dfm_daily.parquet")

# Supply data (weekly)
supply = _load_weekly("./data/processed/hana_ent/supply.parquet")

# Calendar characteristics (TODO: nothing is loaded for this yet)


In [58]:
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor
import numpy as np

from joblib import Parallel, delayed
from models.data_container.rolling_window_feeder import RollingWindowFeederW

import warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)

from models.data_container.rolling_window_feeder import RollingWindowFeederW

# --- Rolling-window Dynamic Factor Model forecast ---------------------------
# Every window: standardise X, fit a DynamicFactor model on the training rows,
# store the smoothed factors / loadings / raw data, and forecast the test rows.

# Configuration (these values used to be repeated inline).
WINDOW_SIZE_WEEKS = 104     # forwarded to RollingWindowFeederW(window_size_weeks=...)
FORWARD_WINDOW_WEEKS = 1    # forwarded to RollingWindowFeederW(forward_window_weeks=...)
K_FACTORS = 3               # number of latent factors
FACTOR_ORDER = 1            # factor_order passed to DynamicFactor
NOISE_SCALE = 1e-8          # scale of the jitter added to the scaled matrices
NOISE_SEED = 0              # fixed seed so the jitter is reproducible across runs

noise_rng = np.random.default_rng(NOISE_SEED)

rw = RollingWindowFeederW(
    window_size_weeks=WINDOW_SIZE_WEEKS,
    forward_window_weeks=FORWARD_WINDOW_WEEKS,
)
feeds = list(rw.feed(dfm_data.reset_index(), 'index'))

prediction_dir = os.path.join(root_dir, "data/prediction/hana_ent/dfm_v2")
factor_names = [f"Factor{k + 1}" for k in range(K_FACTORS)]
loading_names = [f"Loading{k + 1}" for k in range(K_FACTORS)]

accumulated_x_test, accumulated_x_true = [], []

for i, (train_df, test_df, train_range, test_range) in enumerate(feeds):
    iter_dir = os.path.join(prediction_dir, f"iter_{i:03d}")
    os.makedirs(iter_dir, exist_ok=True)

    # X
    # Standardise with statistics of the training window only; the test rows
    # are transformed with the same fitted scaler.
    scaler = StandardScaler()
    scaled_train_df = scaler.fit_transform(train_df)
    scaled_test_df = scaler.transform(test_df)

    # Add a tiny seeded jitter - the current matrix is heavily zero-valued.
    scaled_train_df += NOISE_SCALE * noise_rng.standard_normal(scaled_train_df.shape)
    scaled_test_df += NOISE_SCALE * noise_rng.standard_normal(scaled_test_df.shape)

    # y
    # Half-open selection [start, end). This mirrors inclusive='left' used for
    # the factor index below and, unlike `.loc[start:end].iloc[:-1]`, does not
    # discard a valid row when `end` is not itself a label in `supply`.
    # NOTE(review): assumes both ranges are half-open -- confirm against
    # RollingWindowFeederW.
    supply_dates = supply.index
    y_train = supply.loc[(supply_dates >= train_range[0]) & (supply_dates < train_range[-1])]
    y_true = supply.loc[(supply_dates >= test_range[0]) & (supply_dates < test_range[-1])]

    # Fail fast (before the expensive fit) and say which window is misaligned.
    assert y_train.shape[0] == scaled_train_df.shape[0], (
        f"iter {i:03d}: supply has {y_train.shape[0]} rows in train range "
        f"[{train_range[0]}, {train_range[-1]}) but X has {scaled_train_df.shape[0]}"
    )
    assert y_true.shape[0] == scaled_test_df.shape[0], (
        f"iter {i:03d}: supply has {y_true.shape[0]} rows in test range "
        f"[{test_range[0]}, {test_range[-1]}) but X has {scaled_test_df.shape[0]}"
    )

    # Fit Dynamic Factor Model
    model = DynamicFactor(
        scaled_train_df,
        k_factors=K_FACTORS,
        factor_order=FACTOR_ORDER,
        error_cov_type='diagonal',
    )
    res = model.fit(disp=False)

    # Latent factors: smoothed estimates, one row per training week.
    factors = pd.DataFrame(
        res.factors.smoothed.T,
        index=pd.date_range(train_range[0], train_range[-1], inclusive='left', freq='W'),
        columns=factor_names,
    )

    # Loadings: the first n_series * K_FACTORS entries of res.params, reshaped
    # to (series, factor).
    # NOTE(review): relies on statsmodels listing the loading parameters first,
    # grouped by observed series -- confirm in the DynamicFactor documentation.
    n_series = train_df.shape[1]
    loadings = pd.DataFrame(
        res.params[:n_series * K_FACTORS].reshape((n_series, K_FACTORS)),
        columns=loading_names,
        index=train_df.columns,
    )

    # Forecast projected X_t from latent factors. The horizon follows the
    # number of test rows because the forecast is indexed by test_df.index.
    forecast = res.get_forecast(steps=len(test_df))
    scaled_X_test_forecast = forecast.predicted_mean
    X_test_forecast = scaler.inverse_transform(scaled_X_test_forecast)

    # === Save the factors, loadings & original data ===
    factors.to_csv(os.path.join(iter_dir, "factors.csv"))
    loadings.to_csv(os.path.join(iter_dir, "loadings.csv"))
    train_df.to_csv(os.path.join(iter_dir, "train_data.csv"))
    test_df.to_csv(os.path.join(iter_dir, "test_data.csv"))

    # Plot the latent factors (explicit figure so it can be closed reliably).
    fig, ax = plt.subplots(figsize=(10, 6))
    for factor_name in factor_names:
        ax.plot(factors.index, factors[factor_name], label=factor_name)
    ax.set_title(f"Latent Factors (Iteration {i:03d})")
    ax.set_xlabel("Date")
    ax.set_ylabel("Factor value")
    ax.legend()
    fig.tight_layout()
    fig.savefig(os.path.join(iter_dir, "factors_plot.png"))
    plt.close(fig)

    # Accumulate the forecast (negative values are clipped to 0) and the truth.
    accumulated_x_test.append(
        pd.DataFrame(
            X_test_forecast,
            index=test_df.index,
            columns=test_df.columns,
        ).clip(lower=0)
    )
    accumulated_x_true.append(test_df)

x_tests = pd.concat(accumulated_x_test)
x_trues = pd.concat(accumulated_x_true)

# Save them
x_tests.to_csv(os.path.join(root_dir, "data/prediction/hana_ent/dfm_v2/x_tests.csv"))
x_trues.to_csv(os.path.join(root_dir, "data/prediction/hana_ent/dfm_v2/x_trues.csv"))

# MSE per column (squared error averaged over all forecast rows)
x_tests.sub(x_trues, axis=0).pow(2).mean()



AssertionError: 

In [None]:
# Stack the per-iteration forecasts and the matching observed rows.
x_tests = pd.concat(accumulated_x_test)
x_trues = pd.concat(accumulated_x_true)

# Write both tables to CSV under root_dir.
for frame, rel_path in (
    (x_tests, "data/prediction/hana_ent/dfm_v2/x_tests.csv"),
    (x_trues, "data/prediction/hana_ent/dfm_v2/x_trues.csv"),
):
    frame.to_csv(os.path.join(root_dir, rel_path))

# Per-column mean of the squared forecast errors (shown as the cell output).
squared_errors = x_tests.sub(x_trues, axis=0).pow(2)
squared_errors.mean()

male                        237270.014741
female                      220949.173395
age_0_10                       934.594253
age_10_20                      100.000000
age_20_30                    42258.593832
age_30_40                    12842.365922
age_40_50                    69456.035220
age_50_60                    59742.481154
age_60_70                     6551.731774
age_70_80                       67.433007
age_80_plus                   6168.127786
dep_Dermatology                  0.000000
dep_Ear, Nose and Throat    866425.731572
dep_Internal Medicine          676.000000
dep_Radiology                    0.000000
dep_Unknown                      5.390550
A02                              0.000000
A03                              0.633344
A04                              0.474649
A10                              0.001314
A11                              0.000000
B02                            101.671606
B05                           4701.585884
C01                              0