# Synthefy Inference Example

This notebook demonstrates how to run synthesis and forecast inference with synthefy models.

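**Examples included:**
1. Basic synthesis (full scenario simulation)
2. Basic forecast
3. Forecast with only specific columns known in the forecast horizon (a what-if scenario: all other timeseries columns are zeroed out over the horizon)
4. Forecast with a single column known in the forecast horizon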

In [1]:
import random
import sys
import time
from pathlib import Path
from typing import List, Optional
import os
import numpy as np
import pandas as pd
import torch

# Runtime configuration comes from the environment so that neither a credential
# nor a machine-specific path is stored in the notebook.
#
# LICENSE_KEY is a secret: export it in the shell that launches Jupyter (or
# inject it from a secrets manager) and never paste it into a cell.
if not os.environ.get("LICENSE_KEY"):
    print(
        "WARNING: LICENSE_KEY is not set; export it before running the inference cells.",
        file=sys.stderr,
    )

# Base directories can be overridden from the environment; the fallbacks live
# under the current user's home directory instead of one user's absolute path.
os.environ.setdefault("SYNTHEFY_DATASETS_BASE", str(Path.home() / "data"))
os.environ.setdefault(
    "SYNTHEFY_PACKAGE_BASE", str(Path.home() / "synthefy-package-external")
)



# Make the local `api` directory importable so the `models` / `services`
# imports that follow can be resolved.
# NOTE: the path is resolved against the current working directory, so the
# notebook has to be started from the directory that contains `api/`.
API_DIR = Path(".").resolve() / "api"
# Only insert when the entry is not already first; re-running this cell would
# otherwise stack a new duplicate at the front of sys.path every time.
if sys.path[:1] != [str(API_DIR)]:
    sys.path.insert(0, str(API_DIR))

from models import DataFrameModel, OneTimeSeries
from services.config_loader import get_config_loader
from services.demo_synthesis_service import DemoSynthesisService

# Constants
# Dataset whose configs are loaded and the default `dataset_name` for run_inference.
DEFAULT_DATASET = "oura_subset"
# Default `model_type` for run_inference ("flexible" or "standard").
DEFAULT_MODEL_TYPE = "flexible"
# Default `num_samples` forwarded to the synthesis service.
DEFAULT_NUM_SAMPLES = 20
# Default forecast horizon, in rows, for forecast tasks.
DEFAULT_FORECAST_LENGTH = 96
# Default number of leading input points kept as ground truth for synthesis.
DEFAULT_GROUND_TRUTH_PREFIX = 0
# Seed used by the example cells for reproducible runs.
DEFAULT_SEED = 123
# Example input data, looked up next to the current working directory.
FAKE_DATA_PATH = Path(".").resolve() / "fake_oura_subset_data.parquet"

# This import was indented at top level, which is an IndentationError that
# prevents the whole cell from running. It is not referenced by any cell
# visible in this notebook, so it is treated as optional rather than aborting
# setup on machines without the package.
try:
    import pynvml  # type: ignore[import]
except ImportError:
    pynvml = None




In [2]:
def set_seed(seed: Optional[int] = None) -> int:
    """Seed Python, NumPy and torch RNGs and return the seed that was applied.

    Args:
        seed: Seed to apply. When None, one is derived from the current time in
            milliseconds plus a random offset.

    Returns:
        The seed actually used, so callers can log or reuse it.
    """
    if seed is None:
        millis = int(time.time() * 1000)
        seed = millis % 100000 + random.randint(0, 10000)

    # Same seed for every CPU-side generator, in the original order.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    # CUDA generators are only seeded when a GPU is actually available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    return seed


def load_data(data_path: Path) -> pd.DataFrame:
    """Read a parquet file into a DataFrame.

    Args:
        data_path: Location of the parquet file.

    Returns:
        The file contents as returned by ``pd.read_parquet``.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
    """
    if data_path.exists():
        return pd.read_parquet(data_path)
    raise FileNotFoundError(f"Data file not found: {data_path}")


def timeseries_to_dataframe(timeseries_list: List[OneTimeSeries]) -> pd.DataFrame:
    """Build a DataFrame with one column per series in ``timeseries_list``.

    Each column is named after the series' ``name`` and filled from its
    ``values``; entries that are None become NaN.
    """
    columns = {}
    for series in timeseries_list:
        columns[series.name] = [
            np.nan if value is None else value for value in series.values
        ]
    return pd.DataFrame(columns)

In [3]:
def keep_only_columns_for_forecast(
    df: pd.DataFrame,
    columns_to_keep: List[str],
    forecast_length: int,
    timeseries_columns: List[str],
) -> pd.DataFrame:
    """
    Keep only specified columns, zero out all other timeseries columns for the forecast horizon.

    Use this when you want to forecast using only certain columns as known future values.
    All other timeseries columns are set to 0.0 in the last ``forecast_length`` rows.
    The input frame is never modified; a copy is returned.

    Args:
        df: Input DataFrame
        columns_to_keep: List of column names to keep (not zero out)
        forecast_length: Number of rows from the end to zero out. 0 leaves the
            data unchanged; values larger than ``len(df)`` zero out every row.
        timeseries_columns: List of all timeseries column names. Names that are
            not present in ``df`` are ignored.

    Returns:
        DataFrame with only specified columns having values in the forecast horizon

    Raises:
        ValueError: If ``forecast_length`` is negative.
    """
    if forecast_length < 0:
        raise ValueError(
            f"forecast_length must be non-negative, got {forecast_length}"
        )

    masked = df.copy()
    kept = set(columns_to_keep)

    # Positional start of the horizon. Slicing by position instead of
    # ``df.index[-forecast_length:]`` makes forecast_length == 0 a no-op
    # (``[-0:]`` would select every row) and stays correct when index labels
    # repeat, where a label-based lookup would also hit earlier rows.
    horizon_start = max(len(masked) - forecast_length, 0)

    for col in timeseries_columns:
        if col in kept or col not in masked.columns:
            continue
        masked.iloc[horizon_start:, masked.columns.get_loc(col)] = 0.0
    return masked

In [4]:
def run_inference(
    task: str,
    df: pd.DataFrame,
    dataset_name: str = DEFAULT_DATASET,
    model_type: str = DEFAULT_MODEL_TYPE,
    num_samples: int = DEFAULT_NUM_SAMPLES,
    forecast_length: int = DEFAULT_FORECAST_LENGTH,
    ground_truth_prefix: int = DEFAULT_GROUND_TRUTH_PREFIX,
    columns_to_keep: Optional[List[str]] = None,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Run synthesis or forecast inference with optional column masking.

    Args:
        task: "synthesis" or "forecast"
        df: Input DataFrame (one window)
        dataset_name: Dataset name for config loading
        model_type: "flexible" or "standard"
        num_samples: Number of synthesis runs to average (forwarded unchanged
            to the service)
        forecast_length: For forecast, number of time steps to predict
        ground_truth_prefix: For synthesis, keep first N points from input
        columns_to_keep: For forecast, only keep these columns (zero out all
            other timeseries columns). Ignored for synthesis.
        seed: Random seed. When None the RNGs are not reseeded and the current
            global RNG state is used as-is.

    Returns:
        DataFrame with synthetic/forecasted time series

    Raises:
        ValueError: If ``task`` is neither "synthesis" nor "forecast".
    """
    # Reject unknown tasks up front; previously anything other than "forecast"
    # (including typos such as "Forecast") silently ran the synthesis branch.
    if task not in ("synthesis", "forecast"):
        raise ValueError(f"task must be 'synthesis' or 'forecast', got {task!r}")

    if seed is not None:
        set_seed(seed)

    # Timeseries column names come from the dataset config; they are consumed
    # only by the masking step below.
    config_loader = get_config_loader(dataset_name)
    timeseries_cols = config_loader.get_required_columns().timeseries

    # Apply column masking if specified (only keep specified columns, zero out others)
    if columns_to_keep is not None and task == "forecast":
        df = keep_only_columns_for_forecast(df, columns_to_keep, forecast_length, timeseries_cols)

    # Initialize service
    service = DemoSynthesisService(
        dataset_name=dataset_name,
        model_type=model_type,
        task_type=task,
    )

    # Convert to DataFrameModel: one plain Python list per column
    columns = {col: df[col].tolist() for col in df.columns}
    data_model = DataFrameModel(columns=columns)

    # Run inference; the two tasks differ only in their task-specific keyword
    if task == "forecast":
        result = service.generate(
            data=data_model,
            num_samples=num_samples,
            forecast_length=forecast_length,
        )
    else:
        result = service.generate(
            data=data_model,
            num_samples=num_samples,
            ground_truth_prefix_length=ground_truth_prefix,
        )

    return timeseries_to_dataframe(result)

## Load Data and Config

In [5]:
# Seed every RNG first so the cells below are reproducible
seed = set_seed(DEFAULT_SEED)
print(f"Using seed: {seed}")

# Dataset configuration: window length and the required column groups
config_loader = get_config_loader(DEFAULT_DATASET)
window_size = config_loader.get_window_size()
required_cols = config_loader.get_required_columns()

print(f"Window size: {window_size}")
print(f"Timeseries columns: {required_cols.timeseries}")

# Read the example data and trim it to a single window when it is longer
df = load_data(FAKE_DATA_PATH)
if len(df) > window_size:
    df = df.iloc[:window_size].copy()
print(f"Data shape: {df.shape}")

[32m2026-01-19 05:50:24.233[0m | [1mINFO    [0m | [36mservices.config_loader[0m:[36m_load_configs[0m:[36m71[0m - [1mLoading preprocessing config from: /home/raimi/synthefy-package-external/examples/configs/preprocessing_configs/config_oura_subset_preprocessing.json[0m
[32m2026-01-19 05:50:24.234[0m | [1mINFO    [0m | [36mservices.config_loader[0m:[36m_load_configs[0m:[36m77[0m - [1mLoading synthesis config from: /home/raimi/synthefy-package-external/examples/configs/synthesis_configs/config_oura_subset_synthesis.yaml[0m


Using seed: 123
Window size: 192
Timeseries columns: ['average_hrv', 'lowest_heart_rate', 'age_cva_diff']
Data shape: (192, 15)


## Example 1: Basic Synthesis (Full Scenario Simulation)

In [6]:
# Synthesis over the loaded window, seeded for a reproducible result
synthesis_result = run_inference(
    df=df,
    task="synthesis",
    seed=DEFAULT_SEED,
    num_samples=DEFAULT_NUM_SAMPLES,
)
print(f"Synthesis result: {synthesis_result.shape}")
synthesis_result.head()

[32m2026-01-19 05:50:24.265[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m126[0m - [1mInitializing DemoSynthesisService for dataset: oura_subset, model_type: flexible, task_type: synthesis[0m
[32m2026-01-19 05:50:24.265[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m130[0m - [1mModel path: /home/raimi/data/training_logs/oura_subset/Time_Series_Diffusion_Training/synthesis_oura_subset_flexible/checkpoints/best_model.ckpt[0m


Checking license key!
License key check completed successfully.


[32m2026-01-19 05:50:24.688[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m150[0m - [1mLoading scalers and encoders for dataset: oura_subset[0m
[32m2026-01-19 05:50:24.689[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m158[0m - [1mLoaded encoders: ['onehot'][0m
[32m2026-01-19 05:50:24.691[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36mvalidate_config[0m:[36m1071[0m - [1mTrain stride: 32, Val stride: 32, Test stride: 32[0m
[32m2026-01-19 05:50:24.692[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36m__post_init__[0m:[36m764[0m - [1mWindow size: 192[0m
[32m2026-01-19 05:50:24.692[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36m__post_init__[0m:[36m765[0m - [1mStride: 32[0m
[32m2026-01-19 05:50:24.692[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.pr

Synthesis result: (192, 3)


Unnamed: 0,average_hrv_synthetic,lowest_heart_rate_synthetic,age_cva_diff_synthetic
0,46.5364,59.517017,4.961435
1,48.398342,60.349331,4.313553
2,42.89537,59.128708,4.225557
3,47.66917,61.21698,4.730634
4,46.187057,57.650761,4.729599


## Example 2: Basic Forecast

In [7]:
# Plain forecast with a 50-step horizon and no column masking
forecast_result = run_inference(
    df=df,
    task="forecast",
    seed=DEFAULT_SEED,
    num_samples=DEFAULT_NUM_SAMPLES,
    forecast_length=50,
)
print(f"Forecast result: {forecast_result.shape}")
forecast_result.head()

[32m2026-01-19 05:50:38.638[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m126[0m - [1mInitializing DemoSynthesisService for dataset: oura_subset, model_type: flexible, task_type: forecast[0m
[32m2026-01-19 05:50:38.639[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m130[0m - [1mModel path: /home/raimi/data/training_logs/oura_subset/Time_Series_Diffusion_Training/synthesis_oura_subset_flexible/checkpoints/best_model.ckpt[0m
[32m2026-01-19 05:50:38.641[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m150[0m - [1mLoading scalers and encoders for dataset: oura_subset[0m
[32m2026-01-19 05:50:38.642[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m158[0m - [1mLoaded encoders: ['onehot'][0m
[32m2026-01-19 05:50:38.644[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.prepr

Forecast result: (192, 3)


Unnamed: 0,average_hrv_synthetic,lowest_heart_rate_synthetic,age_cva_diff_synthetic
0,51.836437,50.716801,0.081839
1,59.190922,58.026897,-2.004374
2,48.882671,53.106018,1.481648
3,48.253414,56.812786,-1.026426
4,49.902901,61.765652,-0.457199


## Example 3: Forecast with Only Specific Columns

Simulate a scenario where only some columns are known in the forecast horizon.
All other timeseries columns are zeroed out for the last `forecast_length` rows.

In [8]:
# Columns that keep their values over the forecast horizon; every other
# timeseries column is zeroed out there
columns_to_keep = ["average_hrv", "lowest_heart_rate"]
forecast_length = 50

# Apply the mask by hand so its effect can be inspected before running inference
df_partial = keep_only_columns_for_forecast(
    df=df,
    columns_to_keep=columns_to_keep,
    forecast_length=forecast_length,
    timeseries_columns=required_cols.timeseries,
)

# First timeseries column that is not in the keep-list
zeroed_col = [name for name in required_cols.timeseries if name not in columns_to_keep][0]
print(f"Columns to keep: {columns_to_keep}")
print(f"Example zeroed column: {zeroed_col}")
print("\nOriginal values (last 5 rows):")
print(df[zeroed_col].tail())
print("\nAfter zeroing out:")
print(df_partial[zeroed_col].tail())

Columns to keep: ['average_hrv', 'lowest_heart_rate']
Example zeroed column: age_cva_diff

Original values (last 5 rows):
187   -0.006161
188    5.103895
189    1.003879
190    2.232057
191    3.500400
Name: age_cva_diff, dtype: float64

After zeroing out:
187    0.0
188    0.0
189    0.0
190    0.0
191    0.0
Name: age_cva_diff, dtype: float64


In [9]:
# Forecast where run_inference applies the masking itself: only
# `columns_to_keep` retain their values over the horizon
forecast_partial = run_inference(
    df=df,
    task="forecast",
    columns_to_keep=columns_to_keep,
    forecast_length=forecast_length,
    seed=DEFAULT_SEED,
    num_samples=DEFAULT_NUM_SAMPLES,
)
print(f"Forecast with only {columns_to_keep}: {forecast_partial.shape}")
forecast_partial.head()

[32m2026-01-19 05:50:47.390[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m126[0m - [1mInitializing DemoSynthesisService for dataset: oura_subset, model_type: flexible, task_type: forecast[0m
[32m2026-01-19 05:50:47.390[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m130[0m - [1mModel path: /home/raimi/data/training_logs/oura_subset/Time_Series_Diffusion_Training/synthesis_oura_subset_flexible/checkpoints/best_model.ckpt[0m
[32m2026-01-19 05:50:47.392[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m150[0m - [1mLoading scalers and encoders for dataset: oura_subset[0m
[32m2026-01-19 05:50:47.393[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m158[0m - [1mLoaded encoders: ['onehot'][0m
[32m2026-01-19 05:50:47.395[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.prepr

Forecast with only ['average_hrv', 'lowest_heart_rate']: (192, 3)


Unnamed: 0,average_hrv_synthetic,lowest_heart_rate_synthetic,age_cva_diff_synthetic
0,51.836437,50.716801,0.081839
1,59.190922,58.026897,-2.004374
2,48.882671,53.106018,1.481648
3,48.253414,56.812786,-1.026426
4,49.902901,61.765652,-0.457199


## Example 4: Forecast with Single Column Only

Forecast with minimal information: only one target column keeps its values in the forecast horizon. (The first rows shown below are identical to the regular forecast from Example 2.)

In [10]:
# The single column that keeps its values over the forecast horizon
target_column = "average_hrv"

print(f"All timeseries columns: {required_cols.timeseries}")
print(f"Keeping only: [{target_column}]")

# Forecast with every other timeseries column zeroed over the horizon
forecast_single = run_inference(
    df=df,
    task="forecast",
    columns_to_keep=[target_column],
    forecast_length=50,
    seed=DEFAULT_SEED,
    num_samples=DEFAULT_NUM_SAMPLES,
)
print(f"\nForecast with single column: {forecast_single.shape}")
forecast_single.head()

[32m2026-01-19 05:50:56.613[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m126[0m - [1mInitializing DemoSynthesisService for dataset: oura_subset, model_type: flexible, task_type: forecast[0m
[32m2026-01-19 05:50:56.614[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m__init__[0m:[36m130[0m - [1mModel path: /home/raimi/data/training_logs/oura_subset/Time_Series_Diffusion_Training/synthesis_oura_subset_flexible/checkpoints/best_model.ckpt[0m
[32m2026-01-19 05:50:56.615[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m150[0m - [1mLoading scalers and encoders for dataset: oura_subset[0m
[32m2026-01-19 05:50:56.616[0m | [1mINFO    [0m | [36mservices.demo_synthesis_service[0m:[36m_load_scalers_and_encoders[0m:[36m158[0m - [1mLoaded encoders: ['onehot'][0m
[32m2026-01-19 05:50:56.618[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.prepr

All timeseries columns: ['average_hrv', 'lowest_heart_rate', 'age_cva_diff']
Keeping only: [average_hrv]


Generating windows: 100%|██████████| 1/1 [00:00<00:00, 3160.74it/s]
[32m2026-01-19 05:50:56.645[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36mreshape_for_timeseries[0m:[36m1397[0m - [1mFinished windowing, resulting: windows_3d_array.shape=(1, 192, 13)[0m
[32m2026-01-19 05:50:56.646[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36mprocess_groups[0m:[36m1524[0m - [1mDividing into window types[0m
[32m2026-01-19 05:50:56.646[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36mprocess_groups[0m:[36m1551[0m - [1mnum_timeseries: 0, num_continuous: 10, num_discrete: 2, num_original_discrete: 1[0m
[32m2026-01-19 05:50:56.646[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36mprocess_groups[0m:[36m1554[0m - [1mtotal_windows.shape: (1, 192, 13)[0m
[32m2026-01-19 05:50:56.998[0m | [1mINFO    [0m | [36msynthefy_pkg.preprocessing.preprocess[0m:[36mprocess_group


Forecast with single column: (192, 3)


Unnamed: 0,average_hrv_synthetic,lowest_heart_rate_synthetic,age_cva_diff_synthetic
0,51.836437,50.716801,0.081839
1,59.190922,58.026897,-2.004374
2,48.882671,53.106018,1.481648
3,48.253414,56.812786,-1.026426
4,49.902901,61.765652,-0.457199
