In [None]:
import pandas as pd
import yaml

from skillmodels.config import TEST_DIR
from skillmodels.simulate_data import simulate_dataset

# How to simulate a dataset



Below we show how to simulate a dataset for a test model.

## Getting inputs

For more details on this, check out the introductory tutorial.

In [None]:
# Model specification of the test model, read from the package's test directory.
with open(TEST_DIR / "model2.yaml") as model_file:
    model_dict = yaml.load(model_file, Loader=yaml.FullLoader)

# Dataset shipped with the tests, indexed by ("caseid", "period").
data = pd.read_stata(TEST_DIR / "model2_simulated_data.dta").set_index(
    ["caseid", "period"]
)

# Parameter table, indexed by ("category", "period", "name1", "name2").
params = pd.read_csv(
    TEST_DIR / "regression_vault" / "one_stage_anchoring.csv"
).set_index(["category", "period", "name1", "name2"])

##  Simulated data without policy

In [None]:
# Simulate from the model without any policy; `data` is passed so that
# observed factors and control variables are available to the simulation.
initial_data = simulate_dataset(model_dict=model_dict, params=params, data=data)

# Display the "states" table stored under the "anchored_states" entry.
anchored_initial = initial_data["anchored_states"]
anchored_initial["states"]

## Why do I need data to simulate data?

The data you pass to `simulate_dataset` contains information on observed factors and control variables. Those are not part of the latent factor model and a standard model specification does not have enough information to generate them.

If you have a model without control variables and observed factors, you can simply pass `n_obs` instead of `data`.

##  Simulated data with policy

In [None]:
# Each policy is a dict naming the period and the factor it targets, together
# with an effect size and a standard deviation (0.0 for both policies here).
policies = [
    {
        "period": 0,
        "factor": "fac1",
        "effect_size": 0.2,
        "standard_deviation": 0.0,
    },
    {
        "period": 1,
        "factor": "fac2",
        "effect_size": 0.1,
        "standard_deviation": 0.0,
    },
]

In [None]:
# Simulate again, this time handing the `policies` list to `simulate_dataset`.
# Without the `policies=` argument the policies would be silently ignored and
# this would be just another no-policy simulation.
data_after_policies = simulate_dataset(
    model_dict=model_dict,
    params=params,
    data=data,
    policies=policies,
)

# Display the "states" table stored under the "anchored_states" entry.
data_after_policies["anchored_states"]["states"]