TO DO: Set seed for generator of initial conditions.

In [1]:
import pickle
import numpy as np
import pandas as pd
import math
import yaml

We import the model specification parameters and externally defined constants here.

In [2]:
# Import specified definitions only from given notebook
import ipynb.fs
from .defs.shared_constants import MISSING_INT, MISSING_FLOAT
from .defs.shared_auxiliary import draw_disturbances
from .defs.read import read_init_file
from .defs.read import init_dict_to_attr_dict

from .defs.shared_auxiliary import calculate_wage_systematic
from .defs.shared_auxiliary import calculate_period_wages
from .defs.shared_auxiliary import calculate_consumption_utilities
from .defs.shared_auxiliary import calculate_total_utilities
from .defs.shared_auxiliary import calculate_utilities
from .defs.shared_auxiliary import calculate_continuation_values

In [3]:
# Read in initialization file as attr_dict
attr_dict = read_init_file('toy_model_init_file.yml')

In [4]:
# Import the final output of pyth_create_state_space, args
# In the modular implementation pyth_create_state_space will be called by by pyth_solve
# pyth_solve is executed before pyth_simulate
file_name = "args_file.pkl"
# Open the file for reading
file_object = open(file_name,'rb')  
# load the object from the file into var args
state_space_args = pickle.load(file_object)

In [5]:
# Import the final output of pyth_backward_induction, periods_emax
# In the modular implementation pyth_create_state_space will be called by by pyth_solve
# pyth_solve is executed before pyth_simulate
file_name = "periods_emax_file.pkl"
# Open the file for reading
file_object = open(file_name,'rb')  
# load the object from the file into var args
periods_emax = pickle.load(file_object)

Then, we need to define additional function called in the loop to determine agents choices. 

In [6]:
def extract_individual_covariates (educ_years, educ_min, i):
    """Constructs additional covariates given agent indicator."""
    
    # Determine education level given number of years of education
    # Would it be more efficient to do this somewhere else?

    # Unpack state space components
    educ_years_i = educ_years[i]

    # Extract education information
    if (educ_years_i <= 10):
        educ_level = [1,0,0]

    elif (educ_years_i > 10) and (educ_years_i <= 12):
        educ_level = [0,1,0]

    else:
        educ_level = [0,0,1]

    educ_years_idx = educ_years_i - educ_min
    
    # Return function output
    return educ_years_i, educ_level, educ_years_idx

In [7]:
# Test ensure that simulated values of initial conditions are reproducible
educ_years_test = list(range(10, 15))
np.random.seed(123)
educ_years_test = np.random.choice(educ_years_test, 10)
educ_years_test

array([12, 14, 12, 11, 13, 12, 13, 11, 11, 10])

In [8]:
def pyth_simulate(attr_dict, state_space_args, periods_emax):
    """Simulate agent experiences."""
    
    # Unpack objects from agrs
    states_all, states_number_period, mapping_states_index, max_states_period = state_space_args[0], state_space_args[1], state_space_args[2], state_space_args[3]
    
    # Unpack parameter from the model specification
    educ_min = attr_dict['INITIAL_CONDITIONS']['educ_min']
    educ_max = attr_dict['INITIAL_CONDITIONS']['educ_max']
    num_periods = attr_dict['GENERAL']['num_periods']
    num_agents_sim = attr_dict['SIMULATION']['num_agents_sim']
    seed_sim = attr_dict['SIMULATION']['seed_sim']
    shocks_cov = attr_dict['DERIVED_ATTR']['shocks_cov']
    optim_paras = attr_dict['PARAMETERS']['optim_paras']
    delta = attr_dict['CONSTANTS']['delta']

    educ_years = list(range(educ_min, educ_max + 1))
    educ_years = np.random.choice(educ_years, num_agents_sim)

    # Create draws for simulated sample
    draws_sim = draw_disturbances((num_periods, num_agents_sim), shocks_cov, seed_sim)

    # Start count over all simulations/row (number of agents times number of periods)
    count = 0

    # Initialize container for the final output
    num_columns = 14 # count of the information units we wish to record
    dataset = np.tile(MISSING_FLOAT, (num_agents_sim*num_periods, num_columns))

    # Loop over all agents
    for i in range(num_agents_sim):


        # Construct additional education information
        educ_years_i, educ_level, educ_years_idx = extract_individual_covariates (educ_years, educ_min, i)

        # Extract the indicator of the initial state for the individual
        # depending on the individuals initial condition
        initial_state_index = mapping_states_index[educ_years_idx, educ_years_idx, 0, 0, 0]

        # Assign the initial state as current state
        current_state = states_all[educ_years_idx, initial_state_index, :].copy()

        # Loop over all remaining
        for period in range(num_periods):
            
            # Record agent identifier, period number, and level of education
            dataset[count, :2] = i, period, 
            dataset[count, 2:3] = educ_years_i
            
            # Ensure that the simulation starts only in the period 
            # in which the individual enters the model after having compldeted education
            if period < educ_years_idx:
                count += 1
                continue

            # Extract state space components
            choice_lagged, exp_p, exp_f = current_state[1], current_state[2], current_state[3]

            # Look up the indicator for the current state
            k = mapping_states_index[period, educ_years_i - educ_min, choice_lagged, exp_p, exp_f]

            # Calculate choice specific value functions
            # for individual, period and state space point

            # Extract the error term draws corresponding to
            # period number and individual
            corresponding_draws = draws_sim[period, i, :]

            # Calculate correspongind flow utilities
            flow_utilities, consumption_utilities, period_wages, wage_systematic = calculate_utilities(attr_dict,
                                                                                                       educ_level,
                                                                                                       exp_p,
                                                                                                       exp_f,
                                                                                                       optim_paras,
                                                                                                       corresponding_draws)

            # Obtain continuation values for all choices
            continuation_values = calculate_continuation_values(attr_dict,
                                                                mapping_states_index,
                                                                periods_emax,
                                                                period,
                                                                educ_years_idx,
                                                                exp_p,
                                                                exp_f)

            # Calculate total values for all choices
            value_functions = flow_utilities + delta * continuation_values

            # Determine choice as option with highest choice specific value function
            max_idx = np.argmax(value_functions)


            # Record period experiences
            dataset[count, 3:4] = max_idx
            dataset[count, 4:5] = wage_systematic
            dataset[count, 5:8] = period_wages[:]
            dataset[count, 8:11] = consumption_utilities[:]
            dataset[count, 11:14] = flow_utilities[:]


            # Update state space component experience
            current_state[max_idx + 1] += 1

            # Update state space component choice_lagged
            current_state[1] = max_idx

            # Update simulation/row count
            count += 1
    
    # Return function output
    return dataset

In [9]:
dataset = pyth_simulate(attr_dict, state_space_args, periods_emax)

Finally, we want to record the dataset as a Pandas Dataframe.

In [10]:
def replace_missing_values (arguments):
    """Replace MISSING_FLOAT with NAN."""
    
    # Antibugging
    assert isinstance(arguments, tuple) or isinstance(arguments, np.ndarray)

    if isinstance(arguments, np.ndarray):
        arguments = (arguments,)

    rslt = tuple()

    for argument in arguments:
        
        # Transform to float array to evaluate missing values
        argument_internal = np.asfarray(argument)

        # Determine missing values
        is_missing = argument_internal == MISSING_FLOAT
        if np.any(is_missing):
            # Replace missing values
            argument = np.asfarray(argument)
            argument[is_missing] = np.nan

        rslt += (argument,)

    # Align interface
    if len(rslt) == 1:
        rslt = rslt[0]

    # Function output
    return rslt

In [11]:
# Create fixed objects needed to record simulated dataset to Pandas Dataframe

# Define column lables
DATA_LABLES_SIM = []
DATA_LABLES_SIM += ["Identifier", "Period"]
DATA_LABLES_SIM += ["Years of Education"]
DATA_LABLES_SIM += ["Choice"]
DATA_LABLES_SIM += ["Systematic Wage"]
DATA_LABLES_SIM += ["Period Wage N", "Period Wage P", "Period Wage F"]
DATA_LABLES_SIM += ["Consumption Utility N", "Consumption Utility P", "Consumption Utility F"]
DATA_LABLES_SIM += ["Flow Utility N", "Flow Utility P", "Flow Utility F"]

# Define data types for data set columns
DATA_FORMATS_SIM = dict()
for key_ in DATA_LABLES_SIM:
    DATA_FORMATS_SIM[key_] = np.int
    if key_ in ["Choice",
                "Systematic Wage",
                "Period Wage N",
                "Period Wage P",
                "Period Wage F",
                "Consumption Utility N",
                "Consumption Utility P",
                "Consumption Utility F",
                "Flow Utility N",
                "Flow Utility P",
                "Flow Utility F"]:
        DATA_FORMATS_SIM[key_] = np.float

In [12]:
# Create data frame from simulated dataset
data_frame = pd.DataFrame(
    data = replace_missing_values(dataset), columns = DATA_LABLES_SIM
)

# Set specific columns to desired data types
data_frame = data_frame.astype(DATA_FORMATS_SIM)

# Define identifier for unique observation in the data frame
data_frame.set_index(["Identifier", "Period"], drop=False, inplace=True)

In [13]:
data_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Identifier,Period,Years of Education,Choice,Systematic Wage,Period Wage N,Period Wage P,Period Wage F,Consumption Utility N,Consumption Utility P,Consumption Utility F,Flow Utility N,Flow Utility P,Flow Utility F
Identifier,Period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0,0,0,11,,,,,,,,,,,
0,1,0,1,11,2.0,5.147946,13.954296,7.506351,3.326827,-0.141488,-0.114451,-0.118794,-0.141488,-0.125983,-0.133406
0,2,0,2,11,2.0,9.920092,8.317627,9.189173,7.054196,-0.141488,-0.102194,-0.077983,-0.141488,-0.112491,-0.087575
0,3,0,3,11,2.0,14.692237,23.743788,5.969781,21.979951,-0.141488,-0.130114,-0.041267,-0.141488,-0.143224,-0.046342
0,4,0,4,11,1.0,19.464383,26.577924,56.145839,8.364371,-0.141488,-0.037089,-0.070888,-0.141488,-0.040826,-0.079607
0,5,0,5,11,2.0,20.924660,53.846531,17.987670,65.114608,-0.141488,-0.070157,-0.022463,-0.141488,-0.077226,-0.025226
0,6,0,6,11,1.0,25.696806,44.028445,448.356096,43.445736,-0.141488,-0.011586,-0.028176,-0.141488,-0.012754,-0.031642
0,7,0,7,11,1.0,27.157082,202.848266,140.660211,0.458052,-0.141488,-0.022176,-0.360596,-0.141488,-0.024410,-0.404948
0,8,0,8,11,2.0,28.617359,23.330666,12.434848,234.257491,-0.141488,-0.086270,-0.010967,-0.141488,-0.094963,-0.012316
0,9,0,9,11,2.0,33.389504,27.723824,42.254272,36.356419,-0.141488,-0.043488,-0.031132,-0.141488,-0.047870,-0.034961
