This notebook's only purpose is to transform the original dataset so that it can be read by the *respy* package.

In [1]:
import pandas as pd
import numpy as np

# Columns (and their order) selected for the final dataset further below.
DATA_LABELS_EST = [
    'Identifier', 'Period', 'Choice', 'Wage',
    'Experience_A', 'Experience_B', 'Years_Schooling',
    'Lagged_Activity',
]

# Target dtype for each column: wages are floats, everything else is an
# integer. The builtin types are used because the `np.int` / `np.float`
# aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24.
DATA_FORMATS_EST = {
    label: (float if label == 'Wage' else int) for label in DATA_LABELS_EST
}

# Read the raw file into a frame, cast the identifier/age/schooling columns
# to integers and store the choice codes as a categorical.
columns = ['Identifier', 'Age', 'Schooling', 'Choice', 'Wage']
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
dtype = {'Identifier': int, 'Age': int, 'Schooling': int, 'Choice': 'category'}
df_kw = pd.DataFrame(np.genfromtxt('KW_97.raw'), columns=columns).astype(dtype)
# Index by (Identifier, Age) while keeping both as regular columns as well.
df_kw = df_kw.set_index(['Identifier', 'Age'], drop=False)
# Relabel the categories positionally. Assigning to `.cat.categories` is no
# longer supported (removed in pandas 2.0), so `rename_categories` is used.
df_kw['Choice'] = df_kw['Choice'].cat.rename_categories(
    ['Schooling', 'Home', 'White', 'Blue', 'Military']
)

# Work on a deep copy so that `df_kw` keeps the labelled raw data.
df_base = df_kw.copy(deep=True)
df_kw.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Identifier,Age,Schooling,Choice,Wage
Identifier,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,16,6,16,11,Schooling,
6,17,6,17,12,Schooling,
6,18,6,18,13,Schooling,
6,19,6,19,14,Schooling,
6,20,6,20,15,Schooling,
6,21,6,21,16,Home,
6,22,6,22,16,White,14062.67
6,23,6,23,16,White,15921.17
6,24,6,24,16,White,18602.73
6,25,6,25,16,White,19693.95


The *respy* package expects a column that provides the period rather than the individual's age.

In [2]:
# Shift ages so that age 16 maps to period 0.
df_base['Period'] = df_base['Age'].sub(16)

... the choices need to be provided as integers.

In [None]:
# Replace the category labels, in their existing order, with integer codes;
# -99 marks the category that is removed in the next step.
# `rename_categories` is used and the result assigned back to the column:
# setting `.cat.categories` directly was removed in pandas 2.0 and mutating a
# Series pulled out of the frame is not guaranteed to update the frame.
df_base['Choice'] = df_base['Choice'].cat.rename_categories([3, 4, 1, 2, -99])

... observations of individuals who join the military are dropped from the period of enlistment onward.

In [None]:
def delete_military_service(agent):
    """Drop an individual's observations from the first military spell onward.

    ``agent`` is expected to carry a two-level index whose second level is
    sliced by label. If no row has ``Choice == -99`` the input is returned
    unchanged; otherwise only rows whose second index level is strictly below
    that of the first ``-99`` row are kept.
    """
    enlisted = (agent['Choice'] == -99).values
    if not enlisted.any():
        return agent

    # argmax on a boolean array gives the position of the first True entry.
    first_military = agent.index[enlisted.argmax()][1]
    keep = (slice(None, None), slice(None, first_military - 1))
    return agent.loc[keep, :]


# Truncate each individual's history separately, then rebuild the
# (Identifier, Age) index from the columns of the same name
# (presumably because apply can alter the index layout).
df_base = (
    df_base
    .groupby(level='Identifier')
    .apply(delete_military_service)
    .set_index(['Identifier', 'Age'], drop=False)
)

... the schooling variable needs to be renamed

In [None]:
# Use the column name that appears in DATA_LABELS_EST.
df_base = df_base.rename(columns={'Schooling': 'Years_Schooling'})

.. the other state variables need to be added

In [None]:
def add_state_variables(agent):
    """Add lagged activity and accumulated work experience for one individual.

    The observations are processed in row order. For each row the function
    records the state *before* that row's choice is taken into account and
    then updates the state with the choice.

    Parameters
    ----------
    agent : pandas.DataFrame
        Observations of a single individual with a ``Choice`` column coded as
        1 (Occupation A), 2 (Occupation B) and 3 (Education). Any other value
        is treated as Home.

    Returns
    -------
    pandas.DataFrame
        A copy of ``agent`` with the columns ``Lagged_Activity``,
        ``Experience_A`` and ``Experience_B`` filled in. The input frame is
        left untouched.

    Notes
    -----
    Lagged activity is coded as (0) Home, (1) Education, (2) Occupation A and
    (3) Occupation B. The first observation starts with zero experience in
    both occupations and a lagged activity of 1.
    """
    # Map each choice code to the lagged-activity code it implies for the
    # following row; codes not listed here fall back to 0 (Home).
    lagged_code_by_choice = {1: 2, 2: 3, 3: 1}

    exp_a, exp_b, lagged_activity = 0, 0, 1
    lagged_values, exp_a_values, exp_b_values = [], [], []

    for choice in agent['Choice']:
        # Record the state as it stands when this choice is made.
        lagged_values.append(lagged_activity)
        exp_a_values.append(exp_a)
        exp_b_values.append(exp_b)

        # Update labor market experience
        if choice == 1:
            exp_a += 1
        elif choice == 2:
            exp_b += 1

        # Update lagged activity
        lagged_activity = lagged_code_by_choice.get(choice, 0)

    # Whole columns are assigned on a copy rather than writing through
    # `agent[col].loc[...]`: chained assignment is not guaranteed to reach the
    # frame, and the group handed in by `groupby.apply` should not be mutated.
    agent = agent.copy()
    agent['Lagged_Activity'] = lagged_values
    agent['Experience_A'] = exp_a_values
    agent['Experience_B'] = exp_b_values

    return agent


# Create the state-variable columns as empty placeholders before they are
# filled in individual by individual.
for label in ('Lagged_Activity', 'Experience_A', 'Experience_B'):
    df_base[label] = np.nan

df_base = df_base.groupby(level='Identifier').apply(add_state_variables)
df_base.head()

Finally, we cut the dataframe down to the required columns and ensure the correct formatting.

In [None]:
# Keep only the listed columns, in that order, and cast each to its target dtype.
df_base = df_base.loc[:, DATA_LABELS_EST].astype(DATA_FORMATS_EST)

In [None]:
# Write the frame as plain text with a header row and no index column;
# missing values are written as '.'.
with open('career_decisions_data.respy.dat', mode='w') as handle:
    df_base.to_string(buf=handle, header=True, index=False, na_rep='.')

In [None]:
# Regression check: the grand total over all columns must match the recorded
# value. A tight relative tolerance is used instead of exact equality because
# the last bits of a floating-point sum can differ with the summation order
# used by different NumPy/pandas versions; any change to the data itself
# would move the total by far more than this tolerance.
stat = 117327602.84280013
np.testing.assert_allclose(np.sum(df_base.sum()), stat, rtol=1e-12, atol=0)