# Estimation of Model in Chapter 2 of Thesis

In [1]:
import os
import time

import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from scipy.optimize import minimize as MIN, show_options as SO
from statsmodels.iolib.summary2 import summary_col

%load_ext Cython

### File Names for the dataframe

In [2]:
#File path of STATA dataset
#Set the IFLS_DATA_PATH environment variable to run the notebook on another machine;
#when it is not set, the original location below is used.

Path_Data = os.environ.get(
    "IFLS_DATA_PATH",
    "/Users/idiosyncrasy58/Dropbox/Documents/College/"
    "Universitat Autonoma de Barcelona/IDEA - Economics/"
    "Doctoral Thesis Ideas/Migration/IFLS/Project Files/"
    "Temp Files/Longitudinal Adult Children Data for Estimation.dta")

In [3]:
#Human-readable labels of the four (skill, region) states, keyed by state code
index = dict(zip(range(1,5),
                 ['Low-Skilled, Everywhere Else',
                  'Low-Skilled, Java',
                  'High-Skilled, Everywhere Else',
                  'High-Skilled, Java']))

#Columns of the STATA file that the estimation uses
col_keep = ['pidlink','sex','age','MaxSchYrs','ParentalSchAvg',
            'MarketCode','InterMarket_FamilyMig',
            'Skill_Level_2','Skill_Level_2_Parents',
            'Wage_2_HH','Educ_3']

#Read in the file, rename the identifier column and store the education dummy as an integer
Data = (pd.read_stata(Path_Data,columns=col_keep,convert_categoricals=False)
          .rename(columns={'pidlink':'Household'})
          .assign(Educ_3=lambda frame: frame['Educ_3'].astype('int')))

Data.head()

Unnamed: 0,Household,sex,age,MaxSchYrs,ParentalSchAvg,MarketCode,InterMarket_FamilyMig,Skill_Level_2,Skill_Level_2_Parents,Wage_2_HH,Educ_3
0,1220003,1.0,19.0,13.0,3.0,1,0,1.0,0.0,1.047,1
1,1250003,1.0,14.0,3.0,4.0,1,0,0.0,0.0,1.047,0
2,1290003,3.0,14.0,4.0,0.0,1,0,0.0,0.0,1.047,0
3,2010007,3.0,17.0,13.0,7.0,1,0,1.0,0.0,1.047,1
4,2040003,3.0,14.0,6.0,0.5,1,0,0.0,0.0,1.047,0


In [4]:
#Create the Variables highlighting where the parents moved to

def Market_Move(Curr_Loc, Choice):
    """Return the market code a household ends up in after its migration choice.

    Parameters
    ----------
    Curr_Loc : current market code, 1 or 2.
    Choice   : migration choice, 0 (stay) or 1 (move to the other market).

    Returns
    -------
    Curr_Loc itself when Choice is 0, otherwise the other market code.

    Raises
    ------
    ValueError
        If the pair is not one of the coded combinations (for example a
        missing value), instead of failing on an unassigned local variable.
    """
    #Staying keeps whatever location was passed in, exactly as recorded
    if Choice==0:
        return Curr_Loc

    #Moving swaps the two markets
    if Choice==1:
        if Curr_Loc==1:
            return 2
        if Curr_Loc==2:
            return 1

    raise ValueError("Market_Move: unsupported combination Curr_Loc=%r, Choice=%r"
                     % (Curr_Loc, Choice))
    
def State_Def(Loc, Skill):
    """Map a (location, skill) pair to the state code 1-4.

    (1,0) -> 1, (2,0) -> 2, (1,1) -> 3; every other pair maps to 4.
    """
    if Loc==1:
        if Skill==0:
            return 1
        if Skill==1:
            return 3
    elif Loc==2 and Skill==0:
        return 2

    #Catch-all, as in the original final else branch
    return 4

def Dec_Def(Loc_Choice, Educ_Choice):
    """Map a (migration, education) choice pair to the decision code 1-4.

    (0,0) -> 1, (0,1) -> 2, (1,0) -> 3; every other pair maps to 4.
    """
    stays = Loc_Choice==0

    if stays and Educ_Choice==0:
        return 1
    if stays and Educ_Choice==1:
        return 2
    if Loc_Choice==1 and Educ_Choice==0:
        return 3

    #Catch-all, as in the original final else branch
    return 4

#Market the household lives in after the parents' migration decision
Data['MarketCode_Move'] = Data.apply(lambda row: 
                                     Market_Move(row['MarketCode'],row['InterMarket_FamilyMig']), 
                                     axis=1)

#State (1-4) of the parents: current market combined with the parents' skill level
Data['Parent_State'] = Data.apply(lambda row: 
                                     State_Def(row['MarketCode'],row['Skill_Level_2_Parents']), 
                                     axis=1)

#Joint decision (1-4): migration choice combined with the child's education outcome
Data['Decision'] = Data.apply(lambda row: 
                                     Dec_Def(row['InterMarket_FamilyMig'],row['Educ_3']), 
                                     axis=1)

### Econometric Model Code

The following cells estimate the model. Where performance requires it, the code is written in Cython (for example: the inner loop and the maximization via GSL).

##### Model
I write the following value function from the point of view of the old-age agent, indexed by $(d,g,t=2)$ (dynasty $d$, generation $g$, and period of life $t$):
\begin{equation}
V_{d,g,t=2}(z,\varepsilon) = \max_{I_k\in I} \text{  } \sum_k I_{k}\left\{v_{d,g,t=2}(z,k) + \varepsilon_{k} \right\} \end{equation}
where
\begin{equation}
v_{d,g,t=2}(z,k)=u(c)+\alpha \text{E}\left[V_{d,g',t=2}(z',\varepsilon')\big|z,I_{k}=1\right]
\end{equation}

and the per-period utility, which is the same across generations, is logarithmic: $u(c)=\ln(c)$.

##### Specification
The econometric specification of the model is the following:

Wages: Agents receive the median wage offered in each market for their skill level (where I take the skill level of the first generation as pre-determined). Agents choose at the beginning of the period where they want to live and pay the cost of moving there. Simultaneously, they choose whether or not to educate their child in the chosen location. 

The budget constraint is given by:
\begin{equation}
c = w^{hh}(h,j) - \delta\cdot 1(\ell\neq j) - \phi_{j}\cdot 1(e=1)
\end{equation}

and 
\begin{equation}
w^{hh}(h,j)=med(w(h,j))
\end{equation}

#### Parameters

In [5]:
#Parameters and parameter vector to pass into function
#NOTE: alpha is read as a module-level global by Value_Func, so rebinding it changes the model
alpha = 0.99**18        #altruism parameter = 0.99^18 
                          #(18 years old when child is supposed to finish schooling)
tot_states = 4          #number of states
tot_decisions = 4       #number of decisions

#### State Space Variable

In [6]:
#Market Adult Wages Array
#rows:      states (1-4, in the order produced by State_Def)
#columns:   decisions (1-4, in the order produced by Dec_Def); the first two columns
#           hold the wage of the current region, the last two the wage of the other region

wage_R1_ls=1.047  #Everywhere Else
wage_R1_hs=2.513  #Everywhere Else
wage_R2_ls=1      #Island of Java
wage_R2_hs=2.626  #Island of Java

#Structure a wage array for quick access 
wage_lst=[[wage_R1_ls]*2+[wage_R2_ls]*2,
          [wage_R2_ls]*2+[wage_R1_ls]*2,
          [wage_R1_hs]*2+[wage_R2_hs]*2,
          [wage_R2_hs]*2+[wage_R1_hs]*2]

#Shape (tot_states, tot_decisions), double precision
wages = np.array(wage_lst, dtype='d').reshape((tot_states,tot_decisions))

#### Transition Function

In [7]:
#Deterministic transition indicators, listed state by state and, within a state,
#decision by decision: each inner list is a one-hot vector over the next state.
tran_st=[[1,0,0,0],[0,0,1,0],[0,1,0,0],[0,0,0,1],
         [0,1,0,0],[0,0,0,1],[1,0,0,0],[0,0,1,0],
         [1,0,0,0],[0,0,1,0],[0,1,0,0],[0,0,0,1],
         [0,1,0,0],[0,0,0,1],[1,0,0,0],[0,0,1,0]]

#tran_func[s-1, d-1, :] is the distribution over next states for state s and decision d
tran_func = np.array(tran_st, dtype='d').reshape((tot_states,tot_decisions,tot_states))

#### Function to permute the Costs for vectorization

In [8]:
def Cost(Param,transform=[0,0]):
    """Build the (tot_states, tot_decisions) matrix of total costs.

    Param[0] is the moving cost and Param[1], Param[2] are the two education
    costs. When transform[0]==1 the moving cost is passed through the logistic
    function exp(x)/(1+exp(x)) first. Columns 2 and 4 (the "educate" decisions)
    carry an education cost; columns 3 and 4 (the "move" decisions) carry the
    moving cost.
    """
    #Education costs: the two row patterns alternate over the four states
    first_pattern = [0,Param[1],0,Param[2]]
    second_pattern = [0,Param[2],0,Param[1]]
    educ_rows = [first_pattern, second_pattern, first_pattern, second_pattern]
    educ_cost = np.array(educ_rows, dtype='d').reshape((tot_states,tot_decisions))

    #Moving cost, optionally mapped into (0,1)
    if transform[0]==1:
        moving = np.exp(Param[0])/(1+np.exp(Param[0]))
    else:
        moving = Param[0]

    #A single row that broadcasts over all states
    move_cost = np.array([[0,0,moving,moving]], dtype='d')

    return educ_cost + move_cost

#### The utility function

In [9]:
def Utility(Param,transform=[0,0]):
    """Return log utility of consumption for every (state, decision) pair.

    Consumption is the wage matrix minus Cost(Param, transform); entries that
    are not strictly positive are replaced by 1e-5 before taking logs.
    """
    net_income = wages - Cost(Param,transform)

    #Floor non-positive consumption so that the logarithm is defined
    floored = np.where(net_income<=0, 1e-5, net_income)

    return np.log(floored)

#### Function to get the Value Functions and the Branches and Limbs probabilities

In [10]:
def Value_Func(Param,transform=[0,0]):
    """Compute the choice-specific values v and the continuation values V.

    Returns
    -------
    (v, V) : v has one row per state and one column per decision; V has one
             entry per state.
    """
    flow_utility = Utility(Param,transform)

    #Continuation value: log of the first wage column discounted by the
    #module-level alpha over 100 periods, plus Euler's constant
    first_column_wage = wages[:,0]
    discounted_wages = 0

    for period in range(100):
        discounted_wages += (1/(1/alpha)**period) * first_column_wage

    V = np.log(discounted_wages) + np.euler_gamma

    #Current value: flow utility plus the discounted value of the state reached
    v = flow_utility + alpha*tran_func.dot(V)

    return (v,V)

In [11]:
def Tree(Param,transform=[0,0]):
    """Return the nested-logit branch and limb probabilities.

    Param[3] scales the "educate" branch (decision columns 2 and 4) and
    Param[4] the "no educate" branch (decision columns 1 and 3). When
    transform[1]==1 both are passed through exp(x)/(1+exp(x)) first.

    Returns
    -------
    (branches, limbs) : branches is (4,2) with columns [no educate, educate];
                        limbs is (4,4) with one column per decision 1-4, each
                        entry being the probability within its own branch.
    """
    v = Value_Func(Param,transform)[0]

    if transform[1]==1:
        rho_1, rho_2 = (np.exp(Param[k])/(1+np.exp(Param[k])) for k in (3,4))
    else:
        rho_1, rho_2 = Param[3], Param[4]   #Educate branch, No Educate branch

    #Scaled exponentiated values inside each branch
    educ_terms = np.exp(v[:,(1,3)]/rho_1)
    no_educ_terms = np.exp(v[:,(0,2)]/rho_2)

    denom_1 = educ_terms.sum(axis=1).reshape(4,1)
    denom_2 = no_educ_terms.sum(axis=1).reshape(4,1)

    #Branch weights
    branch_1 = np.exp( rho_1 * np.log(denom_1) )
    branch_2 = np.exp( rho_2 * np.log(denom_2) )
    branch_total = branch_1 + branch_2

    #Branch probabilities, "no educate" first
    branches = np.hstack((branch_2/branch_total, branch_1/branch_total))

    #Limb probabilities, put back in decision order 1-4
    limbs_1 = educ_terms/denom_1
    limbs_2 = no_educ_terms/denom_2

    limbs = np.column_stack((limbs_2[:,0],limbs_1[:,0],limbs_2[:,1],limbs_1[:,1]))

    return (branches, limbs)

#### Map Branches and Limbs to the data based on the decisions taken and the states of the individual

In [12]:
def Limb_Map(State, Decision, limbs):
    """Return the log limb probability for one observation.

    State and Decision are the 1-based codes; they are cast to int because
    values taken from a DataFrame row can be floats (e.g. 2.0), which NumPy
    does not accept as array indices.
    """
    Prob = limbs[int(State)-1,int(Decision)-1]

    return np.log(Prob)

def Branch_Map(State, Educ_Dec, branches):
    """Return the log branch probability for one observation.

    State is the 1-based state code and Educ_Dec the 0/1 education choice,
    which is used directly as the column index. Both are cast to int because
    values taken from a DataFrame row can be floats, which NumPy does not
    accept as array indices.
    """
    Prob = branches[int(State)-1,int(Educ_Dec)]
    
    return np.log(Prob)

def Tree_Data(branches, limbs, data=None):
    """Attach the log branch and limb probabilities to every observation.

    Parameters
    ----------
    branches : array with one row per state and columns [no educate, educate].
    limbs    : array with one row per state and one column per decision 1-4.
    data     : DataFrame with columns 'Parent_State', 'Educ_3' and 'Decision'.
               Defaults to the module-level Data, as before.

    The columns 'Branches' and 'Limbs' are written to the frame in place and
    nothing is returned. The lookup is one vectorised index operation rather
    than a Python-level call per row, because this runs on every likelihood
    evaluation.
    """
    frame = Data if data is None else data

    #0-based row/column positions of every observation
    state_pos = frame['Parent_State'].values.astype(int) - 1
    educ_pos = frame['Educ_3'].values.astype(int)
    decision_pos = frame['Decision'].values.astype(int) - 1

    frame['Branches'] = np.log(np.asarray(branches)[state_pos, educ_pos])
    frame['Limbs'] = np.log(np.asarray(limbs)[state_pos, decision_pos])

#### Calculation of the Log-Likelihood Function

In [13]:
def LLF(Params, Data, transform=[0,0]):
    """Return the negative log-likelihood of the data at Params.

    The choice probabilities are solved with Tree, mapped to the observations
    by Tree_Data (which fills the 'Branches' and 'Limbs' columns), and the two
    columns of Data are summed.
    """
    branches, limbs = Tree(Params,transform)

    #Writes the per-observation log probabilities into the data
    Tree_Data(branches, limbs)

    total_log_prob = Data.Branches.sum() + Data.Limbs.sum()

    #Sign flipped because the optimiser minimises
    return -1*( total_log_prob )

#### Calculate the CCPs based on the joint decision

In [14]:
def CCP(Params,transform=[0,0]):
    """Return the (4,4) matrix of joint choice probabilities.

    Each row is a state and each column a decision 1-4; an entry is the limb
    probability multiplied by the probability of the branch it belongs to.
    """
    branches, limbs = Tree(Params,transform)

    #"No educate" decisions (columns 1 and 3) and "educate" decisions (columns 2 and 4)
    no_educ = limbs[:,(0,2)]*branches[:,0].reshape(4,1)
    educ = limbs[:,(1,3)]*branches[:,1].reshape(4,1)

    #Interleave back into decision order
    return np.column_stack((no_educ[:,0],educ[:,0],no_educ[:,1],educ[:,1]))

### Estimation of the Model

#### Unconstrained Maximization of the LLF

Use the minimization routine with BFGS method.

In [15]:
#Wall-clock timer for the estimation
time1 = time.time()

#Create the start values based on values from the log utility specification (one with most frequent equilibrium obs.)
#Order: moving cost, two education costs, two branch scale parameters (see Cost and Tree)
start_val = np.array([2.436,0.629,0.619,0.5,0.5]) #np.hstack((np.random.randn(3),np.random.rand(2)))

#transform is left at its default, so the parameters are estimated untransformed
Param_Final = MIN(LLF, start_val, method='BFGS', args=(Data,), options={'gtol':1e-3, 'maxiter':200, 'disp':True})                                                                                     
print(str((time.time()-time1)/60)+' minutes')

Optimization terminated successfully.
         Current function value: 2779.833530
         Iterations: 20
         Function evaluations: 189
         Gradient evaluations: 27
0.5683652798334757 minutes


In [16]:
print(Param_Final.x.round(3))

[ 1.802  0.598  0.562  0.723  0.317]


Implied correlations

In [17]:
#One minus the square of each of the last two parameters (the branch scale parameters)
corr_e1_e2 = 1 - np.array(Param_Final.x[-2:]**2)
print(corr_e1_e2)

[ 0.47723051  0.89941397]


Standard Errors

In [18]:
#NOTE(review): hess_inv is the inverse-Hessian approximation built up by BFGS during the
#iterations, not an exactly evaluated Hessian - confirm it is accurate enough for inference
std_err = np.sqrt(np.diag(Param_Final.hess_inv))
print(std_err.round(4))

[ 0.0736  0.0285  0.021   0.1222  0.1267]


t-statistics

In [19]:
#Ratio of each estimate to its standard error (i.e. against a null value of zero)
t = Param_Final.x/std_err
print(abs(t))

[ 24.50074941  21.02462394  26.73254434   5.91713534   2.50406443]


Test consistency with utility maximization

In [20]:
#Same ratio against a null value of one, for the last two parameters:
#i=1 is the last parameter (Param[4]) and i=2 the one before it (Param[3])
for i in range(1,3):
    Test = (Param_Final.x[-i]-1)/(std_err[-i])
    print(Test)

-5.39138171584
-2.26669065179


#### MC the results to test stability of estimates

#Test the stability of parameter estimates by re-estimating from random starting values

Loop = 100
MC_Loop = np.empty((5,Loop))   #one column of estimates per replication

for i in range(Loop):
    #Dedicated names: the baseline Param_Final and start_val are used by later cells
    #and must not be overwritten by the last replication
    MC_start_val = np.hstack((np.random.randn(3),np.random.rand(2)))

    #transform is left at its default ([0,0]), matching the baseline estimation.
    #It must be a two-element sequence: Cost and Tree index it as transform[0] and
    #transform[1], so the former args=(Data,1) raised a TypeError.
    MC_Result = MIN(LLF, MC_start_val, method='BFGS', args=(Data,), options={'gtol':1e-3, 'maxiter':200, 'disp':False})
    MC_Loop[:,i] = MC_Result.x

Place results into a dataframe

#Column labels in the order of the parameter vector (see Cost and Tree);
#a list keeps that order explicit instead of relying on dict iteration order
MC_Columns = ['Move_Cost','Educ_Cost_Elsewhere','Educ_Cost_Jawa',
              'Scale_Educate','Scale_No_Educate']

MC_df = pd.DataFrame(np.round(MC_Loop.T,2), columns=MC_Columns)

MC_df['Count'] = 1

#Number of replications that converged to each (rounded) parameter vector
MC_df = ( MC_df.groupby(['Move_Cost', 'Educ_Cost_Elsewhere','Educ_Cost_Jawa','Scale_No_Educate','Scale_Educate'], as_index=False)
               .count()
               .sort_values(['Count'], ascending=False) )

#Share of replications, using the actual number of replications rather than a fixed 100
MC_df.Count = MC_df.Count/MC_Loop.shape[1]

MC_df.to_csv('MC_Output_Nested_Logit')

MC_df

#### Compare the Estimated CCPs with the Empirical CCPs

Generate the matrix of the Empirical CCPS from the above.

Assumption on mutually independent choices implies that:

\begin{equation}
P(Move=\{0,1\} \cap Educ=\{0,1\}\mid State=S)=P(Move=\{0,1\}\mid State=S)\cdot P(Educ=\{0,1\}\mid Move=\{0,1\} \cap State=S)
\end{equation}

So we can multiply the above probabilities for 'InterMarket_FamilyMig' and 'Educ_3' (as well as their respective complements), since they are conditional on the State due to the grouping.

In [21]:
#Group the data according to the states and then collapse the data along the desired dimensions.

#Every (state, migration choice) pair, so that a pair without observations still gets a row
State_Mig_Index = pd.MultiIndex.from_product([list(range(1,tot_states+1)),[0,1]],
                                             names=['Parent_State','InterMarket_FamilyMig'])

#Calculate the first conditional probability P(Educ={0,1} | Move={0,1}, State=S)
#(a pair without observations is filled with 0; it is multiplied by P(Move|State)=0 below)
Educ_DF = (Data.groupby(['Parent_State','InterMarket_FamilyMig'])
               .agg({'Educ_3':'mean'})
               .reindex(State_Mig_Index, fill_value=0))
Educ_DF['No_Educ'] = 1 - Educ_DF.Educ_3
Educ_DF = Educ_DF.sort_index(axis=1, ascending=False)

#Calculate the second conditional probability P(Move={0,1}|State=S)
#(a state without observations becomes a row of NaN rather than a shape error)
Mig_DF = (Data.groupby('Parent_State')
              .agg({'InterMarket_FamilyMig':'mean'})
              .reindex(list(range(1,tot_states+1))))
Mig_DF['No_Mig'] = 1 - Mig_DF.InterMarket_FamilyMig
Mig_DF = Mig_DF.sort_index(axis=1, ascending=False)

#Get the underlying numpy array from the DataFrame, reshape to broadcast multiplication
#(.values replaces get_values(), which was removed in pandas 1.0)
Mig_Prob = Mig_DF.loc[:,['No_Mig','InterMarket_FamilyMig']].values.reshape((2*tot_states,1))

Educ_Prob = Educ_DF.loc[:,['No_Educ','Educ_3']].values

#Create the CCP DataFrame
Columns = ['NoEduc_NoMig','Educ_NoMig','NoEduc_Mig','Educ_Mig']

Emp_CCPs = pd.DataFrame(np.round(Educ_Prob*Mig_Prob,3).reshape((tot_states,tot_decisions)),columns=Columns)

Emp_CCPs

Unnamed: 0,NoEduc_NoMig,Educ_NoMig,NoEduc_Mig,Educ_Mig
0,0.512,0.474,0.006,0.008
1,0.536,0.455,0.003,0.006
2,0.248,0.706,0.009,0.036
3,0.296,0.675,0.01,0.02


The estimated CCPs can be obtained by plugging back in the estimated parameters of the model into the function that calculates the CCPs from the value functions

In [22]:
#Model-implied joint choice probabilities at the estimates (rows: states, columns: decisions)
Est_CCPs = CCP(Param_Final.x) 
print(Est_CCPs.round(3))

[[ 0.529  0.471  0.     0.   ]
 [ 0.505  0.495  0.     0.   ]
 [ 0.368  0.583  0.01   0.039]
 [ 0.354  0.628  0.006  0.011]]


## Sensitivity: Test different levels of $\alpha$

alpha_test = [0.5,0.75,0.99**18,0.956]

Param_dict = {}
Std_Err_dict = {}
Like_dict = {}

#Value_Func reads the module-level alpha, so it has to be rebound for each run.
#Remember the baseline and restore it afterwards; otherwise every later cell
#(CCPs, simulation, counterfactuals) would silently use the last test value.
alpha_baseline = alpha

try:
    for a in alpha_test:

        alpha = a

        #Dedicated names so that the baseline Param_Final, std_err and start_val survive
        Sens_start_val = np.array([3.436,0.629,0.619,0.5,0.5])
        Sens_Result = MIN(LLF, Sens_start_val, method='BFGS', args=(Data,), options={'gtol':1e-3, 'maxiter':200, 'disp':True})

        Sens_std_err = np.sqrt(np.diag(Sens_Result.hess_inv))

        Param_dict[str(a)] = Sens_Result.x.round(3)
        Std_Err_dict[str(a)] = Sens_std_err.round(3)

        Like_dict[str(a)] = Sens_Result.fun
finally:
    alpha = alpha_baseline

print(Param_dict)

## Simulation of the Model

### Cython Code for Simulation of the model based on the Optimal Policy Function (CCP array)

The below code is adapted from the code created in the simulation code folder

In [23]:
%%cython 

#!python
#cython: boundscheck=False,wraparound=False,nonecheck=False,cdivision=True

# Cython code to optimise in C the Simulation of the model portion of the code

##################### Import Modules and math functions ######################

#Cython and C functions (this is faster than calling external C function math libs)
cimport cython

from libc.stdlib cimport rand, RAND_MAX, calloc, free, abort
from libc.math cimport HUGE_VAL, log

####################### Assign the global variables ##########################

#These will be passed into functions automatically without 
#having to call them up explicitely

cdef Py_ssize_t tot_states, tot_decisions

##############################################################################
####### Define the functions that will assist the simulation module ##########
##############################################################################

############ Random Numbers, Random States, and Random Shocks functions

#Random number generator on interval [0,1]
#Both end points can occur because rand() returns integers from 0 up to and including RAND_MAX.
#NOTE(review): no srand() call is visible in this cell, so the sequence presumably starts
#from the C library's default seed - confirm if run-to-run variation is wanted.
cdef inline double rand_value() nogil:
    return rand()/<double>RAND_MAX

############### Function and auxiliaries determining the next state

#This function rewrites array with the cumulative sum through recursion
#arr is modified in place for positions 0..index. The default index of 3 means that a
#call without the second argument treats arr as having exactly 4 entries.
#index is unsigned, so the guard below is reached only when index equals 0.
cdef void cum_sum(double *arr, size_t index=4-1) nogil:
    if index<=0: return
    cum_sum(arr, index-1)
    arr[index] += arr[index-1]

#This function will determine the index of the transition function 
#based on the cumulative probabilities 
#
#x   : draw on [0,1]
#arr : cumulative probabilities with tot_states entries
#
#Returns the first index whose cumulative probability exceeds x. When no entry
#exceeds x (x can equal 1, and cumulative sums can fall slightly short of 1
#through rounding) the function used to fall off the end without returning;
#the draw is now assigned to the last interval that has positive width, so an
#outcome with zero probability is never selected by this fallback.
cdef unsigned short find_interval(double x, double *arr) nogil:
    cdef Py_ssize_t i
    
    for i in range(tot_states):
        if x<arr[i]:
            return <unsigned short>i

    #fallback: walk back over trailing zero-probability outcomes
    i = tot_states - 1
    while i>0 and arr[i]<=arr[i-1]:
        i -= 1

    return <unsigned short>i

#This function will generate the next state based on the transition
#function probabilites (a discrete value)
#
#tran : probabilities of the outcomes; tot_states entries are read from it.
#Returns the 1-based index of the drawn outcome.
#NOTE(review): this is also called with a row of the CCP array, which has one entry per
#decision; the code still reads tot_states entries, so it relies on the number of
#decisions equalling the number of states.
cdef unsigned short Next_State(double[:] tran) nogil:
    cdef:
        double x
        double *array
        unsigned short index
        Py_ssize_t i
    
    #scratch copy so that the caller's probabilities are not overwritten
    array=<double*> calloc(tot_states, sizeof(double))
    
    #abort() terminates the whole process if the allocation fails
    if array==NULL: abort()

    try:
        #generate a random number to help determine the next state
        x = rand_value()
        
        #copy the transition function values into the array to prevent rewrite
        for i in range(tot_states):
            array[i]=tran[i]
        
        #rewrite the array into the cumulative sum of the elements
        #(called with the default index, i.e. over 4 entries)
        cum_sum(array)
        
        #the next state is the return value of the function
        #(the array index) + 1 to create the next state
        index = find_interval(x, array) + 1
    
        return index

    finally:
        #release the scratch copy on every exit path
        free(array)  


################### Function for filling in the Simulated data array

#This function will calculate the frequency of decisions for each generation
#
#Writes the record of household curr_hh into the flat buffer Data, which is addressed
#as 4 consecutive values per household:
#  offset 0: state, offset 1: moving dummy, offset 2: education dummy, offset 3: next state.
#The two dummies are only ever set to 1, never reset to 0, so the buffer has to be
#zero-filled by the caller.
cdef void Data(unsigned short* Data, Py_ssize_t curr_hh,
               Py_ssize_t dec, Py_ssize_t state, 
               unsigned short* next_state_arr) nogil:
    
    #fill in the state
    Data[0 + curr_hh*4] = <unsigned short>state
    Data[3 + curr_hh*4] = next_state_arr[curr_hh]
    
    #fill in the moving decision
    if dec==3 or dec==4:
        Data[1 + curr_hh*4] = 1
    #fill in the education decision
    if dec==2 or dec==4:
        Data[2 + curr_hh*4] = 1


############ Function defining the simulation of the model: CCP Distribution ################
def Sim_Model_CCP(double[:,:] CCP, double[:,:,:] tranny, 
                  unsigned short[:,:] Sim_Data, double[:] init_states):
    """Simulate one generation of households from the choice probabilities.

    Parameters
    ----------
    CCP         : (4,4) choice probabilities, rows are states and columns decisions.
    tranny      : (4,4,4) transition probabilities [state, decision, next state].
    Sim_Data    : (households,4) C-contiguous, zero-filled array that is filled in
                  place with [state, moved, educated, next state] per household.
    init_states : (4,) distribution from which each household's state is drawn.

    Raises
    ------
    ValueError
        If an argument does not have the layout above. Bounds checking is
        switched off in this module and the helpers address the arrays with a
        fixed width of 4, so a wrong shape would otherwise read or write
        outside the arrays.
    """
    
    #declare and assign the globals
    global tot_states, tot_decisions
    
    tot_states = tranny.shape[0]       #Number of states: first dimension of the trans array
    tot_decisions = tranny.shape[1]    #Number of decisions: second dimension of the trans array
    
    #declare the types for variables and arrays
    cdef:
        Py_ssize_t decision, state 
        Py_ssize_t HH = Sim_Data.shape[0] #Number of households is given by the first dimension of the Sim_Data array
        Py_ssize_t item = <Py_ssize_t>sizeof(unsigned short)
        
        #define iterators
        Py_ssize_t j
        
        #define array types
        unsigned short* states

    #validate the layout the C helpers rely on (cum_sum and Data are hard-wired to 4)
    if tot_states != 4 or tot_decisions != 4 or tranny.shape[2] != 4:
        raise ValueError("tranny must have shape (4,4,4)")
    if CCP.shape[0] != 4 or CCP.shape[1] != 4:
        raise ValueError("CCP must have shape (4,4)")
    if init_states.shape[0] != 4:
        raise ValueError("init_states must have 4 entries")
    if Sim_Data.shape[1] != 4:
        raise ValueError("Sim_Data must have 4 columns")

    #nothing to simulate; also avoids calloc(0,...), which may return NULL and trigger abort()
    if HH == 0:
        return

    #Sim_Data is written through a raw pointer, so the rows must be stored back to back
    if Sim_Data.strides[1] != item or Sim_Data.strides[0] != 4*item:
        raise ValueError("Sim_Data must be C-contiguous")
        
    #allocate arrays
    states = <unsigned short*> calloc(HH, sizeof(unsigned short))
        
    #check that memory was allocated:
    if states==NULL: abort()  
        
    #simulate the model

    try:
        #inner loop the households (should be parallelizable)
        for j in range(HH):

            #draw the household's state from the initial distribution
            state = Next_State(init_states)

            #draw the decision from the state's row of choice probabilities (index+1)
            decision = Next_State(CCP[state-1,:])

            #rewrite the state array with the next generation's value:
            states[j] = Next_State(tranny[state-1,decision-1,:])

            #save the decisions of the houshold
            Data(&Sim_Data[0,0],j,decision,state,states)
            
    finally:
        free(states)

### Python code for the execution of the simulation

Grab the original distribution of the first generation from the data, and use the same number of households as the data

In [24]:
#Group the data according to the states and then collapse the data along the desired dimensions.

#Per parental state: number of households and the mean of the migration and education dummies
function = {'Household':'count','InterMarket_FamilyMig':'mean', 'Educ_3':'mean'}

Stats = Data.groupby('Parent_State').agg(function).round(3)

#Share of all households in each state (rounded to three decimals)
Stats['Prop_in_State'] = (Stats.Household/len(Data)).round(3)

#Display only: the renamed copy is not assigned, so Stats keeps the numeric state codes
Stats.rename(index=index)

Unnamed: 0_level_0,Household,InterMarket_FamilyMig,Educ_3,Prop_in_State
Parent_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Low-Skilled, Everywhere Else",1251,0.014,0.482,0.343
"Low-Skilled, Java",1557,0.01,0.461,0.427
"High-Skilled, Everywhere Else",330,0.045,0.742,0.091
"High-Skilled, Java",507,0.03,0.694,0.139


In [25]:
#Define the initial states of the model to follow that of the data:

#Grab the initial distribution of states from the data
#(.values replaces get_values(), which was removed in pandas 1.0)
init_states = np.asarray(Stats.Prop_in_State.values, dtype='d') #np.array([0,0,0,1.0])

#The proportions were rounded to three decimals and need not sum exactly to one;
#renormalise so that the simulator receives a proper probability distribution
init_states = init_states/init_states.sum()

#number of households
HH = len(Data)

#Simulated Data to be filled by the simulator
Data_Sim = np.zeros((HH,4), dtype='uint16')

Run the simulation and time it

In [26]:
#Wall-clock timer for the simulation
time1 = time.time()

#Fills Data_Sim in place; nothing is returned
Sim_Model_CCP(Est_CCPs,tran_func,Data_Sim,init_states)

print('The model took '+str(time.time()-time1)+' seconds to simulate.')

The model took 0.0010519027709960938 seconds to simulate.


## Analyze Results from Simulation

Place the simulated data into a data frame to generate the statistics

In [27]:
#Label the four simulated columns and add a running household identifier.
#NOTE(review): Data_Sim is rebound from the raw array to a DataFrame here, so this cell
#is meant to run exactly once after the simulation cell.
Data_Sim = (pd.DataFrame(Data_Sim, columns=['Parent_State','Migrate','Educate','Child_State'])
              .reset_index()
              .rename(columns={'index':'Household'}) )

### Aggregate Statistics

Across whole Simulated dataset

In [28]:
print(Data_Sim.loc[:,['Migrate','Educate']].sum(axis=0)/len(Data_Sim))

Migrate    0.006584
Educate    0.515226
dtype: float64


Across the whole Empirical Dataset

In [29]:
print(Data.loc[:,['InterMarket_FamilyMig','Educ_3']].sum(axis=0)/len(Data))

InterMarket_FamilyMig    0.017284
Educ_3                   0.526200
dtype: float64


We see that unconditional means are well simulated

### By State (Person's Skill and Location)

From Simulation

In [30]:
#Per simulated parental state: number of households and mean migration / education
function = {'Household':'count','Migrate':'mean', 'Educate':'mean'}

Sim_Stats = Data_Sim.groupby('Parent_State',as_index=True).agg(function)
#Share of all simulated households in each state
Sim_Stats['Prop_in_State'] = Sim_Stats.Household/len(Data_Sim)

Merge with the original dataset for comparison

In [31]:
#Simulated rates (Migrate, Educate) next to the empirical ones, matched on the state code
(Sim_Stats.loc[:,['Migrate','Educate']].round(3).merge(Stats.loc[:,['InterMarket_FamilyMig','Educ_3']].round(3),
                                              left_index=True,right_index=True,copy=False)
                                       .rename(index=index))

Unnamed: 0_level_0,Migrate,Educate,InterMarket_FamilyMig,Educ_3
Parent_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Low-Skilled, Everywhere Else",0.0,0.464,0.014,0.482
"Low-Skilled, Java",0.0,0.496,0.01,0.461
"High-Skilled, Everywhere Else",0.039,0.603,0.045,0.742
"High-Skilled, Java",0.021,0.641,0.03,0.694


#Simulation CCPs

#Group the data according to the states and then collapse the data along the desired dimensions.

#Every (state, migration choice) pair. A pair that never occurs in the simulation
#(e.g. a state in which nobody migrates) is absent from the groupby result, which
#made the arrays below incompatible in shape; it is now kept as an explicit row.
State_Mig_Index = pd.MultiIndex.from_product([list(range(1,tot_states+1)),[0,1]],
                                             names=['Parent_State','Migrate'])

#Calculate the first conditional probability P(Educ={0,1} | Move={0,1}, State=S)
#(a missing pair is filled with 0; it is multiplied by P(Move|State)=0 below)
Educ_DF = (Data_Sim.groupby(['Parent_State','Migrate'])
                   .agg({'Educate':'mean'})
                   .reindex(State_Mig_Index, fill_value=0))
Educ_DF['No_Educ'] = 1 - Educ_DF.Educate
Educ_DF = Educ_DF.sort_index(axis=1, ascending=False)

#Calculate the second conditional probability P(Move={0,1}|State=S)
#(a state that was never drawn becomes a row of NaN rather than a shape error)
Mig_DF = (Data_Sim.groupby('Parent_State')
                  .agg({'Migrate':'mean'})
                  .reindex(list(range(1,tot_states+1))))
Mig_DF['No_Mig'] = 1 - Mig_DF.Migrate
Mig_DF = Mig_DF.sort_index(axis=1, ascending=False)

#Get the underlying numpy array from the DataFrame, reshape to broadcast multiplication
#(.values replaces get_values(), which was removed in pandas 1.0)
Mig_Prob = Mig_DF.loc[:,['No_Mig','Migrate']].values.reshape((2*tot_states,1))

Educ_Prob = Educ_DF.loc[:,['No_Educ','Educate']].values

#Create the CCP DataFrame
Columns = ['NoEduc_NoMig','Educ_NoMig','NoEduc_Mig','Educ_Mig']

Sim_CCPs = pd.DataFrame(np.round(Educ_Prob*Mig_Prob,3).reshape((tot_states,tot_decisions)),columns=Columns)

Sim_CCPs

Unfortunately, the conditional means of the decisions are not well modeled. This will need further work.

### Next Generation States

From the simulation

In [32]:
#Number and share of simulated households by the state of the next generation
Sim_Stats_Child = Data_Sim.groupby('Child_State',as_index=True).agg({'Household':'count'})
Sim_Stats_Child['Prop_in_State_Sim'] = Sim_Stats_Child.Household/len(Data_Sim)

From the data

In [33]:
#State of the child: market after the parents' move combined with the child's own skill level
Data['Child_State'] = Data.apply(lambda row: 
                                     State_Def(row['MarketCode_Move'],row['Skill_Level_2']), 
                                     axis=1)

#Number and share of observed households by child state
Stats_Child = Data.groupby('Child_State').agg({'Household':'count'})
Stats_Child['Prop_in_State_Data'] = Stats_Child.Household/len(Data)

Merge results

In [34]:
#Simulated and empirical shares side by side; both frames carry a 'Household' count column,
#which the merge suffixes with _x / _y and which is dropped here
(Sim_Stats_Child.merge(Stats_Child,left_index=True,
                       right_index=True,copy=False)
                .drop(['Household_x','Household_y'], inplace=False, axis=1)
                .rename(index=index))

Unnamed: 0_level_0,Prop_in_State_Sim,Prop_in_State_Data
Child_State,Unnamed: 1_level_1,Unnamed: 2_level_1
"Low-Skilled, Everywhere Else",0.221399,0.200823
"Low-Skilled, Java",0.263374,0.272977
"High-Skilled, Everywhere Else",0.213443,0.232099
"High-Skilled, Java",0.301783,0.294102


The model does capture the conditional distribution of states of the next generation (the children)

### Transition from Parent States to Child States

#### Flattened Transitions

Simulation Results

In [35]:
#Simulated households per (parent state, child state) pair, with labelled index
State_Tran_Sim = ( Data_Sim.groupby(['Parent_State','Child_State'],as_index=True)
                           .agg({'Household':'count'})
                           .rename(index=index) )

#Share of all simulated households (not conditional on the parent state)
State_Tran_Sim['Prop_in_State_Sim'] = State_Tran_Sim.Household/len(Data_Sim)

Data Results

In [36]:
#Observed households per (parent state, child state) pair, with labelled index
State_Tran_Data = (Data.groupby(['Parent_State','Child_State'],as_index=True)
                       .agg({'Household':'count'})
                       .rename(index=index) )

#Share of all observed households (not conditional on the parent state)
State_Tran_Data['Prop_in_State_Data'] = State_Tran_Data.Household/len(Data)

Merge the results

In [37]:
#Outer merge keeps pairs that occur in only one of the two sources; those show up as NaN
(State_Tran_Sim.merge(State_Tran_Data, how='outer', 
                      left_index=True, right_index=True, copy=False)
                .drop(['Household_x','Household_y'], axis=1, inplace=False)).round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prop_in_State_Sim,Prop_in_State_Data
Parent_State,Child_State,Unnamed: 2_level_1,Unnamed: 3_level_1
"High-Skilled, Everywhere Else","High-Skilled, Everywhere Else",0.053,0.064
"High-Skilled, Everywhere Else","High-Skilled, Java",0.003,0.003
"High-Skilled, Everywhere Else","Low-Skilled, Everywhere Else",0.036,0.022
"High-Skilled, Everywhere Else","Low-Skilled, Java",0.001,0.001
"High-Skilled, Java","High-Skilled, Everywhere Else",0.001,0.003
"High-Skilled, Java","High-Skilled, Java",0.089,0.094
"High-Skilled, Java","Low-Skilled, Everywhere Else",0.002,0.001
"High-Skilled, Java","Low-Skilled, Java",0.049,0.041
"Low-Skilled, Everywhere Else","High-Skilled, Everywhere Else",0.159,0.163
"Low-Skilled, Everywhere Else","High-Skilled, Java",,0.003


#### Using Pandas Crosstabs
Repeat the above exercises but using cross tabs to create a comparison table

In [38]:
#Observed transition shares; normalize='index' makes every parent-state row sum to one
Data_Cross = pd.crosstab(Data.Parent_State,Data.Child_State,normalize='index').rename(index=index, columns=index)

In [39]:
#Simulated transition shares; normalize='index' makes every parent-state row sum to one
Sim_Cross = pd.crosstab(Data_Sim.Parent_State,Data_Sim.Child_State,normalize='index').rename(index=index, columns=index)

In [40]:
#Simulated (_Sim) and observed (_Data) row shares side by side, matched on the parent state
Uncond_State_Tran = ( Sim_Cross.merge(Data_Cross,left_index=True, right_index=True,suffixes=('_Sim','_Data'))
                             .sort_index(axis=1, ascending=False)
                             .sort_index(axis=0, ascending=False) ).round(3)
#The two intermediate tables are removed, so this cell cannot be re-run on its own
del Data_Cross, Sim_Cross

Uncond_State_Tran

Child_State,"Low-Skilled, Java_Sim","Low-Skilled, Java_Data","Low-Skilled, Everywhere Else_Sim","Low-Skilled, Everywhere Else_Data","High-Skilled, Java_Sim","High-Skilled, Java_Data","High-Skilled, Everywhere Else_Sim","High-Skilled, Everywhere Else_Data"
Parent_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Low-Skilled, Java",0.504,0.536,0.0,0.003,0.496,0.455,0.0,0.006
"Low-Skilled, Everywhere Else",0.0,0.006,0.536,0.512,0.0,0.008,0.464,0.474
"High-Skilled, Java",0.348,0.296,0.012,0.01,0.631,0.675,0.01,0.02
"High-Skilled, Everywhere Else",0.009,0.009,0.388,0.248,0.03,0.036,0.573,0.706


The above DataFrame shows the persistence of states between generations. We see that the model generally does well with the low-skilled cohorts in Generation G (the rows), but does not model well the incentive to invest in education among the high-skilled cohorts. 

# Counterfactuals

## Reduction of migration and education costs - affecting the channels with "subsidies"

In [41]:
#Functions for creating dummies of the parents' education and location in the simulation datasets

def Parent_Educ(State):
    """Return 0 for states 1 and 2 and 1 for every other state."""
    return 0 if State in (1,2) else 1

def Parent_Location(State):
    """Return 'Elsewhere' for states 1 and 3 and 'Jawa' for every other state."""
    return 'Elsewhere' if State in (1,3) else 'Jawa'


def Add_Info(In_Data):
    """Add the 'Parent_Educ' and 'Parent_Loc' columns derived from 'Parent_State'.

    The frame is modified in place and also returned.
    """
    parent_state = In_Data['Parent_State']

    In_Data['Parent_Educ'] = parent_state.map(Parent_Educ)
    In_Data['Parent_Loc'] = parent_state.map(Parent_Location)

    return(In_Data)

In [42]:
#Function creating Statistics Datasets

def Stats_Datasets(In_Data):
    """Summarise a simulated dataset.

    Returns
    -------
    (Prop_Stats, Rates_Stats)
        Prop_Stats  : per (Parent_State, Child_State) pair, the household count
                      and its share of all rows ('Prop_in_State_Sim', 3 decimals).
        Rates_Stats : per Parent_State, the mean of 'Migrate' and 'Educate'
                      (3 decimals).
    """
    pair_groups = In_Data.groupby(['Parent_State','Child_State'],as_index=True)

    #Counts per transition pair and their share of the whole dataset
    Prop_Stats = pair_groups[['Household']].count()
    Prop_Stats['Prop_in_State_Sim'] = (Prop_Stats.Household/len(In_Data)).round(3)

    #Migration and education rates per parental state
    Rates_Stats = (In_Data.groupby('Parent_State',as_index=True)[['Migrate','Educate']]
                          .mean()
                          .round(3))

    return(Prop_Stats,Rates_Stats)
    

In [43]:
#Function generating OLS regressions of the intergenerational transition matrix (persistency between generation states)

def Intergen_Mobil(In_Data):
    """Regress 'Educate' on 'Parent_Educ' by OLS and return the fitted results.

    Standard errors are clustered on 'Parent_Loc' and t-based inference is used.
    """
    model = sm.ols(formula="Educate ~ Parent_Educ", data=In_Data)
    cluster_options = {'groups': In_Data['Parent_Loc']}

    return model.fit(cov_type='cluster', cov_kwds=cluster_options, use_t=True)
    

In [44]:
def _Simulate_Counterfactual(Params):
    """Simulate one generation under the parameter vector Params.

    Returns the simulated households as a DataFrame with the columns
    Household, Parent_State, Migrate, Educate and Child_State, plus the
    Parent_Educ and Parent_Loc columns added by Add_Info.
    """
    Counter_CCPs = CCP(Params)

    #The simulator only ever sets the dummies to 1, so it needs a zero-filled array
    Sim_Counter = np.zeros((HH,4), dtype='uint16')

    Sim_Model_CCP(Counter_CCPs,tran_func,Sim_Counter,init_states)

    Sim_Counter = (pd.DataFrame(Sim_Counter, columns=['Parent_State','Migrate','Educate','Child_State'])
                  .reset_index()
                  .rename(columns={'index':'Household'}) )

    return Add_Info(Sim_Counter)


#Results keyed by the factor applied to the cost (0 means the cost is removed entirely)
Mig_Datasets = {}
Mig_Prop_Datasets = {}
Mig_Rates_Datasets = {}
Mig_Educ_Mobility = {}

subsidy_mig = [0.3,0.2,0.1,0]

Educ_Datasets = {}
Educ_Prop_Datasets = {}
Educ_Rates_Datasets = {}
Educ_Educ_Mobility = {}

subsidy_educ = [0.75,0.5,0.25,0]

#Scale the moving cost
for reduct in subsidy_mig:
    
    Params = Param_Final.x.copy()

    Params[0] = Params[0]*reduct
    
    #Save
    Mig_Datasets[reduct] = _Simulate_Counterfactual(Params)
    
    #Stats Data
    Mig_Prop_Datasets[reduct], Mig_Rates_Datasets[reduct] = Stats_Datasets(Mig_Datasets[reduct])
    
    #Run OLS Regressions and grab the paramater values
    Mig_Educ_Mobility[reduct] = Intergen_Mobil(Mig_Datasets[reduct])

#Scale both education costs
for reduct in subsidy_educ:
    
    Params = Param_Final.x.copy()

    Params[1:3] = Params[1:3]*reduct
    
    #Save
    Educ_Datasets[reduct] = _Simulate_Counterfactual(Params)
    
    #Stats Data
    Educ_Prop_Datasets[reduct], Educ_Rates_Datasets[reduct] = Stats_Datasets(Educ_Datasets[reduct])
    
    #Run OLS Regressions and grab the paramater values
    Educ_Educ_Mobility[reduct] = Intergen_Mobil(Educ_Datasets[reduct])

  from pandas.core import datetools


In [45]:
# Add the parental education dummy and location label, derived from the parent state,
# to the baseline simulation (Add_Info modifies the frame in place and returns it)

Data_Sim = Add_Info(Data_Sim)

In [46]:
#Baseline mobility regression, using the same helper as the counterfactual datasets
Mobility_Base_line = Intergen_Mobil(Data_Sim)

In [47]:
#One column per regression: the baseline first, then the migration-cost runs and the
#education-cost runs, each in the iteration order of its dictionary
test_1 = summary_col([Mobility_Base_line]+ \
                     [v for k,v in Mig_Educ_Mobility.items()]+ \
                     [v for k,v in Educ_Educ_Mobility.items()],stars=False,float_format='%0.3f')


In [48]:
test_1

0,1,2,3,4,5,6,7,8,9
,Educate I,Educate II,Educate III,Educate IIII,Educate IIIII,Educate IIIIII,Educate IIIIIII,Educate IIIIIIII,Educate IIIIIIIII
Intercept,0.482,0.469,0.474,0.517,0.543,0.568,0.611,0.637,0.702
,(0.016),(0.007),(0.014),(0.005),(0.003),(0.004),(0.015),(0.008),(0.001)
Parent_Educ,0.144,0.203,0.196,0.156,0.158,0.071,0.077,0.063,0.006
,(0.002),(0.007),(0.015),(0.000),(0.014),(0.010),(0.007),(0.015),(0.012)


In [49]:
# Run the overall intergenerational correlations from the various datasets

Educ_Datasets[0][['Parent_Educ','Educate']].corr()

Unnamed: 0,Parent_Educ,Educate
Parent_Educ,1.0,0.005932
Educate,0.005932,1.0
