# Problem Set 4

### Paul G. Freed

In [294]:
#A number of imports are needed for both MLE and OLS measurements
import pandas as pd
import numpy as np
import scipy.optimize as opt
from scipy.optimize import minimize #to do the minimizing
import scipy.stats as stats
import statsmodels.api as sm #for OLS regressions

In [233]:
#Read in the data 
PS4 = pd.read_stata(r'\Users\MSB\Desktop\PS4_data.dta')


# Questions 1 & 2

In [284]:
#Question 1
#Here I select only the male heads of household who are between 25 and 60 and earn wages greater than $7 an hour

PS4 = PS4[(PS4["hsex"] == 1) 
                              & (PS4["age"] > 25) 
                              & (PS4["age"] < 60)
                              & (PS4["hlabinc"] / PS4["hannhrs"] > 7)]


In [287]:
#Question 2
#Here I create the nessecary variables in order to perform the MLE


#Creating Age Variables 
PS4['agesq'] = (PS4['age']**2)

#Creating Race Dummies
PS4['black'] = (PS4['hrace'] == 1 ).astype(int)
PS4['hispanic'] = (PS4['hrace'] == 5).astype(int) # There does not appear to be any hispanics in the data, but I make the variable anyway
PS4['other'] = ((PS4['hrace']!=7)&(PS4['hrace']!=6)).astype(int)

#Generating wage variables 
PS4['wages'] = PS4['hlabinc']/PS4['hannhrs']
PS4['ln_wages'] = np.log(PS4['wages'])

#Dropping missing values in wages and my vars, this seems to mess up the estimators
PS4 = PS4[pd.notnull(PS4['ln_wages'])
          &pd.notnull(PS4['hyrsed'])
          &pd.notnull(PS4['agesq'])
          &pd.notnull(PS4['hrace'])]


#Keeping only the years I want 
PS4_1971 = PS4.loc[PS4['year'] == 1971]
PS4_1980 = PS4.loc[PS4['year'] == 1980]
PS4_1990 = PS4.loc[PS4['year'] == 1990]
PS4_2000 = PS4.loc[PS4['year'] == 2000]


# Question 3

In [65]:
#For each of the years, I employ a MLE estimation, and then I employ an OLS linear regression to check the validity of my estimates

In [18]:
#Depending on the type of optimizer, the accuracy varies 

In [29]:
#1970s 

In each case, I write out the fuction I am estimating under "The Formula", with "Outcome" as ln(wages). It is the same formula for each, and the same one as written on the homework sheet

In [288]:
def MLElinear(params):
    Alpha = params[0]
    Beta1 = params[1]
    Beta2 = params[2]
    Beta3 = params[3]
    Beta4 = params[4]
    Beta5 = params[5]
    Beta6 = params[6]
    SD = params[7]
    
    #The Formula 
    Outcome = Alpha + Beta1 * PS4_1971['hyrsed'] + Beta2 * PS4_1971['age'] + Beta3 * PS4_1971['agesq'] + Beta4 * PS4_1971['black']  + Beta5 * PS4_1971['hispanic'] + Beta6 * PS4_1971['other']
    
    #The Fuction
    LL = -np.sum( stats.norm.logpdf(PS4_1971['ln_wages'], loc=Outcome, scale=SD))
    
    return(LL)

#My guesses for the parameters 
guesses = [0.31, 0.06, 0.06, 0, 0.23, 0, 0.094, 1]

#I employ both the SLSQP, LBFGSB, and the Nelder Mead method, but I find that the Nelder Mead method gives results much closer to the OLS estimates
results70s = opt.minimize(MLElinear, guesses, method='Nelder-Mead')

print(results70s)

 final_simplex: (array([[ 0.35135435,  0.06685428,  0.06512957, -0.00062826,  0.13005585,
        -0.00146773,  0.11286653,  0.40937977],
       [ 0.35133139,  0.06686052,  0.0651261 , -0.00062819,  0.13004446,
        -0.00146774,  0.11286795,  0.40936795],
       [ 0.35135981,  0.06685011,  0.06513227, -0.00062827,  0.12999583,
        -0.00146909,  0.11287437,  0.40938269],
       [ 0.35134055,  0.06685762,  0.06512717, -0.00062822,  0.13005888,
        -0.00146752,  0.11286598,  0.40938872],
       [ 0.35135329,  0.06685585,  0.06512596, -0.00062821,  0.13010414,
        -0.0014666 ,  0.1128607 ,  0.40938938],
       [ 0.35134106,  0.06685781,  0.06512715, -0.00062822,  0.13006065,
        -0.00146747,  0.11286552,  0.40938522],
       [ 0.35138627,  0.06684951,  0.06512916, -0.00062828,  0.13014828,
        -0.00146607,  0.1128558 ,  0.40938897],
       [ 0.35136141,  0.06685313,  0.06512899, -0.00062826,  0.13008404,
        -0.00146716,  0.11286317,  0.40938292],
       [ 0.3513

In [237]:
#Quick OLS Sanity Check

import pandas as pd
import statsmodels.formula.api as sm
result70sOLS = sm.ols(formula="ln_wages ~ hyrsed + age + agesq + black + hispanic + other", data=PS4_1971).fit()
print(result70sOLS.params)
    

Intercept    0.319711
hyrsed       0.066975
age          0.067488
agesq       -0.000656
black        0.225637
hispanic     0.000000
other        0.094074
dtype: float64


In [130]:
#

In [188]:
#1980s 

In [289]:
def MLElinear(params):
    Alpha = params[0]
    Beta1 = params[1]
    Beta2 = params[2]
    Beta3 = params[3]
    Beta4 = params[4]
    Beta5 = params[5]
    Beta6 = params[6]
    SD = params[7]
    
    #The Formula 
    Outcome = Alpha + Beta1 * PS4_1980['hyrsed'] + Beta2 * PS4_1980['age'] + Beta3 * PS4_1980['agesq'] + Beta4 * PS4_1980['black']  + Beta5 * PS4_1980['hispanic'] + Beta6 * PS4_1980['other']
    
    #The Fuction
    LL = -np.sum( stats.norm.logpdf(PS4_1980['ln_wages'], loc=Outcome, scale=SD))
    
    return(LL)

#My guesses for the parameters 
guesses = [0.31, 0.06, 0.06, 0, 0.23, 0, 0.094, 1]

#I employ both the SLSQP, LBFGSB, and the Nelder Mead method, but I find that the Nelder Mead method gives results much closer to the OLS estimates
results80s = opt.minimize(MLElinear, guesses, method='Nelder-Mead')

print(results80s)

 final_simplex: (array([[ 7.69898136e-01,  6.70448179e-02,  3.63164119e-02,
        -2.84645804e-04,  6.24557240e-02,  1.02331874e-02,
         3.34162238e-01,  4.49985340e-01],
       [ 7.69946016e-01,  6.70435828e-02,  3.63134570e-02,
        -2.84609584e-04,  6.24698491e-02,  1.02336380e-02,
         3.34170463e-01,  4.49984922e-01],
       [ 7.69973489e-01,  6.70468196e-02,  3.63098656e-02,
        -2.84565608e-04,  6.24712077e-02,  1.02341304e-02,
         3.34177335e-01,  4.49985016e-01],
       [ 7.69839982e-01,  6.70471289e-02,  3.63181218e-02,
        -2.84661245e-04,  6.24641506e-02,  1.02324884e-02,
         3.34139259e-01,  4.49985895e-01],
       [ 7.69920105e-01,  6.70448484e-02,  3.63147795e-02,
        -2.84623651e-04,  6.24562231e-02,  1.02334774e-02,
         3.34168939e-01,  4.49983615e-01],
       [ 7.69946110e-01,  6.70442312e-02,  3.63135175e-02,
        -2.84611440e-04,  6.24559801e-02,  1.02338053e-02,
         3.34177417e-01,  4.49981618e-01],
       [ 7.698722

In [190]:
#Quick OLS Sanity Check

import pandas as pd
import statsmodels.formula.api as sm
result80sOLS = sm.ols(formula="ln_wages ~ hyrsed + age + agesq + black + hispanic + other", data=PS4_1980).fit()
print(result80sOLS.params)
 

Intercept    0.756720
hyrsed       0.067038
age          0.036328
agesq       -0.000285
black        0.409655
hispanic     0.000000
other        0.347065
dtype: float64


In [191]:
#

In [192]:
#1990s


In [290]:
def MLElinear(params):
    Alpha = params[0]
    Beta1 = params[1]
    Beta2 = params[2]
    Beta3 = params[3]
    Beta4 = params[4]
    Beta5 = params[5]
    Beta6 = params[6]
    SD = params[7]
    
    #The Formula 
    Outcome = Alpha + Beta1 * PS4_1990['hyrsed'] + Beta2 * PS4_1990['age'] + Beta3 * PS4_1990['agesq'] + Beta4 * PS4_1990['black']  + Beta5 * PS4_1990['hispanic'] + Beta6 * PS4_1990['other']
    
    #The Fuction
    LL = -np.sum( stats.norm.logpdf(PS4_1990['ln_wages'], loc=Outcome, scale=SD))
    
    return(LL)

#My guesses for the parameters 
guesses = [0.31, 0.06, 0.06, 0, 0.23, 0, 0.094, 1]

#I employ both the SLSQP, LBFGSB, and the Nelder Mead method, but I find that the Nelder Mead method gives results much closer to the OLS estimates
results90s = opt.minimize(MLElinear, guesses, method='Nelder-Mead')

print(results90s)

 final_simplex: (array([[ 2.62749835e-01,  9.80586465e-02,  4.27181531e-02,
        -3.47126362e-04,  1.38192132e-01,  1.66404023e-03,
         1.27187169e-01,  4.79877970e-01],
       [ 2.62755724e-01,  9.80657743e-02,  4.27130570e-02,
        -3.47072331e-04,  1.38205262e-01,  1.66309659e-03,
         1.27184210e-01,  4.79894780e-01],
       [ 2.62743599e-01,  9.80656312e-02,  4.27139574e-02,
        -3.47085104e-04,  1.38178974e-01,  1.66361937e-03,
         1.27192249e-01,  4.79895389e-01],
       [ 2.62758354e-01,  9.80554126e-02,  4.27194109e-02,
        -3.47141222e-04,  1.38207886e-01,  1.66393668e-03,
         1.27182536e-01,  4.79884136e-01],
       [ 2.62760679e-01,  9.80565507e-02,  4.27165556e-02,
        -3.47085436e-04,  1.38217591e-01,  1.66356105e-03,
         1.27181373e-01,  4.79894732e-01],
       [ 2.62766507e-01,  9.80569380e-02,  4.27185162e-02,
        -3.47139231e-04,  1.38225161e-01,  1.66349619e-03,
         1.27176839e-01,  4.79885217e-01],
       [ 2.627882

In [196]:
#Quick OLS Sanity Check

import pandas as pd
import statsmodels.formula.api as sm
result90sOLS = sm.ols(formula="ln_wages ~ hyrsed + age + agesq + black + hispanic + other", data=PS4_1990).fit()
print(result90sOLS.params)

Intercept    0.245714
hyrsed       0.098398
age          0.046937
agesq       -0.000397
black        0.193006
hispanic     0.000000
other        0.052708
dtype: float64


In [292]:
#2000s

PS4_2000 = PS4_2000[(PS4_2000["ln_wages"] < 1000)]

#For some reason there was a infinite value for the year 2000, so I had to get rid of it

In [272]:
#2000s


In [293]:


def MLElinear(params):
    Alpha = params[0]
    Beta1 = params[1]
    Beta2 = params[2]
    Beta3 = params[3]
    Beta4 = params[4]
    Beta5 = params[5]
    Beta6 = params[6]
    SD = params[7]
    
    #The Formula 
    Outcome = Alpha + Beta1 * PS4_2000['hyrsed'] + Beta2 * PS4_2000['age'] + Beta3 * PS4_2000['agesq'] + Beta4 * PS4_2000['black']  + Beta5 * PS4_2000['hispanic'] + Beta6 * PS4_2000['other']
    
    #The Fuction
    LL = -np.sum( stats.norm.logpdf(PS4_2000['ln_wages'], loc=Outcome, scale=SD))
    
    return(LL)

#My guesses for the parameters 
guesses = [-0.351, 0.11, 0.09, -0.0009, -0.0833, 0, -0.267, 1]

#I employ both the SLSQP, LBFGSB, and the Nelder Mead method, but I find that the Nelder Mead method gives results much closer to the OLS estimates
results00s = opt.minimize(MLElinear, guesses, method='Nelder-Mead')

print(results00s)

 final_simplex: (array([[ 5.47190627e-01,  1.11965811e-01,  9.00201804e-02,
        -9.52194204e-04,  1.84105112e-01,  1.94553417e-02,
        -1.16618762e+00,  5.36247420e-01],
       [ 5.47162992e-01,  1.11965643e-01,  9.00206546e-02,
        -9.52195555e-04,  1.84100228e-01,  1.94547244e-02,
        -1.16617053e+00,  5.36246595e-01],
       [ 5.47166896e-01,  1.11966458e-01,  9.00210792e-02,
        -9.52207226e-04,  1.84107231e-01,  1.94546904e-02,
        -1.16618866e+00,  5.36247483e-01],
       [ 5.47257996e-01,  1.11967664e-01,  9.00160032e-02,
        -9.52142684e-04,  1.84104505e-01,  1.94571371e-02,
        -1.16619907e+00,  5.36248390e-01],
       [ 5.47224957e-01,  1.11964923e-01,  9.00188800e-02,
        -9.52174467e-04,  1.84102427e-01,  1.94562773e-02,
        -1.16618711e+00,  5.36247826e-01],
       [ 5.47191377e-01,  1.11965277e-01,  9.00200898e-02,
        -9.52190556e-04,  1.84102747e-01,  1.94554054e-02,
        -1.16618174e+00,  5.36248061e-01],
       [ 5.471760

In [277]:
#Quick OLS Sanity Check

import pandas as pd
import statsmodels.formula.api as sm
result00sOLS = sm.ols(formula="ln_wages ~ hyrsed + age + agesq + black + hispanic + other", data=PS4_2000).fit()
print(result00sOLS.params)

#Pretty close

Intercept   -0.351091
hyrsed       0.111955
age          0.090008
agesq       -0.000952
black       -0.083393
hispanic     0.000000
other       -0.267698
dtype: float64


In [280]:
#

In [281]:
#


# Question 4

#### I find in 1971 the estimate is 0.067, in 1980 the estimate is about 0.067, in 1990 the estimate is about 0.098, and in 2000 the estimate is about 0.11. This suggests that in 1971, a one year increase in education is associated with about a 6.7% increase in wages, while in 2000 a one year increase in education corresponds to a 11% increase in wages. Overall, this suggests the role of education in determining wages is increasing over time, which has possible implications for rising inequality in the United States.