In [1]:
# Nora Nickels
# MACS POCM Winter 2018
# HW 2

import numpy as np
import scipy.stats as sts
import matplotlib.pyplot as plt

# This next command is specifically for Jupyter Notebook
%matplotlib notebook

# 1. Some income data, lognormal distribution, and hypothesis testing (6 points)

# Load incomes data
incomes = np.loadtxt('incomes.txt')

In [2]:
# a. Plot histogram 
count, bins, ignored = plt.hist(incomes, 30, edgecolor='black', normed=True)
plt.title('CSS Annual Incomes', fontsize=20)
plt.xlabel('U.S. Dollars')
plt.ylabel('Percent of Income Observations')
plt.xlim([0, 150000])  # This gives the xmin and xmax to be plotted"
# Don't set this figure until have played with the figure below

<IPython.core.display.Javascript object>

(0, 150000)

In [3]:
# b. Plot lognormal PDF and generate log likelihood value
# Define function that generates values of a potentially lognormal
# probability density function (PDF)
def lognorm_pdf(xvals, mu, sigma, cutoff):
    '''
    --------------------------------------------------------------------
    Generate pdf values from the truncated normal pdf with mean mu and
    standard deviation sigma. If the cutoff is finite, then the PDF
    values are inflated upward to reflect the zero probability on values
    above the cutoff. If there is no cutoff given or if it is given as
    infinity, this function does the same thing as
    sp.stats.norm.pdf(x, loc=mu, scale=sigma).
    --------------------------------------------------------------------
    INPUTS:
    xvals  = (N,) vector, values of the normally distributed random
             variable
    mu     = scalar, mean of the normally distributed random variable
    sigma  = scalar > 0, standard deviation of the normally distributed
             random variable
    cutoff = scalar or string, ='None' if no cutoff is given, otherwise
             is scalar upper bound value of distribution. Values above
             this value have zero probability
    
    OTHER FUNCTIONS AND FILES CALLED BY THIS FUNCTION: None
    
    OBJECTS CREATED WITHIN FUNCTION:
    prob_notcut = scalar 
    pdf_vals = (N,) vector, normal PDF values for mu and sigma
               corresponding to xvals data
    
    FILES CREATED BY THIS FUNCTION: None
    
    RETURNS: pdf_vals
    --------------------------------------------------------------------
    '''
    if cutoff == 'None':
        prob_notcut = 1.0 - sts.lognorm.cdf(0, loc=mu, s=sigma, scale=1)
    else:
        prob_notcut = (sts.lognorm.cdf(cutoff, loc=mu, s=sigma, scale=1) -
                       sts.lognorm.cdf(0, loc=mu, s=sigma, scale=1))
            
    pdf_vals    = ((1/(xvals * sigma * np.sqrt(2 * np.pi)) *
                    np.exp( - (np.log(xvals) - mu)**2 / (2 * sigma**2))) /
                    prob_notcut)
    
    return pdf_vals

dist_inc = np.linspace(1e-10, 150000, 200)
mu_1 = 11
sig_1 = 0.5

plt.figure()
count, bins, ignored = plt.hist(incomes, 30, edgecolor='black', normed=True)
plt.title('CSS Annual Incomes', fontsize=20)
plt.xlabel('U.S. Dollars')
plt.ylabel('Percent of Income Observations')
plt.xlim([0, 150000])
plt.plot(dist_inc, lognorm_pdf(dist_inc, mu_1, sig_1, 'None'),
         linewidth=2, color='r', label='1: $\mu$=11,$\sigma$=0.5')
plt.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>

In [4]:
# b. continued
# Define log likelihood function for the lognormal distribution
def log_lik_lognorm(xvals, mu, sigma, cutoff):
    pdf_vals = lognorm_pdf(xvals, mu, sigma, cutoff)
    ln_pdf_vals = np.log(pdf_vals)
    log_lik_val = ln_pdf_vals.sum()
    
    return log_lik_val

print('Log-likelihood 1: ', log_lik_lognorm(incomes, mu_1, sig_1, 'None'))

Log-likelihood 1:  -2385.85699781


In [11]:
# c. Estimate the parameters of the lognormal distribution by MLE and plot PDF against PDF and histogram from parts a and b.

def crit(params, *args):
    mu, sigma = params
    xvals, cutoff = args
    log_lik_val = log_lik_lognorm(xvals, mu, sigma, cutoff)
    neg_log_lik_val = -log_lik_val
    
    return neg_log_lik_val

In [12]:
# Report the ML estimates for mu and sigma.

import scipy.optimize as opt

mu_init = 11  # mu_1
sig_init = 0.2  # sig_1
params_init = np.array([mu_init, sig_init])
mle_args = (incomes, 150000)
results = opt.minimize(crit, params_init, args=(mle_args), method='L-BFGS-B',
                       bounds=((0.1, None), (0.1, None)))
mu_MLE, sig_MLE = results.x
print('mu_MLE=', mu_MLE, ' sig_MLE=', sig_MLE)
results

mu_MLE= 11.3590229932  sig_MLE= 0.208177322386


      fun: 2241.7193013573587
 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
      jac: array([ -4.54747351e-05,  -4.54747351e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 48
      nit: 7
   status: 0
  success: True
        x: array([ 11.35902299,   0.20817732])

In [13]:
# Plot the PDF against PDF and histogram from part a and b. 

plt.figure()

count, bins, ignored = plt.hist(incomes, 30, edgecolor='black', normed=True)
plt.title('CSS Annual Incomes', fontsize=20)
plt.xlabel('U.S. Dollars')
plt.ylabel('Percent of Income Observations')
plt.xlim([0, 150000])
plt.plot(dist_inc, lognorm_pdf(dist_inc, mu_1, sig_1, 'None'),
         linewidth=2, color='r', label='1: $\mu$=11,$\sigma$=0.5')
plt.plot(dist_inc, lognorm_pdf(dist_inc, mu_MLE, sig_MLE, 150000),
         linewidth=2, color='g', label='2: $\mu$=11.359,$\sigma$=0.208')
plt.legend(loc='upper left')

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x11a6c0e10>

In [14]:
# Report the value of likelihood.

print('Log-likelihood 1: ', log_lik_lognorm(incomes, mu_1, sig_1, 150000))
print('MLE log-likelihood 3: ', log_lik_lognorm(incomes, mu_MLE, sig_MLE, 150000))

Log-likelihood 1:  -2385.85699781
MLE log-likelihood 3:  -2241.71930136


In [15]:
# Variance covariance matrix.

results
OffDiagNeg = np.array([[1, -1], [-1, 1]])
vcv_mle = results.hess_inv * OffDiagNeg
stderr_mu_mle = np.sqrt(vcv_mle[0,0])
stderr_sig_mle = np.sqrt(vcv_mle[1,1])
print('VCV(MLE) = ', vcv_mle)
print('Standard error for mu estimate = ', stderr_mu_mle)
print('Standard error for sigma estimate = ', stderr_sig_mle)

VCV(MLE) =  [[ 0.00021758 -0.00021758]
 [-0.00010926  0.00010926]]
Standard error for mu estimate =  0.0147504340304
Standard error for sigma estimate =  0.0104529545672


In [16]:
# d. Perform a likelihood ratio test. 

log_lik_h0 = log_lik_lognorm(incomes, mu_1, sig_1, 150000)
log_lik_mle = log_lik_lognorm(incomes, mu_MLE, sig_MLE, 150000)
LR_val = 2 * (log_lik_mle - log_lik_h0)
pval_h0 = 1.0 - sts.chi2.cdf(LR_val, 2)
print('chi squared of H0 with 2 degrees of freedom p-value = ', pval_h0)
print(LR_val)

chi squared of H0 with 2 degrees of freedom p-value =  0.0
288.275392902


In [17]:
# e. What's the probability I'll earn more than $100k? What's the probability I'll earn less than $75k?

from scipy.integrate import quad
above_100000 = quad(lognorm_pdf, 100000, 150000, args = (mu_MLE, sig_MLE, 150000))
below_75000 = quad(lognorm_pdf, 1e-10, 75000, args = (mu_MLE, sig_MLE, 150000))
print("Probability", above_100000[0])
print("Probability", below_75000[0])

Probability 0.22626173118490298
Probability 0.260234275910117


In [None]:
# Variance Covariance matrix. basically same exact code as above.

# results
# OffDiagNeg = np.eye(5)*2-1
# vcv_mle = results.hess_inv.todense() * OffDiagNeg
# print('VCV(MLE) = ', vcv_mle)

In [None]:
# b. Likelihood ratio test. also basically same exact code as part 1.

# b0, b1, b2, b3, sigma = (0.1, 0.1, 0.1, 0.1, 0.1)
# params_init = np.array([b0, b1, b2, b3, sigma])

# log_lik_h0 = log_lik_value(params_init, variables, n)
# log_lik_mle = log_lik_value(results.x, variables, n)
# LR_val = 2 * (log_lik_mle - log_lik_h0)
# pval_h0 = 1.0 - sts.chi2.cdf(LR_val, 5)
# print('chi squared of H0 with 2 degrees of freedom p-value = ', pval_h0)
# print(LR_val)

In [None]:
# Chi squared p value is essentially zero so variables do have an effect on sick days. How do I write text without annotating with a hashtag ...

import numpy as np
import scipy.stats as sts
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
from matplotlib.ticker import MultipleLocator

sick_data = pd.read_csv("sick.txt")
list(sick_data)

variables = sick, age, children, temp
n = len(sickdata)
print(n)

def get_error(sick, age, children, temp_winter, b0, b1, b2, b3):
    error_val = sick- b0 - b1 * age - b2 * children - b3 * temp_winter
    return error_val

def loglik_norm(sick, age, children, temp_winter, b0, b1, b2, b3, sigma):
    error_val = get_error(sick, age, children, temp_winter, b0, b1, b2, b3)
    N = len(error_val)
    log_lik_val = (-N/2) * np.log(2 * np.pi)-(N/2)*np.log(sigma**2)-\
                  (1/(2*sigma**2)) * np.sum(error_val**2)  
    return log_lik_val 

def crit_2(params,*args):
    b0, b1, b2, b3, sigma = params
    sick, age, children, temp_winter = args
    log_lik_val = loglik_norm(sick, age, children, temp_winter, b0, b1, b2, b3, sigma)
    
    return -log_lik_val

import numpy as np
import scipy.stats as sts
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
from matplotlib.ticker import MultipleLocator

sick_data = pd.read_csv("sick.txt")
list(sick_data)

rename variables
sick_data.rename(columns={'\ufeffsick': 'sick'}, inplace=True)
sick_data.rename(columns={'avgtemp_winter': 'temp'}, inplace=True)
sick_data.head()

def get_error(sick, age, children, temp, b0, b1, b2, b3):
    error_val = sick - b0 - b1 * age - b2 * children - b3 * temp
    return error_val

def loglik_norm(sick, age, children, temp, b0, b1, b2, b3, sigma):
    error_val = get_error(sick, age, children, temp, b0, b1, b2, b3)
    N = len(error_val)
    log_lik_val = (-N/2) * np.log(2 * np.pi)-(N/2)*np.log(sigma**2)-\
                  (1/(2*sigma**2)) * np.sum(error_val**2)  
    return log_lik_val 

def crit_2(params,*args):
    b0, b1, b2, b3, sigma = params
    sick, age, children, temp = args
    log_lik_val = loglik_norm(sick, age, children, temp, b0, b1, b2, b3, sigma)
    
    return -log_lik_val

params_init =  np.array([1,0,0,0,0.1])
mle_args = (sick, age, children, temp)
results2 = opt.minimize(crit_2, params_init, args=(mle_args), method = 'L-BFGS-B', 
                       bounds = ((None, None), (None, None), (None, None), \
                                 (None, None), (1e-100, None)))
b02, b12, b22, b32, sigma2 = results2.x
loglik2 = loglik_norm(sick, age, children, temp, b02, b12, b22, b32, sigma2)
print ('The  are', results2.x, 'for b0, b1, b2, b3 and sigma')
print ('The log likelihood is', loglik2)

In [None]:
# Problem 2, completed.

In [18]:
import numpy as np
import scipy.stats as sts
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
from matplotlib.ticker import MultipleLocator

sick_data = pd.read_csv("sick.txt")
# rename variables
sick_data.rename(columns={'\ufeffsick': 'sick'}, inplace=True)
sick_data.rename(columns={'avgtemp_winter': 'temp'}, inplace=True)
sick_data.head()

Unnamed: 0,sick,age,children,temp
0,1.67,57.47,3.04,54.1
1,0.71,26.77,1.2,36.54
2,1.39,41.85,2.31,32.38
3,1.37,51.27,2.46,52.94
4,1.45,44.22,2.72,45.9


In [19]:
def get_error(sick, age, children, temp, b0, b1, b2, b3):
    error_val = sick- b0 - b1 * age - b2 * children - b3 * temp
    return error_val

def loglik_norm(sick, age, children, temp, b0, b1, b2, b3, sigma):
    error_val = get_error(sick, age, children, temp, b0, b1, b2, b3)
    N = len(error_val)
    log_lik_val = (-N/2) * np.log(2 * np.pi)-(N/2)*np.log(sigma**2)-\
                  (1/(2*sigma**2)) * np.sum(error_val**2)  
    return log_lik_val 

def crit_2(params,*args):
    b0, b1, b2, b3, sigma = params
    sick, age, children, temp = args
    log_lik_val = loglik_norm(sick, age, children, temp, b0, b1, b2, b3, sigma)
    
    return -log_lik_val

In [20]:
sick = sick_data.sick
age = sick_data.age
children = sick_data.children
temp = sick_data.temp
params_init =  np.array([1,0,0,0,0.1])
mle_args = (sick, age, children, temp)

results2 = opt.minimize(crit_2, params_init, args=(mle_args), method = 'L-BFGS-B', 
                       bounds = ((None, None), (None, None), (None, None), \
                                 (None, None), (1e-100, None)))
b02, b12, b22, b32, sigma2 = results2.x
loglik2 = loglik_norm(sick, age, children, temp, b02, b12, b22, b32, sigma2)
print ('The  are', results2.x, 'for b0, b1, b2, b3 and sigma')
print ('The log likelihood is', loglik2)

The  are [ 0.25164474  0.01293344  0.40050126 -0.00999169  0.00301766] for b0, b1, b2, b3 and sigma
The log likelihood is 876.865063044


In [21]:
OffDiagNeg2 = np.eye(5)*2-1
vcv2 =results2.hess_inv.todense() * OffDiagNeg2
print('VCV(MLE) = ', vcv2)

VCV(MLE) =  [[  4.78307511e+02   3.34808215e+01  -4.57002084e+02  -1.62711922e+00
   -1.06072029e+02]
 [  3.34808215e+01   2.41782273e+00   3.29824240e+01   1.45410683e-01
    7.49926624e+00]
 [ -4.57002084e+02   3.29824240e+01   4.49930947e+02  -1.97628763e+00
   -1.02342394e+02]
 [ -1.62711922e+00   1.45410683e-01  -1.97628763e+00   1.89178932e-02
   -3.92423814e-01]
 [ -1.06072029e+02   7.49926624e+00  -1.02342394e+02  -3.92423814e-01
    2.35976466e+01]]


In [22]:
results2

      fun: -876.86506304365412
 hess_inv: <5x5 LbfgsInvHessProduct with dtype=float64>
      jac: array([ -0.78488256, -35.56260708,  -1.23440032, -38.11736633,  -1.26303803])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 690
      nit: 65
   status: 0
  success: True
        x: array([ 0.25164474,  0.01293344,  0.40050126, -0.00999169,  0.00301766])

In [23]:
log_lik_h02 = loglik_norm(sick, age, children, temp, 1,0,0,0,0.1)
log_lik_mle2 = loglik_norm(sick, age, children, temp, b02, b12, b22, b32, sigma2)
LR_val_2 = 2 * (log_lik_mle2 - log_lik_h02)
pval_h0 = 1.0 - sts.chi2.cdf(LR_val_2, 5)
print('chi squared of H0 with 5 degrees of freedom p-value = ', pval_h0, ', h0 is rejected')
print(LR_val_2)

chi squared of H0 with 5 degrees of freedom p-value =  0.0 , h0 is rejected
6261.13150217
