Problem 1

In [None]:
# Load libraries: numpy, pandas, and linear regression
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

np.random.seed(5)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/ECON 630/data_assignment2.csv')
df.head()

Unnamed: 0,response,treatment,age,party
0,0,0,28.0,d
1,1,0,54.0,g
2,1,0,44.0,a
3,0,0,77.0,a
4,0,0,44.0,a


1. (20 points) We suspect there is treatment heterogeneity in age, with older participants (those
50 or older) differing from younger participants (49 and below). Estimate the treatment effect
on the elderly, the treatment effect on the young, and the difference between the groups; and
furthermore construct standard errors for all three estimates using the bootstrap with at least
2,000 iterations. You may ignore the political party covariate for this problem.

In [None]:
df['age_split'] = df.age >= 50.0
df

Unnamed: 0,response,treatment,age,party,age_split
0,0,0,28.0,d,False
1,1,0,54.0,g,True
2,1,0,44.0,a,False
3,0,0,77.0,a,True
4,0,0,44.0,a,False
...,...,...,...,...,...
36496,0,0,62.0,g,True
36497,1,0,66.0,g,True
36498,0,1,54.0,b,True
36499,0,0,57.0,d,True


In [None]:
df.age_split = df['age_split'].map({True:"o",False:"y"})

In [None]:
print(df)

       response  treatment   age party age_split
0             0          0  28.0     d         y
1             1          0  54.0     g         o
2             1          0  44.0     a         y
3             0          0  77.0     a         o
4             0          0  44.0     a         y
...         ...        ...   ...   ...       ...
36496         0          0  62.0     g         o
36497         1          0  66.0     g         o
36498         0          1  54.0     b         o
36499         0          0  57.0     d         o
36500         1          0  30.0     h         y

[36501 rows x 5 columns]


In [None]:
# Define a regression function
def regression(f, df):
  result = sm.ols(formula = f, data = df).fit()
  return(result.params[1])

# Get point estimate
point_estimate = regression('response ~ age_split + treatment + treatment * age_split', df)
print("Estimated effect of treatment is: " + str(round(point_estimate, 3)))

# Get summary on number of treated / control units
print("There are " + str(sum(df['treatment'])) + " treated units and " + 
      str(len(df['treatment']) - sum(df['treatment'])) + " control units")

Estimated effect of treatment is: 0.021
There are 19453 treated units and 17048 control units


In [None]:
#HTE topic 2 code
n = len(df)
# Measure three components over each iteration: the older effect, 
# the younger effect, and the difference in effects
bootstrap_older = []
bootstrap_younger = []
bootstrap_difference = []

# Remember: the sampling design was supposed to be 50-50 treatment-control, but older
# of course due to sampling error, it was slightly off. So we need to reweight
# when we do the Bootstrap iterations
weight_control = sum(df['treatment'] == 1)/n
weight_treatment = 1 - weight_control
probability = ((df['treatment'] == 1) *  weight_treatment + 
               (df['treatment'] == 0) * weight_control)
probability = probability/sum(probability)

# Bootstrap through 2000 iterations
# Note that 2000 is normally low, but we do this in the interest of time
for i in range(2000):

  # Sample observations (with the reweighting)
  index = np.random.choice(range(n), n, replace = True, p = probability)

  # Run the regression model. for eacf sample w/ the iloc function
  result = sm.ols(formula="response ~ age_split + treatment + treatment * age_split", 
                  data = df.iloc[index,]).fit()
  
  # This code is messy, but it basically extracts the two key coefficients: 
  # treatment effect, and treatment effect interacted with being young
  output = pd.DataFrame({'cols': result.params.index, 'par': result.params})
  sample_effect = output.iloc[np.where(output['cols'] == 'treatment')[0][0],1]
  sample_effect_younger_addon = output.iloc[np.where(output['cols'] == 'treatment:age_split[T.y]')[0][0],1]

  # Save the relevant indicators
  bootstrap_older.append(sample_effect)
  bootstrap_younger.append(sample_effect + sample_effect_younger_addon)
  bootstrap_difference.append(sample_effect_younger_addon)

  if i % 100 == 0:
    print("On iteration: " + str(i))


# Report summary statistics
print("The older std error is: " + 
      str(round(np.std(bootstrap_older), 4)))
print("The younger std error is: " + 
      str(round(np.std(bootstrap_younger), 4)))
print("The difference std error is: " + 
      str(round(np.std(bootstrap_difference), 4)))

# Remind ourselves of the actual treatment effects estimated from the regression
result = sm.ols(formula="response ~ age_split + treatment + treatment * age_split", 
                data = df).fit()
output = pd.DataFrame({'cols': result.params.index, 'par': result.params})
effect = output.iloc[np.where(output['cols'] == 'treatment')[0][0],1]
effect_younger_addon = output.iloc[np.where(output['cols'] == 'treatment:age_split[T.y]')[0][0],1]
print("The younger treatment effect is: " + str(round(effect, 4)))
print("The older treatment_effect is: " + 
      str(round(effect + effect_younger_addon, 4)))
print("The difference in treatment effects is: " + 
      str(round(effect_younger_addon, 4)))

On iteration: 0
On iteration: 100
On iteration: 200
On iteration: 300
On iteration: 400
On iteration: 500
On iteration: 600
On iteration: 700
On iteration: 800
On iteration: 900
On iteration: 1000
On iteration: 1100
On iteration: 1200
On iteration: 1300
On iteration: 1400
On iteration: 1500
On iteration: 1600
On iteration: 1700
On iteration: 1800
On iteration: 1900
The older std error is: 0.0069
The younger std error is: 0.0054
The difference std error is: 0.0088
The younger treatment effect is: -0.3164
The older treatment_effect is: -0.3514
The difference in treatment effects is: -0.035


In [None]:
#differnce in treatment effect of old vs young

2. Justify Answer on Standard Error choices

3. (20 points) We similarly suspect there is treatment heterogeneity in party preference. Prove
or disprove this hypothesis; but be mindful of the higher dimensionality of the problem
and implement a solution accordingly. You may ignore the age covariate for this problem.
Note that the standard errors should again be constructed by bootstrap with at least 2,000
iterations; but if you are unable to do so, you may use the default standard errors as reported
by linear regression for a five-point penalty only.

In [None]:
#stat sig a, b, g, f, and i

In [None]:
# Get point estimate
point_estimate3 = regression('response ~ party + treatment + treatment * party', df)
print("Estimated effect of treatment is: " + str(round(point_estimate3, 3)))

# Get summary on number of treated / control units
print("There are " + str(sum(df['treatment'])) + " treated units and " + 
      str(len(df['treatment']) - sum(df['treatment'])) + " control units")

Estimated effect of treatment is: 0.067
There are 19453 treated units and 17048 control units


In [None]:
n= len(df)
# Note that this isn't the Bootstrap: we are just splitting the sample
index = np.random.choice(range(n), round(n/2), replace = False)

In [None]:
# Run the regression model
result = sm.ols(formula="response ~ treatment + party + treatment*party", 
                data = df.drop(index)).fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               response   R-squared:                       0.184
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     242.1
Date:                Fri, 04 Nov 2022   Prob (F-statistic):               0.00
Time:                        15:54:48   Log-Likelihood:                -8731.8
No. Observations:               18251   AIC:                         1.750e+04
Df Residuals:                   18233   BIC:                         1.764e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept                0.2926 

In [None]:
# Measure three components over each iteration: the Old effect, 
# the Young effect, and the difference in effects
bootstrap_a = []
bootstrap_b = []
bootstrap_d = []
bootstrap_f = []
bootstrap_g = []
bootstrap_i = []
bootstrap_diff = []

# Remember: the sampling design was supposed to be 50-50 treatment-control, but 
# of course due to sampling error, it was slightly off. So we need to reweight
# when we do the Bootstrap iterations
weight_control = sum(df['treatment'] == 1)/n
weight_treatment = 1 - weight_control
probability = ((df['treatment'] == 1) *  weight_treatment + 
               (df['treatment'] == 0) * weight_control)
probability = probability/sum(probability)

# Bootstrap through 2000 iterations
for i in range(2000):

  # Sample 10,000 observations (with the reweighting)
  index = np.random.choice(range(n), n, replace = True, p = probability)

  # Run the regression model
  result = sm.ols(formula="response ~ treatment + party + treatment*party", 
                  data = df.iloc[index,]).fit()
  
  # This code is messy, but it basically extracts the two key coefficients: 
  # treatment effect, and treatment effect interacted with being young
  output = pd.DataFrame({'cols': result.params.index, 'par': result.params})
  sample_effect = output.iloc[np.where(output['cols'] == 'treatment')[0][0],1]
  sample_effect_b_add = output.iloc[np.where(output['cols'] == 'treatment:party[T.b]')[0][0],1]
  sample_effect_d_add = output.iloc[np.where(output['cols'] == 'treatment:party[T.d]')[0][0],1]
  sample_effect_f_add = output.iloc[np.where(output['cols'] == 'treatment:party[T.f]')[0][0],1]
  sample_effect_g_add = output.iloc[np.where(output['cols'] == 'treatment:party[T.b]')[0][0],1]
  sample_effect_i_add = output.iloc[np.where(output['cols'] == 'treatment:party[T.b]')[0][0],1]

  # Save the relevant indicators
  bootstrap_a.append(sample_effect)
  bootstrap_b.append(sample_effect + sample_effect_b_add)
  bootstrap_d.append(sample_effect + sample_effect_b_add + sample_effect_d_add)
  bootstrap_f.append(sample_effect + sample_effect_b_add + sample_effect_d_add+ sample_effect_f_add)
  bootstrap_g.append(sample_effect + sample_effect_b_add + sample_effect_d_add+ sample_effect_f_add + sample_effect_g_add)
  bootstrap_i.append(sample_effect + sample_effect_b_add + sample_effect_d_add+ sample_effect_f_add + sample_effect_g_add + sample_effect_i_add)
  bootstrap_diff.append(sample_effect_b_add + sample_effect_d_add+ sample_effect_f_add + sample_effect_g_add + sample_effect_i_add)

  if i % 100 == 0:
    print("On iteration: " + str(i))

On iteration: 0
On iteration: 100
On iteration: 200
On iteration: 300
On iteration: 400
On iteration: 500
On iteration: 600
On iteration: 700
On iteration: 800
On iteration: 900
On iteration: 1000
On iteration: 1100
On iteration: 1200
On iteration: 1300
On iteration: 1400
On iteration: 1500
On iteration: 1600
On iteration: 1700
On iteration: 1800
On iteration: 1900


In [None]:
# Report summary statistics
print("The party a std error is: " + 
      str(round(np.std(bootstrap_a), 4)))
print("The party b std error is: " + 
      str(round(np.std(bootstrap_b), 4)))
print("The party d std error is: " + 
      str(round(np.std(bootstrap_d), 4)))
print("The party f std error is: " + 
      str(round(np.std(bootstrap_f), 4)))
print("The party g std error is: " + 
      str(round(np.std(bootstrap_g), 4)))
print("The party i std error is: " + 
      str(round(np.std(bootstrap_i), 4)))

print("The difference std error is: " + 
      str(round(np.std(bootstrap_diff), 4)))

The party a std error is: 0.0097
The party b std error is: 0.009
The party d std error is: 0.017
The party f std error is: 0.0261
The party g std error is: 0.0373
The party i std error is: 0.0496
The difference std error is: 0.0575


Write a function that can compute the standard errors for the
mean of a data series via 1000 bootstrapped iterations.

Assume that the data is provided in batches, and each batch
must be deleted before the next batch is provided.

Test the function on a dataset that comes in five separate
batches, where each batch has 100,000 observations randomly
and uniformly distributed from 30 to 50; and report the
standard error.

The function to generate random Poisson values is
numpy.random.poisson(λ, n); and the function to generate
Uniform data for testing is numpy.random.uniform(30, 50, n).

In [None]:
def simulator(n_sims, n_batch, low, high, batch_size):
    #list stores the streaming sum and counts:
    summation = np.zeros(n_sims)
    count = np.zeros(n_sims)
    #repeat the main process of data-gen and bootstrapping
    for i in range(n_batch):
        data = np.random.uniform(low, high, batch_size)
    #repeat the bootstrapping process
    for i in range(n_sims):
        #generate random poisson dist for each iteration
        poisson = np.random.poisson(lam=1,size=len(data))
        #poisson*data
        m = np.multiply(poisson,data)
        #store the sum and counts
        summation[i] = summation[i] + np.sum(m)
        count[i] = count[i] + np.sum(poisson)
    return(summation, count)
summation, count

(array([0.        , 1.4924527 , 2.98514209, 5.97028418]),
 array([0., 1., 1., 2.]))

In [None]:
#number of sims
n_sims = int(1e3)
#details of the data
n_batch = 5
batch_size = int(1e5)
high = 50
low = 30
#run the simulation:
s,c = simulator(n_sims=n_sims, n_batch=n_batch, batch_size=batch_size,
                 low=low, high=high)

In [None]:
std_dev = np.std(s/c)
round(std_dev, 4)

0.0182