In [3]:
import numpy as np
import pandas as pd

## Question 1 

### a)

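$p$ is just a (non-random) constant, and subtracting a constant shifts a random variable without changing its spread: $\mathrm{Var}(\hat{p} - p) = \mathrm{Var}(\hat{p})$. Therefore the variance of $\hat{p} - p$ is the same as the variance of $\hat{p}$.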

In [53]:
# Original sample: 100 binary observations, the first 24 equal to 1 and the remaining 76 equal to 0
X = np.concatenate((np.ones(24), np.zeros(76)))

# Bootstrap resampling helper
def Bootstrap(X, nRepeats):
    """Draw `nRepeats` bootstrap resamples of the array `X`.

    Every resample is built by drawing `X.shape` integer indices uniformly
    from ``range(X.size)`` (with replacement) via ``np.random.randint`` and
    indexing `X` with them.

    Returns a list of `nRepeats` arrays, in the order they were drawn.
    """
    population = X.size
    return [
        X[np.random.randint(population, size=X.shape)]
        for _ in range(nRepeats)
    ]

# Number of bootstrap replicates, and the replicates themselves
N = 10000
Samples = Bootstrap(X, N)

# p_hat (the mean) of every bootstrap replicate
Means = np.array([replicate.mean() for replicate in Samples])

# The spread of the replicate means (ndarray.var, i.e. ddof=0) estimates Var(p_hat);
# the reference value printed next to it is 0.24*0.76/100
print('The estimated variance is')
print(Means.var())
print('The true variance is')
print(0.24*0.76/100)

The estimated variance is
0.0018268773589999998
The true variance is
0.001824


### b)

In [54]:
# Order the bootstrap means so that empirical quantiles can be read off by index
SortedMeans = list(Means)
SortedMeans.sort()
# 2.5% and 97.5% empirical quantiles of the bootstrap distribution
low, high = SortedMeans[int(N*0.025)], SortedMeans[int(N*0.975)]
# Reflect the quantiles about the point estimate 0.24: [2*p_hat - q_high, 2*p_hat - q_low]
lowerBound, higherBound = 2*0.24 - high, 2*0.24 - low
print('The 95% CI is:')
print('[%f,%f]' % (lowerBound, higherBound))

The 95% CI is:
[0.160000,0.320000]


## Question 2

In [34]:
# Given Data, return the MLE as a vector
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Parameters
    ----------
    x : scalar or array-like of real numbers.

    Returns
    -------
    A float scalar for scalar input, otherwise an ndarray with the shape of `x`.

    The naive formula computes exp(-x), which overflows (RuntimeWarning) for
    large negative x. Here only exp(-|x|) is evaluated, which always lies in
    (0, 1], and the algebraically equivalent form is chosen per sign of x.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x)  -- same value, no overflow
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] turns a 0-d array back into a scalar and leaves n-d arrays untouched
    return result[()]
    
def MLE_logit(X, y, learningRate, tol=10**-12, maxIter=None):
    """Fit a logistic regression (with intercept) by gradient ascent on the log-likelihood.

    Starting from beta = 0, repeatedly applies
    ``beta <- beta + learningRate * X1.T @ (y - sigmoid(X1 @ beta))``,
    where ``X1`` is `X` with a leading column of ones, until the squared
    Euclidean length of a single update drops below `tol`.

    Parameters
    ----------
    X : array-like, shape (m, k) or (m,)
        Predictors without the intercept column. The caller's array is not modified.
    y : array-like, shape (m,) or (m, 1)
        Responses, one per row of `X`.
    learningRate : float
        Step size of each gradient update.
    tol : float, optional
        Convergence threshold on the squared update length (default 10**-12).
    maxIter : int or None, optional
        Maximum number of updates; ``None`` (default) iterates until convergence.

    Returns
    -------
    beta : ndarray, shape (k + 1, 1)
        Estimated coefficients, intercept first.
    y_hat : ndarray, shape (m, 1)
        Fitted probabilities evaluated at the returned `beta`.

    Raises
    ------
    ValueError
        If `y` does not have one entry per row of `X`, or if an update produces
        non-finite coefficients (the convergence test could then never succeed).
    RuntimeError
        If `maxIter` updates were performed without reaching `tol`.
    """
    design = np.asarray(X, dtype=float)
    if design.ndim == 1:
        design = design.reshape(-1, 1)
    m = design.shape[0]
    # hstack allocates a new array, so no defensive copy of X is needed
    design = np.hstack([np.ones((m, 1)), design])
    p = design.shape[1]

    # Force a column vector: a 1-D y would broadcast (m,) - (m, 1) into an (m, m) matrix
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    if y.shape[0] != m:
        raise ValueError(
            'y has %d entries but X has %d rows' % (y.shape[0], m)
        )

    # Initialization of parameters
    beta = np.zeros((p, 1))

    nIter = 0
    while maxIter is None or nIter < maxIter:
        y_hat = sigmoid(design @ beta)
        gradient = design.T @ (y - y_hat)
        beta_next = beta + learningRate * gradient
        # NaN/inf compares False against tol forever; fail loudly instead of spinning
        if not np.all(np.isfinite(beta_next)):
            raise ValueError(
                'gradient ascent produced non-finite coefficients; '
                'check the inputs or use a smaller learningRate'
            )
        step = beta_next - beta
        if (step.T @ step).item() < tol:
            return beta_next, sigmoid(design @ beta_next)
        beta = beta_next
        nIter += 1

    raise RuntimeError(
        'MLE_logit did not converge within %d iterations' % maxIter
    )

In [35]:
# Load the whitespace-separated data file; columns Y, X1 and X2 are used below.
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
Data = pd.read_csv('HW_chapter6_2.txt', sep=r'\s+')
# Response as an (m, 1) column vector, predictors as an (m, 2) matrix
y = Data['Y'].to_numpy().reshape((-1,1))
X = Data[['X1','X2']].to_numpy()

# Calculate the MLE and the fitted probabilities on the original sample
beta_MLE, prob = MLE_logit(X, y , 0.01)
print('The MLE of beta is')
print(beta_MLE)

The MLE of beta is
[[1.62014168]
 [0.05334902]
 [2.00558986]]


In [48]:
# Simulate one bootstrap response vector y from the fitted probabilities
def Bootstrap_logit(prob, rng=None):
    """Draw one 0/1 response array whose entries are 1 with probability `prob`.

    Parameters
    ----------
    prob : array-like of floats
        Success probability for each entry; the output has the same shape.
    rng : optional
        Object with a ``random(size)`` method (e.g. ``np.random.default_rng(seed)``)
        used to draw the uniforms. When ``None`` (default) the global
        ``np.random.random`` is used, as before.

    Returns
    -------
    ndarray of float
        1.0 where a uniform draw on [0, 1) is strictly below `prob`, else 0.0.
    """
    prob = np.asarray(prob, dtype=float)
    if rng is None:
        u = np.random.random(prob.shape)
    else:
        u = rng.random(prob.shape)
    return (u < prob).astype(float)
# Simulate 1000 response vectors from the fitted probabilities, refit each one,
# and collect every coefficient of the refit in its own list
beta0, beta1, beta2 = [], [], []
for i in range(1000):
    y_tilda = Bootstrap_logit(prob)
    beta_new = MLE_logit(X, y_tilda, 0.01)[0]
    for collected, coefficient in zip((beta0, beta1, beta2), beta_new[:, 0]):
        collected.append(coefficient)

In [61]:
# Print a 95% interval computed from a list of bootstrap estimates
def CI(List, yhat):
    """Print the 95% interval obtained by reflecting empirical quantiles about `yhat`.

    `List` is sorted, its entries at positions ``int(N*0.025)`` and
    ``int(N*0.975)`` (N = number of entries) are taken as the lower and upper
    quantiles, and the interval ``[2*yhat - upper, 2*yhat - lower]`` is printed.
    Nothing is returned.
    """
    ordered = list(List)
    ordered.sort()
    count = len(ordered)
    # Positions of the empirical 2.5% and 97.5% quantiles (truncated towards zero)
    lowIdx, highIdx = int(count*0.025), int(count*0.975)
    bounds = (2*yhat - ordered[highIdx], 2*yhat - ordered[lowIdx])
    print('The 95% CI is:')
    print('[%f,%f]' % bounds)
# Report the interval of each coefficient, reflected about its MLE on the original sample
for label, draws, estimate in (
    ('beta0', beta0, beta_MLE[0,0]),
    ('beta1', beta1, beta_MLE[1,0]),
    ('beta2', beta2, beta_MLE[2,0]),
):
    print(label)
    CI(draws, estimate)

beta0
The 95% CI is:
[0.567752,2.230890]
beta1
The 95% CI is:
[-0.524007,0.624939]
beta2
The 95% CI is:
[0.781464,2.638388]
