In [2]:
import numpy as np
import pandas as pd

## Question 7, Chapter 1

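Likelihood function, where $x_i = (1, X_{1,i}, X_{2,i})^T$ is the covariate vector (including the intercept) of observation $i$: <br>
$l(\beta;X_{1,1:n},X_{2,1:n},Y_{1:n}) = \prod_{i=1}^n\left[\frac{e^{x_i^T\beta}}{1+e^{x_i^T\beta}}\right]^{Y_i}\left[\frac{1}{1+e^{x_i^T\beta}}\right]^{1-Y_i}$ <br><br>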

Log-likelihood function: <br>
$L(\beta) = \sum_{i=1}^n \left[Y_i x_i^T\beta - \log\left(1+e^{x_i^T\beta}\right)\right]$ <br><br>

Gradient of the log-likelihood function: <br>
$\nabla L(\beta) = \sum_{i=1}^n\left(Y_i-\frac{1}{1+e^{-x_i^T\beta}}\right)x_i$ <br><br>

Hessian of the log-likelihood function: <br>
$\nabla^2 L(\beta) = -\sum_{i=1}^n\frac{e^{-x_i^T\beta}}{\left[1+e^{-x_i^T\beta}\right]^2}x_i x_i^T$

In [3]:
# Load the data (whitespace-delimited text file; columns X1, X2 and Y are used below).
# `sep=r"\s+"` is the documented replacement for the deprecated `delim_whitespace=True`.
data = pd.read_csv("HW_chapter1_7.txt", sep=r"\s+")

# Create X matrix and Y vector
# `assign` returns a new frame, so the intercept column is not written into a
# slice of `data` (which would raise SettingWithCopyWarning).
X = data[['X1', 'X2']].assign(X0=1)[['X0', 'X1', 'X2']].to_numpy()
Y = data[['Y']].to_numpy()  # Note here we must make Y a 2-dimensional column vector for further calculation

# Gradient and Hessian using matrix multiplication
def Gradient(beta):
    """Gradient of the logistic-regression log-likelihood at ``beta``.

    Computes ``X.T @ (Y - sigmoid(X @ beta))`` using the module-level design
    matrix ``X`` and response column vector ``Y``.

    Parameters
    ----------
    beta : array_like
        Coefficients with ``X.shape[1]`` entries, in any shape; they are
        reshaped to a column vector.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(X.shape[1], 1)``.
    """
    eta = X @ np.asarray(beta, dtype=float).reshape((X.shape[1], 1))
    # sigmoid(eta) = exp(-log(1 + exp(-eta))). logaddexp evaluates the log term
    # without overflowing, unlike 1 / (1 + exp(-eta)) for very negative eta.
    prob = np.exp(-np.logaddexp(0.0, -eta))
    return X.T @ (Y - prob)
def Hessian(beta):
    """Hessian of the logistic-regression log-likelihood at ``beta``.

    Computes ``-X.T @ W @ X`` where ``W`` is diagonal with entries
    ``p_i * (1 - p_i)`` and ``p_i = sigmoid(x_i @ beta)``, using the
    module-level design matrix ``X``.

    Parameters
    ----------
    beta : array_like
        Coefficients with ``X.shape[1]`` entries, in any shape; they are
        reshaped to a column vector.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(X.shape[1], X.shape[1])``.
    """
    eta = X @ np.asarray(beta, dtype=float).reshape((X.shape[1], 1))
    # p(1-p) = e^{-eta} / (1 + e^{-eta})^2 is symmetric in eta, so evaluating it
    # at -|eta| keeps the exponential <= 1 and avoids the inf/inf = NaN that
    # exp(-eta) produces for very negative eta.
    decay = np.exp(-np.abs(eta))
    weights = decay / (1.0 + decay) ** 2
    # Scaling the rows of X by the weights equals diag(weights) @ X without
    # materialising an n-by-n diagonal matrix.
    return -(X.T @ (weights * X))

In [4]:
# Newton's method

def Newton(Grad, Hess, x0, tol, max_iter=1000):
    """Find a stationary point with Newton's method.

    Iterates ``x <- x - Hess(x)^{-1} Grad(x)`` until the Euclidean norm of the
    step is at most ``tol``.

    Parameters
    ----------
    Grad : callable
        Called with a ``(p, 1)`` column vector; expected to return the
        gradient as a ``(p, 1)`` column vector.
    Hess : callable
        Called with a ``(p, 1)`` column vector; expected to return the
        ``(p, p)`` Hessian.
    x0 : array_like
        Starting point with ``p`` entries in any shape; it is reshaped to a
        column vector.
    tol : float
        Convergence threshold on the norm of the Newton step.
    max_iter : int, optional
        Maximum number of Newton steps before giving up (default 1000).

    Returns
    -------
    numpy.ndarray
        The final iterate, shape ``(p, 1)``.

    Raises
    ------
    RuntimeError
        If the step norm is still above ``tol`` after ``max_iter`` steps.
    numpy.linalg.LinAlgError
        If the Hessian is singular at some iterate.
    """
    # -1 infers the dimension from x0 instead of hard-coding 3 parameters.
    x0 = np.asarray(x0, dtype=float).reshape((-1, 1))
    for _ in range(max_iter):
        # Solve Hess(x0) @ step = Grad(x0) rather than forming the inverse.
        step = np.linalg.solve(Hess(x0), Grad(x0))
        x1 = x0 - step
        if np.linalg.norm(x1 - x0) <= tol:
            return x1
        x0 = x1
    raise RuntimeError(
        "Newton's method did not converge within %d iterations" % max_iter
    )

# Run Newton's method from the zero vector with a step tolerance of 10**-10.
initial_beta = np.zeros((1, 3))
beta_hat = Newton(Gradient, Hessian, initial_beta, 10**-10)
print('The MLE of beta:', beta_hat, sep='\n')

The MLE of beta:
[[0.58579244]
 [1.38140903]
 [1.27957259]]


In [5]:
# Check: evaluate the Hessian at the estimate and inspect its eigenvalues.
# All eigenvalues negative means the Hessian is negative definite there,
# i.e. the estimate is a maximum of the log-likelihood.
hessian_at_mle = Hessian(beta_hat)
np.linalg.eigvals(hessian_at_mle)

array([ -5.61071468, -16.75400798, -13.46365762])