Examples of how to load and process data, run linear and logistic regressions, show results, and tune the models

In [None]:
# Load Packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import copy
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression

# Download the spam data and the companion train/test indicator file. Both
# are parsed as whitespace-separated tables without a header row.
# `sep=r'\s+'` is the documented equivalent of the deprecated
# `delim_whitespace=True` flag.
spam = pd.read_table('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/spam.data',
                     sep=r'\s+', header=None)
test_indicator = pd.read_table('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/spam.traintest',
                               sep=r'\s+', header=None)

In [None]:
# Every column but the last is a feature; the last column is the label,
# rescaled through label*2 - 1 (which sends 0/1 labels to -1/+1).
x = np.asarray(spam)[:, :-1]
y = 2 * np.asarray(spam)[:, -1] - 1
# Flatten the single-column indicator table into a 1-D vector.
test_indicator = np.array(test_indicator)[:, 0]

# Rows flagged 0 go to the training set, rows flagged 1 to the test set.
train_rows = test_indicator == 0
test_rows = test_indicator == 1
x_train, y_train = x[train_rows], y[train_rows]
x_test, y_test = x[test_rows], y[test_rows]

# Fit the scaler on the training rows only, then apply that same
# transformation to both splits.
scaler = sklearn.preprocessing.StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# Sample counts per split and the number of features per sample.
n_train, n_test = y_train.shape[0], y_test.shape[0]
d = x.shape[1]

In [None]:
# gradient of linear objective function
def computegrad(beta, lambduh, x = x_train, y = y_train):
    """Gradient of the l2-regularized least-squares objective at ``beta``.

    Evaluates -2/n * x^T (y - x beta) + 2 * lambduh * beta, with n = len(y).

    Input:
        - beta: coefficient vector at which the gradient is evaluated
        - lambduh: regularization strength
        - x, y: features and responses; the defaults are the training split
          as it existed when this function was defined
    """
    residual = y - np.dot(x, beta)
    data_term = -2/len(y)*x.T.dot(residual)
    penalty_term = 2*lambduh*beta
    return data_term + penalty_term

# constant step-size gradient descent
def graddescent(beta_init, eta, lambduh, max_iter=1000):
    """Run gradient descent with a fixed step size.

    Input:
        - beta_init: starting coefficient vector
        - eta: constant step size
        - lambduh: regularization strength passed on to computegrad
        - max_iter: number of steps to take
    Output:
        - array holding the starting point followed by every iterate,
          one per row
    """
    current = beta_init
    history = [current]
    steps_taken = 0
    gradient = computegrad(current, lambduh)
    while steps_taken < max_iter:
        current = current - eta*gradient
        history.append(current)
        # Refresh the gradient at the point we just moved to.
        gradient = computegrad(current, lambduh)
        steps_taken += 1
    return np.array(history)

# linear objective
def obj(beta, lambduh, x = x_train, y = y_train):
    """l2-regularized least-squares objective evaluated at ``beta``.

    Computes 1/n * sum((y - x beta)^2) + lambduh * ||beta||^2, with n = len(y).

    Input:
        - beta: coefficient vector
        - lambduh: regularization strength
        - x, y: features and responses; the defaults are the training split
          as it existed when this function was defined
    """
    residual = y - x.dot(beta)
    mean_squared_error = 1/len(y)*sum(residual**2)
    penalty = lambduh*np.linalg.norm(beta)**2
    return mean_squared_error + penalty

In [None]:
def convergence_plots(x_vals, lambduh):
    """
    Show, side by side, the objective value and the gradient norm at every
    iterate of a gradient descent run.
    Input:
        - x_vals: 2-D array of iterates, one row per step
        - lambduh: regularization strength forwarded to obj and computegrad
    """
    n_iterates, n_coefs = x_vals.shape
    objective_values = np.zeros(n_iterates)
    gradients = np.zeros((n_iterates, n_coefs))
    for row, iterate in enumerate(x_vals):
        objective_values[row] = obj(iterate, lambduh)
        gradients[row, :] = computegrad(iterate, lambduh)
    gradient_norms = np.linalg.norm(gradients, axis=1)

    # Left panel: objective values; right panel: gradient norms.
    panels = (
        (121, objective_values, 'Objective value'),
        (122, gradient_norms, 'Norm of gradient'),
    )
    for position, series, y_label in panels:
        plt.subplot(position)
        plt.plot(series)
        plt.xlabel('Iteration')
        plt.ylabel(y_label)

    plt.suptitle('Function Value and Norm of Gradient Convergence', fontsize=16)
    plt.subplots_adjust(left=0.2, wspace=0.8, top=0.8)
    plt.show()

In [None]:
# Settings for the constant step-size run.
eta = 0.05
max_iter = 1000
lambduh = 0.05
d = x_train.shape[1]
# Start from an (unseeded) random point drawn from a standard normal.
beta_init = np.random.normal(size=d)
# Pass the max_iter variable (rather than repeating the literal) so the
# setting above actually controls the run.
betas = graddescent(beta_init, eta, lambduh, max_iter=max_iter)
convergence_plots(betas, lambduh)

In [None]:
# logistic
def computegrad(beta, lambduh, x, y):
    """Gradient of the l2-regularized logistic loss at ``beta``.

    The objective being differentiated is
        1/n * sum_i log(1 + exp(-y_i * x_i.dot(beta))) + lambduh * ||beta||^2
    whose gradient is
        -1/n * sum_i y_i * x_i / (1 + exp(y_i * x_i.dot(beta))) + 2 * lambduh * beta.

    Input:
        - beta: coefficient vector, length d
        - lambduh: regularization strength
        - x: feature matrix, one row per sample
        - y: 1-D label array, one entry per row of x (the rest of this file
          uses -1/+1 labels)
    Output:
        - gradient vector, length d
    """
    margins = y * x.dot(beta)
    # 1 / (1 + exp(m)) written as exp(-log(1 + exp(m))). logaddexp never
    # overflows, so badly misclassified samples (very negative margins) no
    # longer produce inf/inf = NaN as the naive exp(-m) / (1 + exp(-m)) did.
    weights = np.exp(-np.logaddexp(0, margins))
    return -x.T.dot(y * weights) / len(y) + 2 * lambduh * beta

# logistic
def objective(beta, lambduh, x, y):
    """l2-regularized logistic loss evaluated at ``beta``.

    Computes 1/n * sum_i log(1 + exp(-y_i * x_i.dot(beta))) + lambduh * ||beta||^2.

    Input:
        - beta: coefficient vector
        - lambduh: regularization strength
        - x: feature matrix, one row per sample
        - y: label array, one entry per row of x (the rest of this file uses
          -1/+1 labels)
    Output:
        - scalar objective value
    """
    margins = y * x.dot(beta)
    # log(1 + exp(-m)) via logaddexp stays finite for very negative margins,
    # where the naive exp(-m) overflows to inf.
    losses = np.logaddexp(0, -margins)
    return 1/len(y) * np.sum(losses) + lambduh * np.linalg.norm(beta)**2

def backtracking(beta, lambduh, x, y, eta=1, alpha=0.5, betaparam=0.8, maxiter=100):
    """Choose a step size by backtracking line search.

    Starting from ``eta``, the step is multiplied by ``betaparam`` until the
    sufficient-decrease condition
        objective(beta - eta*grad) < objective(beta) - alpha*eta*||grad||^2
    holds, or until ``maxiter`` shrinks have been made.

    Input:
        - beta: point at which the search starts
        - lambduh: regularization strength
        - x, y: data passed through to objective and computegrad
        - eta: initial step size to try
        - alpha: constant in the sufficient-decrease condition
        - betaparam: factor by which the step size is shrunk
        - maxiter: maximum number of shrinks
    Output:
        - the accepted step size; if none was accepted, the step size left
          after ``maxiter`` shrinks (a RuntimeWarning is emitted in that case)
    """
    import warnings

    grad_beta = computegrad(beta, lambduh, x=x, y=y)
    sq_norm_grad = np.linalg.norm(grad_beta) ** 2
    # The objective at the starting point does not depend on eta, so it is
    # evaluated once rather than on every trial.
    start_value = objective(beta, lambduh, x=x, y=y)

    n_shrinks = 0
    while n_shrinks < maxiter:
        trial_value = objective(beta - eta * grad_beta, lambduh, x=x, y=y)
        if trial_value < start_value - alpha * eta * sq_norm_grad:
            return eta
        eta *= betaparam
        n_shrinks += 1

    # The original exhaustion branch could never run (and tried to raise a
    # plain string). Warn instead, and keep returning the shrunken step size
    # so existing callers continue to work.
    warnings.warn('Max number of iterations of backtracking line search reached',
                  RuntimeWarning, stacklevel=2)
    return eta

# graddescent with backtracking
def graddescent(beta_init, lambduh, eta_init, x, y, eps=1e-4, max_iter=None):
    """Gradient descent with a backtracking line search at every step.

    Iterates until the norm of the gradient is at most ``eps`` (or until
    ``max_iter`` steps have been taken, when a limit is given).

    Input:
        - beta_init: starting coefficient vector
        - lambduh: regularization strength
        - eta_init: initial step size handed to backtracking at each step
        - x, y: data passed through to computegrad and backtracking
        - eps: stopping threshold on the gradient norm
        - max_iter: optional cap on the number of steps; None means no cap
    Output:
        - 2-D array whose first row is beta_init and whose following rows
          are the successive iterates (2-D even when no step was needed)
    """
    beta = beta_init
    grad_beta = computegrad(beta, lambduh, x=x, y=y)
    # Collect iterates in a list and stack once at the end; stacking inside
    # the loop copies the whole history on every step.
    history = [beta]
    n_steps = 0
    while np.linalg.norm(grad_beta) > eps:
        if max_iter is not None and n_steps >= max_iter:
            break
        eta = backtracking(beta, lambduh, eta=eta_init, x=x, y=y)
        beta = beta - eta*grad_beta
        history.append(beta)
        grad_beta = computegrad(beta, lambduh, x=x, y=y)
        n_steps += 1
    return np.vstack(history)

# accelerated gradient descent with backtracking
def fastgradalgo(beta_init, theta_init, lambduh, eta_init, x, y, eps=1e-4, max_iter=None):
    """Accelerated (momentum) gradient descent with backtracking line search.

    Each step moves from the momentum point ``theta`` along its negative
    gradient to get the new ``beta``, then extrapolates ``theta`` past the
    new ``beta`` with weight t/(t+3), where t is the step counter. Iterates
    until the gradient norm at ``beta`` is at most ``eps`` (or until
    ``max_iter`` steps have been taken, when a limit is given).

    Input:
        - beta_init: starting coefficient vector
        - theta_init: starting momentum point
        - lambduh: regularization strength
        - eta_init: initial step size handed to backtracking at each step
        - x, y: data passed through to computegrad and backtracking
        - eps: stopping threshold on the gradient norm at beta
        - max_iter: optional cap on the number of steps; None means no cap
    Output:
        - 2-D array whose first row is beta_init and whose following rows
          are the successive beta iterates
    """
    beta = beta_init
    theta = theta_init
    grad_theta = computegrad(theta, lambduh, x=x, y=y)
    grad_beta = computegrad(beta, lambduh, x=x, y=y)
    history = [beta]
    t = 0
    while np.linalg.norm(grad_beta) > eps:
        if max_iter is not None and t >= max_iter:
            break
        eta = backtracking(theta, lambduh, eta=eta_init, x=x, y=y)
        beta_new = theta - eta*grad_theta
        theta = beta_new + t/(t+3)*(beta_new - beta)
        # Move to the new iterate before recording it and testing for
        # convergence. Previously the stale beta was stored (duplicating the
        # first row and never recording the final iterate) and the stopping
        # test looked at the gradient of the previous iterate.
        beta = beta_new
        history.append(beta)
        grad_theta = computegrad(theta, lambduh, x=x, y=y)
        grad_beta = computegrad(beta, lambduh, x=x, y=y)
        t += 1
    return np.vstack(history)

def compute_misclassification_error(beta_opt, x, y):
    """Fraction of samples whose predicted label differs from ``y``.

    A sample is predicted +1 when its linear score x.dot(beta_opt) is
    strictly positive and -1 otherwise (a score of exactly 0 gives -1).

    Input:
        - beta_opt: coefficient vector used for prediction
        - x: feature matrix, one row per sample
        - y: true labels, compared directly against the -1/+1 predictions
    Output:
        - mean of (prediction != y)
    """
    scores = x.dot(beta_opt)
    # sigmoid(score) > 0.5 is the same test as score > 0; comparing the score
    # directly avoids exp() overflowing on large negative scores.
    y_pred = np.where(scores > 0, 1, -1)
    return np.mean(y_pred != y)

def plot_misclassification_error(betas_grad, betas_fastgrad, x, y, save_file='', title=''):
    """Plot misclassification error per iteration for two optimizer runs.

    Input:
        - betas_grad: 2-D array of gradient descent iterates, one per row
        - betas_fastgrad: 2-D array of fast gradient iterates, one per row
        - x, y: data on which the error of every iterate is measured
        - save_file: if non-empty, the figure is saved to this path instead
          of being shown
        - title: optional plot title
    """
    n_grad = np.size(betas_grad, 0)
    n_fast = np.size(betas_fastgrad, 0)

    # Error of every stored iterate, in iteration order.
    errors_grad = np.zeros(n_grad)
    errors_fast = np.zeros(n_fast)
    for k in range(n_grad):
        errors_grad[k] = compute_misclassification_error(betas_grad[k, :], x, y)
    for k in range(n_fast):
        errors_fast[k] = compute_misclassification_error(betas_fastgrad[k, :], x, y)

    fig, ax = plt.subplots()
    # Iterations are numbered from 1 on the horizontal axis.
    ax.plot(range(1, n_grad + 1), errors_grad, label='gradient descent')
    ax.plot(range(1, n_fast + 1), errors_fast, c='red', label='fast gradient')
    plt.xlabel('Iteration')
    plt.ylabel('Misclassification error')
    if title:
        plt.title(title)
    ax.legend(loc='upper right')

    if save_file:
        plt.savefig(save_file)
    else:
        plt.show()