# Accelerated cross-validation for binomial logistic regression
Demonstration of approximate cross-validation for binomial logistic regression with $\ell_1$ regularization.

## Sample execution environment
* OS: macOS 10.12.6
* Processor: 2.5 GHz Intel Core i7
* RAM: 16 GB 2133 MHz LPDDR3

## Import libraries

In [1]:
%matplotlib notebook
import warnings

import numpy as np
import matplotlib.pyplot as plt

# from IPython.html.widgets import FloatProgress
from ipywidgets import FloatProgress
from IPython.display import display

# accelerated cross validation module
import accelerated_cv_on_mlr as acv

# glmnet for logistic regression and conventional cross-validation
import glmnet_python
from glmnet import glmnet
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef

# set seed
np.random.seed(0)

## Making dummy data 
* Dummy training data and the list of $\lambda$ values are generated.

In [2]:
%%time
# Problem size. N is the feature dimension; the sample count M and the number
# of nonzero true weights K are both derived from it.
alpha = 2  # ratio M / N of sample count to feature dimension
N = 400  # dimension of each feature vector
rho0 = 0.5  # fraction of nonzero entries in the true weight vector
M = np.int64(np.ceil(alpha * N))  # number of samples
K = np.int64(np.ceil(rho0 * N))  # number of nonzero true weights
sigmaW2 = 1.0 / rho0  # variance of each nonzero weight, so K * sigmaW2 is about N

# True weights: the first K entries are Gaussian, the remaining N - K stay zero.
# NOTE: w0 is drawn before X; keep this order so the seeded data is unchanged.
w0 = np.zeros((N, 1))
w0[:K] = np.random.normal(size=(K, 1)) * np.sqrt(sigmaW2)

# Gaussian design matrix, passed through the package's standardisation helper
# (presumably column-wise standardisation -- confirm in acv.utils).
X = np.random.normal(size=(M, N))
X_std = acv.utils.standardize_matrix.standardize_matrix(X)
u0 = np.dot(X_std, w0)  # linear predictor, shape (M, 1)

# Labels: threshold the logistic probability at 0.5 (no label noise is added).
prob_positive = 1.0 / (1.0 + np.exp(-1.0 * u0))
Y = prob_positive > 0.5

# Two-column indicator coding of the labels: column 0 marks class 0,
# column 1 marks class 1.
Ycode = np.zeros((M, 2))
labels_flat = Y.reshape(M,)
Ycode[:, 0] = labels_flat == 0
Ycode[:, 1] = labels_flat == 1

# Regularisation path: 51 log-spaced values from 10**0.6 down to 10**-2.4.
r_exp = 0.06
lambdaV = 10.0 ** (-r_exp * np.arange(-10, 41))

CPU times: user 74.2 ms, sys: 7.39 ms, total: 81.6 ms
Wall time: 81.2 ms


## Fitting model via glmnet

In [3]:
%%time
# Reference fit on the full data set along the whole lambda path.
# Copies are passed so the notebook's X_std and Y are not handed to glmnet
# directly. The data were already standardised above, hence standardize=False;
# intr=False requests a model without an intercept term.
fit = glmnet(
    x=X_std.copy().astype(np.float64),
    y=Y.copy().astype(np.float64),
    family='binomial',
    lambdau=lambdaV,
    standardize=False,
    intr=False,
    thresh=1e-8,
    maxit=1e6
)

CPU times: user 357 ms, sys: 6.56 ms, total: 364 ms
Wall time: 184 ms


## $k$-fold CV via glmnet

### $10$-fold

In [4]:
%%time
# 10-fold cross-validation with glmnet on the same lambda values as the
# reference fit (first 51 entries of fit["lambdau"]).
# Warnings are silenced only for the duration of the call; catch_warnings()
# restores the previous filter state afterwards, even if cvglmnet raises,
# instead of overwriting it with a blanket 'default' filter.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    cvfit_10fold = cvglmnet(x=X_std.astype(np.float64).copy(),
                            y=Y.astype(np.float64).copy(),
                            family='binomial',
                            mtype="ungrouped",
                            lambdau=fit["lambdau"][:51],
                            intr=False,
                            standardize=False,
                            thresh=1e-8,
                            maxit=1e6,
                            parallel=True,
                            nfolds=10)

[status]	Parallel glmnet cv with 4 cores


CPU times: user 260 ms, sys: 29.7 ms, total: 290 ms
Wall time: 950 ms


### $100$-fold
* This may take a long time.

In [5]:
%%time
## WARNING: THIS MAY TAKE A LONG TIME
# 100-fold cross-validation with glmnet.
# cvfit_100fold is pre-set to None so the plotting cell can skip this curve
# when the call below is commented out.
cvfit_100fold = None
# family must be 'binomial' here: the reference fit and the 10-fold run are
# binomial, and the ACV/SAACV estimates are computed from the binomial
# coefficients, so a 'multinomial' model would not be comparable.
# Warnings are silenced only inside the with-block; the previous filter state
# is restored afterwards.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    cvfit_100fold = cvglmnet(x=X_std.astype(np.float64).copy(),
                             y=Y.astype(np.float64).copy(),
                             family='binomial',
                             mtype="ungrouped",
                             lambdau=fit["lambdau"][:51],
                             intr=False,
                             standardize=False,
                             thresh=1e-8,
                             maxit=1e7,
                             parallel=True,
                             nfolds=100)

[status]	Parallel glmnet cv with 4 cores


CPU times: user 962 ms, sys: 92.7 ms, total: 1.05 s
Wall time: 11.4 s


### $1000$-fold
* This may take an extremely long time.

In [6]:
%%time
## WARNING: THIS MAY TAKE AN EXTREMELY LONG TIME
# 1000-fold cross-validation with glmnet.
# cvfit_1000fold is pre-set to None so the plotting cell can skip this curve
# when the call below is commented out.
cvfit_1000fold = None
# family must be 'binomial' here: the reference fit and the 10-fold run are
# binomial, and the ACV/SAACV estimates are computed from the binomial
# coefficients, so a 'multinomial' model would not be comparable.
# Warnings are silenced only inside the with-block; the previous filter state
# is restored afterwards.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    cvfit_1000fold = cvglmnet(x=X_std.astype(np.float64).copy(),
                              y=Y.astype(np.float64).copy(),
                              family='binomial',
                              mtype="ungrouped",
                              lambdau=fit["lambdau"][:51],
                              intr=False,
                              standardize=False,
                              thresh=1e-8,
                              maxit=1e7,
                              parallel=True,
                              nfolds=1000)

[status]	Parallel glmnet cv with 4 cores


CPU times: user 5.9 s, sys: 673 ms, total: 6.57 s
Wall time: 2min 1s


## Approximate Leave-One-Out-CV via ACV

In [7]:
# Coefficient paths from the reference fit, one column per lambda value.
# Only the first 51 columns of fit["beta"] are taken, matching the 51-entry
# lambda slice passed to cvglmnet (which may ignore some lambda values).
wV = np.zeros((N, 51))
wV[:, :] = fit["beta"][:, :51]

In [8]:
%%time
# Approximate leave-one-out CV (ACV) for every lambda on the path.
# The number of lambda values is taken from wV so that the progress bar range
# and the loop always agree (the bar previously stopped at 49 while the loop
# ran up to index 50).
n_lambda = wV.shape[1]
fp = FloatProgress(min=0, max=n_lambda - 1, description="calculating...")
display(fp)

LOOE_list = []  # estimated leave-one-out error per lambda
ERR_list = []  # second value returned by acv_logit; plotted later as the error bar
for ilam in range(n_lambda):
    fp.value = ilam
    # acv_logit is given the weights as a (1, N) row vector.
    LOOE, ERR = acv.acv_logit(wV[:, ilam].reshape(N, 1).transpose(), X_std, Ycode)
    LOOE_list.append(LOOE)
    ERR_list.append(ERR)

A Jupyter Widget

CPU times: user 620 ms, sys: 98.2 ms, total: 719 ms
Wall time: 324 ms


## Approximate Leave-One-Out-CV via SAACV

In [9]:
%%time
# Approximate leave-one-out CV via SAACV for every lambda on the path.
# The progress bar spans the lambda values reported by the reference fit.
fp = FloatProgress(description="calculating...",
                   min=0,
                   max=fit["lambdau"].shape[0] - 1)
display(fp)

LOOE_SA_list, ERR_SA_list = [], []
for ilam in range(51):
    fp.value = ilam
    # saacv_logit is given the weights as a (1, N) row vector.
    weights_row = wV[:, ilam].reshape(N, 1).transpose()
    looe_sa, err_sa = acv.saacv_logit(weights_row, X_std, Ycode)
    LOOE_SA_list.append(looe_sa)
    ERR_SA_list.append(err_sa)

A Jupyter Widget

CPU times: user 226 ms, sys: 37.3 ms, total: 263 ms
Wall time: 134 ms


## Comparing result

In [10]:
###########
# Upper panel: CV error estimates against lambda, with error bars, for
# ACV, SAACV and the k-fold glmnet runs that are available.
fig = plt.figure(figsize=(6.5,10))
ax = fig.add_subplot(211)

x = cvfit_10fold["lambdau"]

y_acv = LOOE_list
err_acv = ERR_list

y_saacv = LOOE_SA_list
err_saacv = ERR_SA_list

# glmnet's cvm / cvsd are rescaled by 1/2 and 1/sqrt(2) respectively
# (presumably to put the deviance on the same scale as the ACV estimate --
# TODO confirm against the acv package).
y_glm_10fold = cvfit_10fold["cvm"] / 2.0
err_glm_10fold = cvfit_10fold["cvsd"] / np.sqrt(2.0)

ax.errorbar(x, y_acv, yerr=err_acv, fmt="o",
            capsize=5, ms=4, label="ACV", alpha=0.8)
ax.errorbar(x, y_saacv, yerr=err_saacv, fmt="x",
            capsize=5, ms=4, label="SAACV", alpha=0.8)
ax.errorbar(x, y_glm_10fold, yerr=err_glm_10fold, fmt="v",
            capsize=5, ms=4, label="$10$-fold(glm)", alpha=0.8)
# The 100- and 1000-fold curves are optional: their results are None when the
# corresponding (slow) cell was skipped.
if cvfit_100fold is not None:
    y_glm_100fold = cvfit_100fold["cvm"] / 2.0
    err_glm_100fold = cvfit_100fold["cvsd"] / np.sqrt(2.0)
    ax.errorbar(x, y_glm_100fold, yerr=err_glm_100fold, fmt="^",
                capsize=5, ms=4, label="$100$-fold(glm)", alpha=0.8)

if cvfit_1000fold is not None:
    y_glm_1000fold = cvfit_1000fold["cvm"] / 2.0
    err_glm_1000fold = cvfit_1000fold["cvsd"] / np.sqrt(2.0)
    # Use the 1000-fold error bars here (the 100-fold ones were passed before,
    # which is wrong and fails when the 100-fold run was skipped).
    ax.errorbar(x, y_glm_1000fold, yerr=err_glm_1000fold, fmt="^",
                capsize=5, ms=4, label="$1000$-fold(glm)", alpha=0.8)

ax.grid()
ax.legend()
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim([min(x) * 0.8, max(x)])

ax.set_title("Simulated data, $N_p=${0}, $N=${1}".format(2, N))
ax.set_ylabel("CV errors")

ax.set_xlabel("$λ$")

###########
# Lower panel: ratio of each approximate LOO estimate (ACV, SAACV) to every
# available k-fold glmnet estimate, plotted against lambda.

ax = fig.add_subplot(212)

x = cvfit_10fold["lambdau"]

y_acv = LOOE_list
y_saacv = LOOE_SA_list

# Each baseline is (legend suffix, line style, k-fold mean error).
# The 100- and 1000-fold entries exist only when those runs were executed.
y_glm_10fold = cvfit_10fold["cvm"] / 2.0
baselines = [("10fold", "-", y_glm_10fold)]
if cvfit_100fold is not None:
    y_glm_100fold = cvfit_100fold["cvm"] / 2.0
    baselines.append(("100fold", "--", y_glm_100fold))
if cvfit_1000fold is not None:
    y_glm_1000fold = cvfit_1000fold["cvm"] / 2.0
    baselines.append(("1000fold", "-.", y_glm_1000fold))

# All ACV curves (blue circles) are drawn first, then all SAACV curves
# (red triangles), so the legend order is the same as before.
estimators = ((y_acv, "ACV", "o", "b"), (y_saacv, "SAACV", "^", "r"))
for estimate, tag, marker, colour in estimators:
    for suffix, dash, y_ref in baselines:
        ax.plot(x, estimate / y_ref, dash + marker,
                label=tag + "/" + suffix, c=colour, ms=4)

ax.grid()
ax.legend()
ax.set_xscale("log")
ax.set_xlim([min(x) * 0.8, max(x)])

ax.set_title("Simulated data, $N_p=${0}, $N=${1} \n relative difference".format(2, N))
ax.set_ylabel("relative diff")

ax.set_xlabel("$λ$")

fig.tight_layout()

<IPython.core.display.Javascript object>