# <center> ZO-Logistic Regression </center>
    

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from models import logisticReg
from simus import run_log
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")
from tqdm.notebook import tqdm

## Generate dataset for classification

In [None]:
# without intercept
def simu_block(seed,n_samples,n_features,puiss,block_size,noise):
    """Simulate a binary classification dataset with block-wise feature scales.

    Features are drawn block by block from centred Gaussians: block ``j``
    (0-based) has standard deviation ``(j + 1) ** (-puiss)``.  The columns are
    then shuffled, a ground-truth vector is drawn uniformly in [0, 1), and the
    labels are obtained by thresholding the (optionally noisy) sigmoid of
    ``X @ ground_truth`` at 0.5.  No intercept is used.

    When ``n_features`` is not a multiple of ``block_size`` the remaining
    columns form one last, shorter block (they used to be left as all-zero
    columns).  For sizes that divide evenly the random stream, and therefore
    the output, is unchanged.

    Parameters
    ----------
    seed : int
        Passed to ``np.random.seed``; this reseeds NumPy's *global* generator.
    n_samples : int
        Number of rows of ``X``.
    n_features : int
        Number of columns of ``X``.
    puiss : int or float
        Decay exponent of the per-block standard deviation.
    block_size : int
        Number of consecutive columns sharing the same standard deviation.
    noise : float
        Standard deviation of the Gaussian noise added to the sigmoid output
        before thresholding; no noise is added when ``noise <= 0``.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_features)
        Simulated design matrix with shuffled columns.
    y : numpy.ndarray, shape (n_samples,)
        Labels, ``-1.0`` where the (noisy) sigmoid is ``<= 0.5`` and ``1.0``
        elsewhere.

    Raises
    ------
    ValueError
        If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")
    np.random.seed(seed)
    X = np.zeros((n_samples, n_features))
    # Step through the columns block by block; the final block is clipped to
    # n_features so that no column is left unfilled.
    for j, start in enumerate(range(0, n_features, block_size)):
        stop = min(start + block_size, n_features)
        X[:, start:stop] = np.random.normal(scale=(j + 1) ** (-puiss),
                                            size=(n_samples, stop - start))
    # Shuffle the columns so that the block structure is hidden.
    indices = np.arange(n_features)
    np.random.shuffle(indices)
    X[:, :] = X[:, indices]
    ground_truth = np.random.uniform(low=0, high=1, size=n_features)
    # Sigmoid of the linear score, optionally perturbed by Gaussian noise.
    h = 1 / (1 + np.exp(-X @ ground_truth))
    if noise > 0.0:
        h += np.random.normal(scale=noise, size=h.shape)
    y = np.ones(n_samples)
    y[h <= 0.5] = -1
    return X, y

In [None]:
# Problem size
n_samples = 10000    # number of samples
n_features = 250     # dimension of the problem

# Settings of the synthetic data generator (see simu_block)
seed = 0
puiss = 5            # decay exponent of the per-block feature scale
block_size = 5       # number of columns sharing one scale
noise = 0.01         # std of the noise added before thresholding the labels

# Regularization parameter, set to 1/n.
# NOTE: Python NFKC-normalizes identifiers, so `𝜆` (mathematical italic lambda)
# and the plain Greek `λ` used in other cells name the same variable.
𝜆 = 1/(n_samples)

# Simulate the classification dataset
X, y = simu_block(
    seed=seed,
    n_samples=n_samples,
    n_features=n_features,
    puiss=puiss,
    block_size=block_size,
    noise=noise,
)
# Sum of the labels (labels are -1/+1, so this shows the class imbalance)
print(y.sum())

In [None]:
# Distinct label values together with the number of samples carrying each one.
np.unique(y, return_counts=True)

## Compute true solution

In [None]:
# Regularization parameter 1/n, with n the number of rows of X.
λ = 1/len(X)
# scikit-learn takes the inverse regularization strength C; here C = 1/(n*λ).
# NOTE(review): presumably chosen so that the fitted objective matches
# mean log-loss + (λ/2)*||w||^2 -- confirm against the scikit-learn docs.
c = 1/(len(X)*λ)
# Reference model: no intercept, tight tolerance.
log_sk = LogisticRegression(
    C=c,
    fit_intercept=False,
    tol=1e-6,
)
log_sk.fit(X=X, y=y)
# coef_ is indexed at 0 to get the single coefficient vector.
coeff = log_sk.coef_[0]

In [None]:
# Break the regularized logistic objective at the scikit-learn solution into
# its two parts (printed for inspection only).
# np.logaddexp(0, z) evaluates log(1 + exp(z)) without overflowing for large z,
# unlike the literal np.log(1 + np.exp(z)).
data_term = np.logaddexp(0, np.multiply(-y, X@coeff)).mean()
# np.sum instead of the builtin sum: vectorized reduction over the ndarray.
reg_term = (𝜆/2)*np.sum(coeff**2)
print('data_term:',data_term)
print('reg_term :',reg_term)
# Optimal loss: the project model evaluated on all samples at `coeff`.
log = logisticReg(X=X,y=y,λ=λ,fit_intercept=False)
loss_opt = log.loss(batch=np.arange(X.shape[0]),w=coeff)
print('loss_opt :',loss_opt)

## Simulation parameters

In [None]:
# Problem dimensions are read back from the data matrix.
n_samples, n_features = X.shape

# Experiment budget
N_exp = 20         # number of experiments
N = 200            # number of passes over coordinates
batch_size = 1

# Learning-rate schedule
a = 10             # numerator of learning rate
t0 = 5
alpha_power = 1    # power in the learning rate

# Gradient smoothing
gamma = 1          # numerator in gradient factor smoothing
mu_power = 1       # power in the gradient factor smoothing

# Settings forwarded to the 'mus' and 'nes' runs below
eta = 0.5
fixed = False      # alternative setting: fixed = True
T = int(np.sqrt(n_features))  # size of exploration (alternative: T = n_features)

verbose = False    # to display information

print('𝜆=  ',𝜆)
print('eta=',eta)
print('T=  ',T)

# Run different ZO methods

## Full gradient estimate

In [None]:
# Run the 'full' method; only the third return value (the losses) is kept.
# The options T, fixed, eta, importance and gains are passed as None.
_, _, loss_full = run_log(
    X=X, y=y, 𝜆=𝜆,
    fit_intercept=False,
    batch_size=batch_size,
    method='full',
    N_exp=N_exp, N=N,
    T=None,
    gamma=gamma, mu_power=mu_power,
    a=a, t0=t0, alpha_power=alpha_power,
    fixed=None, eta=None, importance=None, gains=None,
    verbose=False,
)
# Mean and standard deviation along axis 0
# (presumably the N_exp repetitions -- confirm against run_log).
l_ful = np.mean(loss_full, axis=0)
std_ful = np.std(loss_full, axis=0)

## Uniform coordinate sampling

In [None]:
# Run the 'uni' (uniform coordinate sampling) method; only the losses are kept.
# The options T, fixed, eta, importance and gains are passed as None.
_, _, loss_uni = run_log(
    X=X, y=y, 𝜆=𝜆,
    fit_intercept=False,
    batch_size=batch_size,
    method='uni',
    N_exp=N_exp, N=N,
    T=None,
    gamma=gamma, mu_power=mu_power,
    a=a, t0=t0, alpha_power=alpha_power,
    fixed=None, eta=None, importance=None, gains=None,
    verbose=False,
)
# Mean and standard deviation along axis 0
# (presumably the N_exp repetitions -- confirm against run_log).
l_uni = np.mean(loss_uni, axis=0)
std_uni = np.std(loss_uni, axis=0)

## Musketeer biased (with different gains: Average, Absolute Value, Square)

In [None]:
# Run the 'mus' method once per gain type ('avg', 'abs', 'square'), in that
# order, with importance=False.  All other arguments are shared by the runs.
mus_losses = {}
for gain in ('avg', 'abs', 'square'):
    _, _, mus_losses[gain] = run_log(
        X=X, y=y, 𝜆=𝜆,
        fit_intercept=False,
        batch_size=batch_size,
        method='mus',
        N_exp=N_exp, N=N,
        T=T,
        gamma=gamma, mu_power=mu_power,
        a=a, t0=t0, alpha_power=alpha_power,
        fixed=fixed, eta=eta, importance=False, gains=gain,
        verbose=False,
    )

# Expose each run under its own name.
loss_mus_avg = mus_losses['avg']
loss_mus_abs = mus_losses['abs']
loss_mus_sqr = mus_losses['square']

# Mean and standard deviation along axis 0 for every gain type
# (presumably the N_exp repetitions -- confirm against run_log).
l_avg = np.mean(loss_mus_avg, axis=0)
std_avg = np.std(loss_mus_avg, axis=0)
l_abs = np.mean(loss_mus_abs, axis=0)
std_abs = np.std(loss_mus_abs, axis=0)
l_sqr = np.mean(loss_mus_sqr, axis=0)
std_sqr = np.std(loss_mus_sqr, axis=0)

## Gaussian smoothing estimate (Nesterov-Spokoiny)

In [None]:
# Run the 'nes' (Gaussian smoothing) method; only the losses are kept.
# T, fixed, eta, importance and gains are passed exactly as in the original
# call (NOTE(review): whether 'nes' uses them cannot be told from this cell).
_, _, loss_nes = run_log(
    X=X, y=y, 𝜆=𝜆,
    fit_intercept=False,
    batch_size=batch_size,
    method='nes',
    N_exp=N_exp, N=N,
    T=T,
    gamma=gamma, mu_power=mu_power,
    a=a, t0=t0, alpha_power=alpha_power,
    fixed=fixed, eta=eta, importance=False, gains='square',
    verbose=False,
)
# Mean and standard deviation along axis 0
# (presumably the N_exp repetitions -- confirm against run_log).
l_nes = np.mean(loss_nes, axis=0)
std_nes = np.std(loss_nes, axis=0)

# save all results

In [None]:
#np.save('loss_ful_log_zo_puiss_{}.npy'.format(puiss),loss_full)
#np.save('loss_uni_log_zo_puiss_{}.npy'.format(puiss),loss_uni)
#np.save('loss_avg_log_zo_puiss_{}.npy'.format(puiss),loss_mus_avg)
#np.save('loss_abs_log_zo_puiss_{}.npy'.format(puiss),loss_mus_abs)
#np.save('loss_sqr_log_zo_puiss_{}.npy'.format(puiss),loss_mus_sqr)
#np.save('loss_nes_log_zo_puiss_{}.npy'.format(puiss),loss_nes)

# Run experiments in different settings (Appendix E)

In [None]:
# Settings of the synthetic data generator
seed = 0
puiss = 5
block_size = 5
noise = 0.01

# Experiment budget
N_exp = 20         # number of experiments
N = 200            # number of passes over coordinates
batch_size = 1

# Learning-rate schedule
a = 10             # numerator of learning rate
t0 = 5
alpha_power = 1    # power in the learning rate

# Gradient smoothing
gamma = 1          # numerator in gradient factor smoothing
mu_power = 1       # power in the gradient factor smoothing

# Settings forwarded to the 'mus' and 'nes' runs
eta = 0.5
fixed = False      # alternative setting: fixed = True

verbose = False    # to display information

# Grid over the number of samples (outer loop) and features (inner loop).
for n_samples in [1000, 2000, 5000]:
    for n_features in [20, 50, 100, 200]:
        print('n=',n_samples)
        print('p=',n_features)
        T = int(np.sqrt(n_features))  # size of exploration

        # Generate data for classification
        X, y = simu_block(
            seed=seed,
            n_samples=n_samples,
            n_features=n_features,
            puiss=puiss,
            block_size=block_size,
            noise=noise,
        )

        # Reference solution from scikit-learn with λ = 1/n and C = 1/(n*λ)
        λ = 1/X.shape[0]
        c = 1/(X.shape[0]*λ)
        log_sk = LogisticRegression(C=c, fit_intercept=False, tol=1e-6)
        log_sk.fit(X=X, y=y)
        coeff = log_sk.coef_[0]

        # Objective at the reference solution: mean log-loss plus ridge penalty
        # (`𝜆` and `λ` are the same identifier after NFKC normalization).
        data_term = np.log(1+np.exp(np.multiply(-y, X@coeff))).mean()
        reg_term = (𝜆/2)*sum(coeff**2)
        loss_opt = data_term + reg_term

        # Run the different ZO methods; only the losses are kept from each run.
        _, _, loss_full = run_log(
            X=X, y=y, 𝜆=𝜆,
            fit_intercept=False,
            batch_size=batch_size,
            method='full',
            N_exp=N_exp, N=N,
            T=None,
            gamma=gamma, mu_power=mu_power,
            a=a, t0=t0, alpha_power=alpha_power,
            fixed=None, eta=None, importance=None, gains=None,
            verbose=False,
        )
        _, _, loss_uni = run_log(
            X=X, y=y, 𝜆=𝜆,
            fit_intercept=False,
            batch_size=batch_size,
            method='uni',
            N_exp=N_exp, N=N,
            T=None,
            gamma=gamma, mu_power=mu_power,
            a=a, t0=t0, alpha_power=alpha_power,
            fixed=None, eta=None, importance=None, gains=None,
            verbose=False,
        )
        # 'mus' runs use importance=True in this grid, one run per gain type.
        _, _, loss_avg = run_log(
            X=X, y=y, 𝜆=𝜆,
            fit_intercept=False,
            batch_size=batch_size,
            method='mus',
            N_exp=N_exp, N=N,
            T=T,
            gamma=gamma, mu_power=mu_power,
            a=a, t0=t0, alpha_power=alpha_power,
            fixed=fixed, eta=eta, importance=True, gains='avg',
            verbose=False,
        )
        _, _, loss_abs = run_log(
            X=X, y=y, 𝜆=𝜆,
            fit_intercept=False,
            batch_size=batch_size,
            method='mus',
            N_exp=N_exp, N=N,
            T=T,
            gamma=gamma, mu_power=mu_power,
            a=a, t0=t0, alpha_power=alpha_power,
            fixed=fixed, eta=eta, importance=True, gains='abs',
            verbose=False,
        )
        _, _, loss_sqr = run_log(
            X=X, y=y, 𝜆=𝜆,
            fit_intercept=False,
            batch_size=batch_size,
            method='mus',
            N_exp=N_exp, N=N,
            T=T,
            gamma=gamma, mu_power=mu_power,
            a=a, t0=t0, alpha_power=alpha_power,
            fixed=fixed, eta=eta, importance=True, gains='square',
            verbose=False,
        )
        _, _, loss_nes = run_log(
            X=X, y=y, 𝜆=𝜆,
            fit_intercept=False,
            batch_size=batch_size,
            method='nes',
            N_exp=N_exp, N=N,
            T=T,
            gamma=gamma, mu_power=mu_power,
            a=a, t0=t0, alpha_power=alpha_power,
            fixed=fixed, eta=eta, importance=False, gains='square',
            verbose=False,
        )

        # Saving is disabled; uncomment to store the excess loss
        # (loss - loss_opt) of every method for this (n, p) pair.
        #np.save('loss_ful_log_n{}_p{}.npy'.format(n_samples,n_features),loss_full-loss_opt)
        #np.save('loss_uni_log_n{}_p{}.npy'.format(n_samples,n_features),loss_uni-loss_opt)
        #np.save('loss_nes_log_n{}_p{}.npy'.format(n_samples,n_features),loss_nes-loss_opt)
        #np.save('loss_avg_is_log_n{}_p{}.npy'.format(n_samples,n_features),loss_avg-loss_opt)
        #np.save('loss_sqr_is_log_n{}_p{}.npy'.format(n_samples,n_features),loss_sqr-loss_opt)
        #np.save('loss_abs_is_log_n{}_p{}.npy'.format(n_samples,n_features),loss_abs-loss_opt)