In [3]:
import matplotlib.pyplot as plt
import numpy as np
# import dsdl
import numpy.random
from typing import Callable, List
import pandas as pd
import json

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# !pip install git+https://github.com/fkunstner/dataset-downloader.git

In [4]:
DATASETS = ['a1a', 'mushrooms', 'a6a', 'w1a', 'w5a', 'ionosphere']
METHODS = ['SGD', 'AdaSpider', 'Spider', 'SpiderBoost', 'SVRG', 'AdaGrad', 'AdaSVRG']

In [5]:
import os
import sys
module_path = "/root/ML-Reproducibility-Challange/"
if module_path not in sys.path:
    sys.path.append(module_path)

# Logistic regression

In [6]:
# for dataset_name in DATASETS:
#     X, y = get_data(dataset_name)
#     pd.DataFrame(np.c_[X, y]).to_csv(f"{dataset_name}.csv", index=False)

In [7]:
from src.optimizers.Optimizer import Optimizer

In [8]:
def get_data(dataset_name: str):
    """
    :param dataset_name: Name of the dataset from dsdl module.
    :return: (X, y) train and target data.
    """
    ds = dsdl.load(dataset_name)
    X, y = ds.get_train()
    X = X.toarray()
    y = y.reshape(-1, 1)
    return X, y

def build_model(X, y):
    """
    Joins weights
    :param X: shape=(N, D). Train data
    :param y: shape=(N, 1). Target data
    :return: shape=(N, D+1). Built model for logistic regression.
    """
    return np.c_[np.zeros((y.shape[0], 1)), X]

def get_initial_weights(tx):
    """
    Returns weights initialized from the uniform distribution [0, 1].
    :param tx: shape=(N, D). Build model
    :return: shape=(D, 1) Initial weights
    """
    np.random.seed(2022)
    return np.zeros(shape=(tx.shape[1], 1))

def test_method(method: Optimizer,
                initial_weights,
                tx,
                y,
                max_iter: int,
                *parameter):
        """
        :param method: Optimization method implementation from src optimizers module.
        :param dataset_name: Name of the dataset from dsdl module.
        :param max_iter: Number of iterations to test.
        :param parameters optional: Dataclass containing parameters used int optimization method.
        :return: List of gradients from optimization method.
        """
        gradients, loss = method.optimize(initial_weights, tx, y, max_iter)
        return [np.linalg.norm(grad, 2) for grad in gradients], loss

In [13]:
from src.optimizers.SGD import SGD
from src.optimizers.AdaSpider import AdaSpider
from src.optimizers.Spider import Spider
from src.optimizers.SpiderBoost import SpiderBoost
from src.optimizers.SVRG import SVRG
from src.optimizers.AdaGrad import AdaGrad
from src.optimizers.AdaSVRG import AdaSVRG

In [32]:
N_ORACLE_CALLS = 1300

METHODS = [
    # SGD(lambda_=0.0001, q=N_ORACLE_CALLS),
    # AdaSpider(q=N_ORACLE_CALLS),
    # Spider(n_0 = 0.0001, epsilon=0.0001, q=N_ORACLE_CALLS),
    # SpiderBoost(q=N_ORACLE_CALLS),
    SVRG(lambda_=0.0001, q=N_ORACLE_CALLS)
    # AdaGrad(lambda_=0.0001, epsilon= 0.0001, q=N_ORACLE_CALLS),
    # AdaSVRG(lambda_=0.01, q=N_ORACLE_CALLS)
]

## Run simulations

In [27]:
# run 2 # 20 mins on 8 cpu avs ml.c5.2xlarge
# run 1 # 1 hour later on the 4 cpu machine 

In [None]:
ITERATIONS = 100000
N_RUNS = 5

all_gradients = []

datasets_data = {}
for i, dataset_name in enumerate(DATASETS):

    print("Dataset", dataset_name)

    X, y = get_data(dataset_name)
    tx = build_model(X, y)
    initial_weights = get_initial_weights(tx)

    methods_data = {}
    for method in METHODS:
        print("Method", method.name)

        gradients_5_runs = list()
        for _ in range(N_RUNS):
            gradients, _ = test_method(method, initial_weights, tx, y, ITERATIONS)
            gradients_5_runs.append(gradients)
        gradients_mean = np.mean(gradients_5_runs, axis=0)

        stddev = np.std(gradients_5_runs, axis=0)
        lower = gradients_mean - stddev
        upper = gradients_mean + stddev

        methods_data[method.name] = {
            "gradient_mean": list(gradients_mean),
            "lower": list(lower),
            "upper": list(upper),
            "n_runs": N_RUNS,
            "n_iterations": ITERATIONS
        }
        datasets_data[dataset_name] = methods_data

    break  # plot single dataset
print("DONE")

Dataset a1a
Method SVRG
Full grad, iter: 0
Full grad, iter: 1300


  return 1.0 / (1 + np.exp(-t))


Full grad, iter: 2600
Full grad, iter: 3900
Full grad, iter: 5200
Full grad, iter: 6500
Full grad, iter: 7800
Full grad, iter: 9100
Full grad, iter: 10400
Full grad, iter: 11700
Full grad, iter: 13000
Full grad, iter: 14300
Full grad, iter: 15600
Full grad, iter: 16900
Full grad, iter: 18200
Full grad, iter: 19500
Full grad, iter: 20800
Full grad, iter: 22100
Full grad, iter: 23400
Full grad, iter: 24700
Full grad, iter: 26000
Full grad, iter: 27300
Full grad, iter: 28600
Full grad, iter: 29900
Full grad, iter: 31200
Full grad, iter: 32500
Full grad, iter: 33800
Full grad, iter: 35100
Full grad, iter: 36400
Full grad, iter: 37700
Full grad, iter: 39000
Full grad, iter: 40300
Full grad, iter: 41600
Full grad, iter: 42900
Full grad, iter: 44200
Full grad, iter: 45500
Full grad, iter: 46800
Full grad, iter: 48100
Full grad, iter: 49400
Full grad, iter: 50700
Full grad, iter: 52000
Full grad, iter: 53300
Full grad, iter: 54600
Full grad, iter: 55900
Full grad, iter: 57200
Full grad, iter: 

In [None]:
with open('run_6_svrg_100_000.json', 'w') as file:
    json.dump(datasets_data, file)

In [57]:
# get the mean length of gradients
print("DONE")
min_length = None
for key, val in datasets_data['a1a'].items():
    current_length = len(val['gradient_mean'])
    if min_length is None:
        min_length = current_length
    elif min_length > current_length:
        min_length = current_length
min_length

DONE


100

In [20]:
datasets_data

NameError: name 'datasets_data' is not defined

In [6]:
with open("run_5.json", 'r') as file:
    datasets_data = json.load(file)

In [13]:

datasets_data['a1a']['SGD'].keys()
# datasets_data['a1a']['AdaSpider']
# datasets_data['a1a']['Spider']
# datasets_data['a1a']['SpiderBoost']
# datasets_data['a1a']['SVRG']
# datasets_data['a1a']['AdaGrad']
# datasets_data['a1a']['AdaSVRG']

dict_keys(['gradient_mean', 'lower', 'upper', 'n_runs', 'n_iterations'])

### Plot

In [None]:
X_LABEL = "Stochastic oracle calls"
Y_LABEL = "||\u0394f(x)||^2"

fig, ax = plt.subplots(1, 1, figsize=(10, 5), sharey=False, sharex=False)  # 3, 2

for i, dataset_name in enumerate(DATASETS):
    print("Dataset", dataset_name)

    methods_data = datasets_data[dataset_name]

    # sbplt = ax[i%3, i%2]
    sbplt = ax

    for method_name, method_data in methods_data.items():
        print("Method", method_name)
        gradients_mean = method_data['gradient_mean']
        lower = method_data['lower']
        upper = method_data['upper']
        n_iterations = method_data['n_iterations']

        sbplt.plot(gradients_mean, label=method_name)
        sbplt.fill_between(list(range(len(gradients_mean[:min_length]))), lower[:min_length], upper[:min_length], alpha=0.25,
                           facecolor='red', edgecolor='red')

    sbplt.set_xscale('log')
    sbplt.set_yscale('log')
    # set y tics
    # print(list(np.arange(1e-24))[::-1])
    # sbplt.set_ytics(list(np.arange(1e-24))[::-1])
    sbplt.set_title(dataset_name)
    sbplt.set_xlabel(X_LABEL)
    sbplt.set_ylabel(Y_LABEL)
    sbplt.legend(loc='lower left', fontsize='small')

    break  # plot single dataset

fig.tight_layout(pad=2.0)
fig.savefig(f'tests_logistic_regression_run_6_SVRG.jpg', dpi=300)

# plot_data()

Do parameter sweep

In [15]:
from itertools import product

In [16]:
powers = np.array([range(-5, 0)], dtype=float)
parameters = 10**powers.flatten()

In [17]:
REDUCTION_STEP = 100

METHODS = [
    SGD(lambda_=0.01, q=REDUCTION_STEP),
    # AdaSpider(q=REDUCTION_STEP),
    Spider(n_0 = 1, epsilon=0.01, lambda_=0.01, q=REDUCTION_STEP),
    SpiderBoost(lambda_=0.01, q=REDUCTION_STEP),
    SVRG(lambda_=0.001, q=REDUCTION_STEP),
    AdaGrad(lambda_=0.5, epsilon= 0.00001, q=REDUCTION_STEP),
    AdaSVRG(lambda_=0.1, epsilon= 0.00001, q=REDUCTION_STEP)
]

DF_COLUMNS = ['method_name', 'loss', 'parameters']
df = pd.DataFrame(columns=DF_COLUMNS)

dataset_name = 'a1a'
ITERATIONS = 3000
print("Dataset", dataset_name, "Iterations: ", ITERATIONS)

powers = np.array([range(-5, 0)], dtype=float)
parameters = 10**powers.flatten()

X, y = get_data(dataset_name)
tx = build_model(X, y)
initial_weights = get_initial_weights(tx)

print("Parameters values: ", parameters)

for method in METHODS:
    print("Method", method.name)
    if method.n_params_to_tune == 2:
        for param in list(product(parameters, parameters)):
            method.set_params(param[0], param[1])
            _, losses = test_method(method, initial_weights, tx, y, ITERATIONS)
            df = df.append(dict(zip(DF_COLUMNS, [method.name, np.sum(losses), str(param)])), ignore_index=True)
    
    elif method.n_params_to_tune == 3:
        for param in list(product(parameters, parameters)):
            method.set_params(param[0], param[1], param[2])
            _, losses = test_method(method, initial_weights, tx, y, ITERATIONS)
            df = df.append(dict(zip(DF_COLUMNS, [method.name, np.sum(losses), str(param)])), ignore_index=True)

    
    elif method.n_params_to_tune == 1:
        for param in parameters:
            method.set_params(param)
            _, losses = test_method(method, initial_weights, tx, y, ITERATIONS)
            df = df.append(dict(zip(DF_COLUMNS, [method.name, np.sum(losses), str(param)])), ignore_index=True)
    else:
        _, losses = test_method(method, initial_weights, tx, y, ITERATIONS)
        df = df.append(dict(zip(DF_COLUMNS, [method.name, np.sum(losses), str(param)])), ignore_index=True)

Dataset a1a Iterations:  3000
Parameters values:  [1.e-05 1.e-04 1.e-03 1.e-02 1.e-01]
Method SGD
Method Spider
Method SpiderBoost
Method SVRG
Method AdaGrad


  first_component = np.log(1 + np.exp(tx.dot(w)))


Method AdaSVRG


In [19]:
df.to_csv("parameters_sweep_4.csv", index=False)

In [22]:
df

Unnamed: 0,method_name,loss,parameters
0,SGD,1855.114364,1e-05
1,SGD,323.886964,0.0001
2,SGD,7221.606880,0.001
3,SGD,78914.015151,0.01
4,SGD,612348.603799,0.1
...,...,...,...
85,AdaSVRG,3622.251210,"(0.1, 1e-05)"
86,AdaSVRG,3622.496966,"(0.1, 0.0001)"
87,AdaSVRG,3622.494324,"(0.1, 0.001)"
88,AdaSVRG,3622.497517,"(0.1, 0.01)"


In [21]:
df['loss'] = df['loss'].abs()
df.groupby('method_name').min()

Unnamed: 0_level_0,loss,parameters
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AdaGrad,2059.296693,"(0.0001, 0.0001)"
AdaSVRG,3622.25121,"(0.0001, 0.0001)"
SGD,323.886964,0.0001
SVRG,385412.914067,0.0001
Spider,1543.390569,"(0.0001, 0.0001)"
SpiderBoost,1603.184718,0.0001


In [65]:
# METHODS = ['AdaSpider', 'Spider']
# ITERATIONS = 100
#
# def plot_data():
#     # Write your code to make 4x4 panel here
#     X_LABEL = "Stochastic oracle calls"
#     Y_LABEL = "||\u0394f(x)||^2"
#
#     fig, ax = plt.subplots(3,2,figsize=(16,16), sharey=False, sharex=False)
#
#     for i, dataset_name in enumerate(DATASETS):
#         sbplt = ax[i%3, i%2]
#         print(dataset_name)
#         for method in METHODS:
#             if method == "Spider":
#                 spider_params = SpiderParam(100, 5, 0.05)
#                 gradients = test_method(Spider, dataset_name, ITERATIONS, spider_params)
#             else:
#                 gradients = test_method(ADASpider, dataset_name, ITERATIONS)
#             gradients = [np.linalg.norm(grad, 2) for grad in gradients]
#             sbplt.plot(gradients, label=method)
#
#         sbplt.set_xscale('log')
#         sbplt.set_title(dataset_name)
#         sbplt.set_xlabel(X_LABEL)
#         sbplt.set_ylabel(Y_LABEL)
#         sbplt.legend(loc='lower left')
#
#         break  # plot single dataset
#
#     fig.tight_layout(pad=2.0)
#     # fig.savefig('tests_logistic_regression.jpg', dpi=150)
#
# plot_data()