# Logistic Regression with Hyperparameter Optimization (scikit-learn)

<a href="https://colab.research.google.com/github/VertaAI/modeldb-client/blob/master/workflows/demos/sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

import itertools
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

---

## Prepare Data

In [17]:
import os
from urllib.parse import urlparse

import wget


def _download_if_missing(url):
    """Return the local filename for `url`, downloading it only when absent.

    The expected filename is the last path component of the URL. If a file
    with that name already exists in the working directory it is reused, so
    re-running this cell neither re-fetches the data nor leaves behind
    " (1)"-suffixed duplicates from `wget.download`.

    NOTE(review): an existing file is trusted as-is; delete it manually to
    force a fresh download (e.g. after an interrupted transfer).
    """
    filename = os.path.basename(urlparse(url).path)
    if os.path.exists(filename):
        return filename
    return wget.download(url)


train_data_url = "http://s3.amazonaws.com/verta-starter/census-train.csv"
train_data_filename = _download_if_missing(train_data_url)
test_data_url = "http://s3.amazonaws.com/verta-starter/census-test.csv"
test_data_filename = _download_if_missing(test_data_url)

In [18]:
# Display the local filename recorded for the training data.
train_data_filename

'census-train.csv'

In [19]:
# Load the training CSV from the working directory.
# NOTE(review): the path is hard-coded rather than taken from
# `train_data_filename` — confirm the two always agree.
df_train = pd.read_csv("census-train.csv")
# Features: every column except the last, converted to a NumPy array.
X_train = df_train.iloc[:,:-1].values
# Labels: the last column, kept as a pandas Series.
y_train = df_train.iloc[:, -1]

# Preview the first rows of the loaded frame.
df_train.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,>50k
0,44,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,53,7298,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,49,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,53,0,1485,40,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


## Prepare Hyperparameters

In [32]:
# Search space: each key is a keyword argument later passed to
# LogisticRegression, each value the list of options to try for it.
hyperparam_candidates = {
    'C': [1e-4, 1e-1, 1, 10, 1e3],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [15, 28],
}

# Expand the search space into one kwargs dict per combination
# (Cartesian product; the last key varies fastest).
hyperparam_sets = [
    dict(zip(hyperparam_candidates, combo))
    for combo in itertools.product(*hyperparam_candidates.values())
]

## Run Validation

In [33]:
def run_experiment(hyperparams):
    """Fit and score one logistic-regression configuration on a validation split.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments forwarded to ``linear_model.LogisticRegression``.

    Returns
    -------
    None
        The hyperparameters and the validation accuracy are printed.

    Notes
    -----
    Reads the module-level ``X_train`` and ``y_train``. No ``random_state``
    is passed to ``train_test_split``, so the split is not pinned to a seed.
    """
    # create validation split: 80% to fit on, 20% held out for scoring
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # create and train model on the training portion of the split only;
    # fitting on the full X_train would put the held-out rows in the training
    # data and turn the "validation" score into a training score
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    # one print call per result so lines from parallel workers are less
    # likely to be interleaved mid-line
    print("{} Validation accuracy: {:.4f}".format(hyperparams, val_acc))
    
# Evaluate every hyperparameter set using a pool of worker processes.
# Each call prints its own result, so the printed order may not follow
# the order of `hyperparam_sets`.
with Pool() as pool:
    pool.map(run_experiment, hyperparam_sets)

{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7993
{'C': 0.0001, 'solver': 'liblinear', 'max_iter': 15} Validation accuracy: 0.7999
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7995
{'C': 0.1, 'solver': 'liblinear', 'max_iter': 15} Validation accuracy: 0.8000
{'C': 0.0001, 'solver': 'liblinear', 'max_iter': 28} Validation accuracy: 0.7959
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7904
{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7910
{'C': 1, 'solver': 'liblinear', 'max_iter': 15} Validation accuracy: 0.7879
{'C': 0.1, 'solver': 'liblinear', 'max_iter': 28} Validation accuracy: 0.8455
{'C': 1, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7876
{'C': 1, 'solver': 'liblinear', 'max_iter': 28} Validation accuracy: 0.8478
{'C': 10, 'solver': 'liblinear', 'max_iter': 15} Validation accuracy: 0.7883
{'C': 10, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7872
{'C': 

## Train on Full Dataset

In [22]:
# Hyperparameters used for the final model, fitted on the whole training set.
final_hyperparams = {'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 15}
model = linear_model.LogisticRegression(multi_class='auto', **final_hyperparams)
model.fit(X_train, y_train)

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=15,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Calculate Accuracy on Full Training Set

In [23]:
# Accuracy on the same data the model was fitted on (X_train, y_train);
# this measures fit to the training set, not held-out performance.
train_acc = model.score(X_train, y_train)
print("Training accuracy: {:.4f}".format(train_acc))

Training accuracy: 0.7902


---