In [1]:
import pandas as pd
import numpy as np
import chocolate as choco
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from itertools import product
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides any deprecation or fitting warnings the libraries emit.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator so a fresh Run All starts from the same state.
np.random.seed(0)

# Data Loading/Preprocessing

In [2]:
# Load the mammographic-masses table. Column names are supplied explicitly and
# '?' entries are parsed as missing values.
df = (
    pd.read_csv(
        '~/DATA/mammographic_masses.data',
        names=['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity'],
        na_values='?',
    )
    # Discard the BI-RADS column, then every row that still has a missing value.
    .drop(columns='BI-RADS')
    .dropna()
)

In [3]:
# First split: 60% of the rows become the training set. 'Severity' (the last
# column) is the target; the other four columns are the features.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Severity'),
    df['Severity'],
    train_size=0.6,
)
# Second split: the held-out 40% is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    train_size=0.5,
)

In [4]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to all three splits.
# NOTE(review): the splits are overwritten in place, so re-running this cell
# scales already-scaled data; re-run the split cell first.
sc = StandardScaler().fit(X_train)
X_train, X_test, X_val = [sc.transform(part) for part in (X_train, X_test, X_val)]

# Chocolate

In [5]:
# Search space handed to Chocolate: one dictionary per candidate classifier.
# 'algo' holds the estimator class itself and the remaining keys are keyword
# arguments for that class, given either as a fixed value or a choco distribution.
space = [
    # Support-vector classifier.
    dict(
        algo=SVC,
        gamma='auto',
        C=choco.log(-3, 3, 10),
        kernel=choco.choice(['linear', 'poly', 'rbf', 'sigmoid']),
        tol=choco.log(-5, -2, 10),
    ),

    # Random forest.
    dict(
        algo=RandomForestClassifier,
        max_depth=choco.quantized_uniform(2, 10, 2),
        min_samples_leaf=choco.quantized_uniform(2, 10, 2),
        n_estimators=choco.quantized_uniform(25, 525, 25),
    ),

    # Gaussian naive Bayes.
    dict(
        algo=GaussianNB,
        var_smoothing=choco.log(-12, -6, 10),
    ),

    # k-nearest neighbours.
    dict(
        algo=KNeighborsClassifier,
        n_neighbors=choco.quantized_uniform(1, 10, 1),
        weights=choco.choice(['uniform', 'distance']),
        leaf_size=choco.quantized_uniform(15, 315, 20),
        p=choco.choice([1, 2, 3]),
    ),

    # Multi-layer perceptron: every two-layer shape with 2..8 units per layer,
    # followed by every single-layer shape with 2..8 units.
    dict(
        algo=MLPClassifier,
        hidden_layer_sizes=choco.choice(
            [(first, second) for first in range(2, 9) for second in range(2, 9)]
            + [(width,) for width in range(2, 9)]
        ),
        activation=choco.choice(['relu', 'logistic']),
        learning_rate_init=choco.log(-12, -1, 10),
    ),
]

In [6]:
def f1_score(trn_x, trn_y, tst_x, tst_y, algo, **params):
    """Train ``algo(**params)`` and return its negated F1 score as a loss.

    Parameters
    ----------
    trn_x, trn_y
        Features and labels the model is fitted on.
    tst_x, tst_y
        Features and labels the fitted model is scored on.
    algo
        Callable that builds the model; it is invoked as ``algo(**params)``
        and the result must provide ``fit`` and ``predict``.
    **params
        Keyword arguments forwarded unchanged to ``algo``.

    Returns
    -------
    The value of ``skm.f1_score(tst_y, predictions)`` with its sign flipped,
    so that a smaller return value corresponds to a larger F1 score.
    """
    model = algo(**params)
    model.fit(trn_x, trn_y)
    predictions = model.predict(tst_x)
    return -skm.f1_score(tst_y, predictions)

In [7]:
def getResults(conn):
    """Return the recorded trials as a DataFrame with ``_loss`` as first column.

    Parameters
    ----------
    conn : choco.SQLiteConnection or str
        Either a connection object, or a database name. A name ``foo`` is
        opened as ``sqlite:///chocolate_dbs/foo.db``.

    Returns
    -------
    The frame produced by ``conn.results_as_dataframe()``, with its columns
    reordered so that ``_loss`` comes first and the rest keep their order.

    Raises
    ------
    ValueError
        If ``conn`` is neither a ``choco.SQLiteConnection`` nor a string, or
        if the results have no ``_loss`` column.
    """
    # isinstance (rather than ``type(conn) == str``) also accepts str subclasses.
    if isinstance(conn, str):
        conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % conn)
    elif not isinstance(conn, choco.SQLiteConnection):
        raise ValueError(
            "conn must be a choco.SQLiteConnection or a database name (str), "
            "got %s" % type(conn).__name__
        )

    results = conn.results_as_dataframe()

    # Move '_loss' to the front; list.remove raises ValueError if it is absent.
    cols = list(results.columns)
    cols.remove('_loss')
    return results[['_loss'] + cols]

In [8]:
def getBestParams(conn):
    """Return the parameters of the trial with the smallest ``_loss`` as a dict.

    Parameters
    ----------
    conn : choco.SQLiteConnection or str
        Either a connection object, or a database name. A name ``foo`` is
        opened as ``sqlite:///chocolate_dbs/foo.db``.

    Returns
    -------
    dict
        Column name -> value for the best row of
        ``conn.results_as_dataframe()``, without ``_loss`` and without any
        column whose value is missing for that row.

    Raises
    ------
    ValueError
        If ``conn`` is neither a ``choco.SQLiteConnection`` nor a string, if
        no trials are recorded, or if no recorded trial has a loss yet.
    """
    # isinstance (rather than ``type(conn) == str``) also accepts str subclasses.
    if isinstance(conn, str):
        conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % conn)
    elif not isinstance(conn, choco.SQLiteConnection):
        raise ValueError(
            "conn must be a choco.SQLiteConnection or a database name (str), "
            "got %s" % type(conn).__name__
        )

    results = conn.results_as_dataframe()
    if results.empty:
        # Previously surfaced as an opaque IndexError from ``.iloc[0]``.
        raise ValueError("no trials are recorded; there is no best row to return")

    # Ascending sort puts the smallest loss first; missing losses sort last,
    # so the first row has a missing loss only when every row does.
    best = results.sort_values('_loss').iloc[0]
    if pd.isna(best['_loss']):
        # Previously surfaced as a KeyError from ``.drop('_loss')`` because
        # ``dropna()`` had already removed the missing '_loss' entry.
        raise ValueError("no recorded trial has a loss yet")

    return best.dropna().drop('_loss').to_dict()

In [9]:
# Number of configurations to sample and evaluate.
N_RUNS = 10

# Results are stored in an SQLite file under chocolate_dbs/.
# NOTE(review): presumably that directory must already exist — confirm.
conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/db.db")
# Presumably wipes results left over from an earlier run; `clear_db=True` below
# looks redundant with this call — TODO confirm against the Chocolate docs.
conn.clear()
# Alternative searcher kept for reference (random search instead of Bayes):
# searcher = choco.Random(conn, space)
searcher = choco.Bayes(conn, space, clear_db=True)

for _ in range(N_RUNS):
    # `params` is unpacked into f1_score, so it must carry the 'algo' entry of
    # the sampled sub-space together with that estimator's keyword arguments.
    token, params = searcher.next()
    # Fit on the training split and score on the validation split; f1_score
    # returns the negated F1, so a lower loss means a better model.
    loss = f1_score(X_train, y_train, X_val, y_val, **params)
    searcher.update(token, loss)

In [10]:
# NOTE(review): on the recorded run this call raised the KeyError shown below.
# The stored column names look cut off at 63 characters (see the all_results()
# dump and the len() check in the following cells), which is presumably why the
# full-length name cannot be found — confirm.
getResults(conn)

KeyError: 'algo_<class sklearn_ensemble_forest_RandomForestClassifier>_max_depth'

In [11]:
# Dump the rows as stored, to inspect the column names the database holds.
conn.all_results()

[OrderedDict([('id', 1),
              ('_subspace', 0.7147795134408216),
              ('algo_<class sklearn_svm_classes_SVC>_gamma_auto_C',
               0.6374048813052264),
              ('algo_<class sklearn_svm_classes_SVC>_gamma_auto_kernel',
               0.19033483232734738),
              ('algo_<class sklearn_svm_classes_SVC>_gamma_auto_tol',
               0.4400980971224393),
              ('algo_<class sklearn_ensemble_forest_RandomForestClassifier>_max',
               0.2712186112011562),
              ('algo_<class sklearn_ensemble_forest_RandomForestClassifier>_min',
               0.4110605765098101),
              ('algo_<class sklearn_ensemble_forest_RandomForestClassifier>_n_e',
               0.967759944688806),
              ('algo_<class sklearn_naive_bayes_GaussianNB>_var_smoothing',
               0.9256919489165832),
              ('algo_<class sklearn_neighbors_classification_KNeighborsClassifi',
               0.8347478196091109),
              ('algo_<c

In [12]:
# Length of one of the cut-off column names copied from the dump above.
len('algo_<class sklearn_ensemble_forest_RandomForestClassifier>_max')

63