In [1]:
import pandas as pd
import numpy as np
import chocolate as choco
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from itertools import product
import warnings
warnings.filterwarnings("ignore")
np.random.seed(0)

# Data Loading/Preprocessing

In [2]:
df = pd.read_csv('~/DATA/mammographic_masses.data', na_values='?',
                 names=['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity'])
df = df.drop('BI-RADS', axis=1).dropna()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:4], df.iloc[:,4], train_size=0.6)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5)

In [4]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_val = sc.transform(X_val)

# Chocolate

In [5]:
space = [
    {'algo': 'SVC',
        "gamma": 'auto',
        "C": choco.log(-3, 3, 10),
        "kernel": choco.choice(['linear', 'poly', 'rbf', 'sigmoid']),
        "tol": choco.log(-5, -2, 10),},
    
    {'algo': 'RandomForestClassifier',
        "max_depth"       : choco.quantized_uniform(2, 10, 2),
        "min_samples_leaf": choco.quantized_uniform(2, 10, 2),
        "n_estimators"    : choco.quantized_uniform(25, 525, 25),},
    
    {'algo': 'GaussianNB',
        "var_smoothing"   : choco.log(-12, -6, 10)},
    
    {'algo': 'KNeighborsClassifier',
        "n_neighbors"     : choco.quantized_uniform(1, 10, 1),
        "weights"         : choco.choice(['uniform', 'distance']),
        "leaf_size"       : choco.quantized_uniform(15, 315, 20),
        "p"               : choco.choice([1,2,3]),},
    
    {'algo': 'MLPClassifier',
        "hidden_layer_sizes": choco.choice(
            list(product(range(2, 9), range(2, 9))) +
                [(i,) for i in range(2,9)]
            ),
        "activation": choco.choice(['relu', 'logistic']),
        "learning_rate_init": choco.log(-12, -1, 10)},
]

algos = {
    'SVC': SVC,
    'RandomForestClassifier': RandomForestClassifier,
    'GaussianNB': GaussianNB,
    'KNeighborsClassifier': KNeighborsClassifier,
    'MLPClassifier': MLPClassifier
}

In [6]:
def f1_score(trn_x, trn_y, tst_x, tst_y, algo, **params):
    m = algos[algo](**params)
    m.fit(trn_x, trn_y)
    y_pred = m.predict(tst_x)
    return -1*skm.f1_score(tst_y, y_pred)

In [7]:
def getResults(conn):
    if isinstance(conn, choco.SQLiteConnection):
        df = conn.results_as_dataframe()
    elif type(conn) == str:
        conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % conn)
        df = conn.results_as_dataframe()
    else:
        raise ValueError()
    # Reorder Columns
    cols = list(df.columns)
    cols.remove('_loss')
    df = df[['_loss']+cols]
    
    return df

In [8]:
def getBestParams(conn):
    if isinstance(conn, choco.SQLiteConnection):
        df = conn.results_as_dataframe()
    elif type(conn) == str:
        conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % conn)
        df = conn.results_as_dataframe()
    else:
        raise ValueError()
        
    df = df.sort_values('_loss').iloc[0].dropna().drop('_loss')
    return df.to_dict()

In [9]:
N_RUNS = 10

conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/db.db")
conn.clear()

searcher = choco.Bayes(conn, space)

for _ in range(N_RUNS):
    token, params = searcher.next()
    loss = f1_score(X_train, y_train, X_val, y_val, **params)
    searcher.update(token, loss)


In [10]:
getBestParams(conn)

{'algo': 'RandomForestClassifier',
 'max_depth': 2.0,
 'min_samples_leaf': 8.0,
 'n_estimators': 500.0}

In [11]:
getResults(conn)

Unnamed: 0_level_0,_loss,C,activation,algo,gamma,hidden_layer_sizes,kernel,leaf_size,learning_rate_init,max_depth,min_samples_leaf,n_estimators,n_neighbors,p,tol,var_smoothing,weights
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,-0.708571,,,KNeighborsClassifier,,,,215.0,,,,,7.0,2.0,,,distance
1,-0.819672,,,RandomForestClassifier,,,,,,8.0,8.0,425.0,,,,,
2,-0.827957,,,RandomForestClassifier,,,,,,2.0,8.0,500.0,,,,,
3,-0.710059,21.080621,,SVC,auto,,sigmoid,,,,,,,,0.001398,,
4,-0.775956,,,KNeighborsClassifier,,,,95.0,,,,,5.0,3.0,,,uniform
5,-0.821622,,,GaussianNB,,,,,,,,,,,,7.797717e-07,
6,-0.260355,,relu,MLPClassifier,,"(6, 4)",,,1e-06,,,,,,,,
7,-0.826087,,,RandomForestClassifier,,,,,,8.0,8.0,500.0,,,,,
8,-0.674286,,,KNeighborsClassifier,,,,155.0,,,,,1.0,1.0,,,uniform
9,-0.789189,,,KNeighborsClassifier,,,,75.0,,,,,9.0,2.0,,,uniform
