In [1]:
import pandas as pd
import numpy as np
import chocolate as choco
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from itertools import product
import warnings
warnings.filterwarnings("ignore")
np.random.seed(0)

# Data Loading/Preprocessing

In [2]:
# Read the raw file ('?' marks a missing value), discard the BI-RADS column,
# then keep only the rows that have no missing values left.
df = (
    pd.read_csv(
        '~/DATA/mammographic_masses.data',
        names=['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity'],
        na_values='?',
    )
    .drop(columns='BI-RADS')
    .dropna()
)

In [3]:
# The first four columns are the features, the fifth is the target.
# 60% goes to training; the remaining 40% is halved into validation and test.
features, target = df.iloc[:, :4], df.iloc[:, 4]
X_train, X_rest, y_train, y_rest = train_test_split(features, target, train_size=0.6)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, train_size=0.5)

In [4]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to every split.
sc = StandardScaler().fit(X_train)
X_train, X_val, X_test = (sc.transform(split) for split in (X_train, X_val, X_test))

# [Chocolate](https://chocolate.readthedocs.io/tutorials/basics.html)

[Define your search space](https://chocolate.readthedocs.io/api/space.html)

In [5]:
# One search space per classifier. Each key is a keyword argument of the
# corresponding estimator; each value is either a fixed setting or a
# Chocolate distribution to sample from.

svc_space = dict(
    gamma='auto',
    C=choco.log(-3, 3, 10),
    kernel=choco.choice(['linear', 'poly', 'rbf', 'sigmoid']),
    tol=choco.log(-5, -2, 10),
)

rfc_space = dict(
    max_depth=choco.quantized_uniform(2, 10, 2),
    min_samples_leaf=choco.quantized_uniform(2, 10, 2),
    n_estimators=choco.quantized_uniform(25, 525, 25),
)

gnb_space = dict(
    var_smoothing=choco.log(-12, -6, 10),
)

knn_space = dict(
    n_neighbors=choco.quantized_uniform(1, 10, 1),
    weights=choco.choice(['uniform', 'distance']),
    leaf_size=choco.quantized_uniform(15, 315, 20),
    p=choco.choice([1, 2, 3]),
)

mlp_space = dict(
    # Candidate architectures: every two-layer combination of widths 2..8,
    # followed by every single-layer network of width 2..8.
    hidden_layer_sizes=choco.choice(
        [(first, second) for first in range(2, 9) for second in range(2, 9)]
        + [(width,) for width in range(2, 9)]
    ),
    activation=choco.choice(['relu', 'logistic']),
    learning_rate_init=choco.log(-12, -1, 10),
)

Choose the metric to optimize. Chocolate minimizes the loss, so the F1 score is negated.

In [6]:
def f1_score(trn_x, trn_y, tst_x, tst_y, model, **params):
    """Fit ``model(**params)`` and return the negated F1 score on held-out data.

    Parameters
    ----------
    trn_x, trn_y
        Data the estimator is fitted on.
    tst_x, tst_y
        Data the fitted estimator is scored on.
    model
        Estimator class (not an instance); it is called as ``model(**params)``.
    **params
        Keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    The F1 score multiplied by -1, so that a lower value means a better model.
    """
    estimator = model(**params)
    fitted = estimator.fit(trn_x, trn_y)
    predictions = fitted.predict(tst_x)
    score = skm.f1_score(tst_y, predictions)
    return -1 * score

Choose your [sampler](https://chocolate.readthedocs.io/api/sample.html) or [searcher](https://chocolate.readthedocs.io/api/search.html)

In [14]:
def run_chocolate(model, param_space, X_train, y_train, X_val, y_val, clear_db=False, n_runs=10):
    """Run a Chocolate Bayesian hyperparameter search for one estimator class.

    Each trial fits ``model`` with the sampled parameters on the training
    data and reports the negated validation F1 (see ``f1_score``) as the loss.

    Parameters
    ----------
    model
        Estimator class; its ``__name__`` is used as the database file name.
    param_space : dict
        Chocolate search space for the estimator's keyword arguments.
    X_train, y_train
        Data each candidate is fitted on.
    X_val, y_val
        Data each candidate is scored on.
    clear_db : bool, default False
        When true, the connection is cleared before the search starts.
    n_runs : int, default 10
        Number of trials to run.

    Returns
    -------
    choco.SQLiteConnection
        The connection holding the trial results.
    """
    import os  # local import: only needed to prepare the database directory

    # SQLite does not create missing parent directories, so make sure the
    # folder that holds the per-model databases exists before connecting.
    os.makedirs("chocolate_dbs", exist_ok=True)

    conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % model.__name__)
    if clear_db:
        conn.clear()

    # choco.Random(conn, param_space) can be substituted here for random search.
    searcher = choco.Bayes(conn, param_space)

    for _ in range(n_runs):
        token, params = searcher.next()
        loss = f1_score(X_train, y_train, X_val, y_val, model, **params)
        searcher.update(token, loss)

    return conn

In [15]:
def getResults(conn):
    """Return the stored trials as a DataFrame with ``_loss`` as the first column.

    Parameters
    ----------
    conn : choco.SQLiteConnection or str
        An existing connection, or a database name such as ``'SVC'`` that is
        resolved to ``chocolate_dbs/<name>.db``.

    Returns
    -------
    pandas.DataFrame
        The connection's results, with the ``_loss`` column moved to the front.

    Raises
    ------
    ValueError
        If ``conn`` is neither a connection nor a string, or if the results
        contain no ``_loss`` column.
    """
    if isinstance(conn, str):
        conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % conn)
    elif not isinstance(conn, choco.SQLiteConnection):
        raise ValueError(
            "conn must be a choco.SQLiteConnection or a database name (str), got %s"
            % type(conn).__name__
        )
    df = conn.results_as_dataframe()

    # Reorder columns so the loss is the first thing shown.
    cols = list(df.columns)
    cols.remove('_loss')
    return df[['_loss'] + cols]

In [16]:
def getBestParams(conn):
    """Return the hyperparameters of the lowest-loss trial as a dict.

    Parameters
    ----------
    conn : choco.SQLiteConnection or str
        An existing connection, or a database name such as ``'SVC'`` that is
        resolved to ``chocolate_dbs/<name>.db``.

    Returns
    -------
    dict
        Parameter name -> value for the best trial. Parameters that are
        missing (NaN) for that trial are omitted, and ``_loss`` is excluded.
        Each value keeps the type of its own column, so integer-valued
        parameters are returned as ``int`` rather than ``float``.

    Raises
    ------
    ValueError
        If ``conn`` is neither a connection nor a string.
    """
    if isinstance(conn, str):
        conn = choco.SQLiteConnection(url="sqlite:///chocolate_dbs/%s.db" % conn)
    elif not isinstance(conn, choco.SQLiteConnection):
        raise ValueError(
            "conn must be a choco.SQLiteConnection or a database name (str), got %s"
            % type(conn).__name__
        )
    df = conn.results_as_dataframe()

    # Keep the winning trial as a one-row DataFrame instead of a Series:
    # pulling a row out as a Series forces one common dtype, which turns the
    # integer parameters of an all-numeric table into floats.
    best = df.sort_values('_loss').iloc[[0]].dropna(axis=1).drop(columns='_loss')

    params = {}
    for name in best.columns:
        value = best[name].iloc[0]
        # Hand back plain Python numbers rather than numpy scalars.
        params[name] = value.item() if isinstance(value, np.generic) else value
    return params

# Baseline Test F1 Scores

In [17]:
# Fit every classifier with its default settings and record its F1 on the
# test split. f1_score returns the negated score, hence the leading minus.
baseline_models = (SVC, RandomForestClassifier, GaussianNB, KNeighborsClassifier, MLPClassifier)
svc_f1, rfc_f1, gnb_f1, knn_f1, mlp_f1 = (
    -f1_score(X_train, y_train, X_test, y_test, estimator)
    for estimator in baseline_models
)

In [18]:
# Report the default-settings F1 of each classifier to two decimals.
baseline_scores = (svc_f1, rfc_f1, gnb_f1, knn_f1, mlp_f1)
baseline_report = """
Baseline F1 scores
-----------------------------
SVC: %.2f
RandomForestClassifier: %.2f
GaussianNB: %.2f
KNeighborsClassifier: %.2f
MLPClassifier: %.2f
"""
print(baseline_report % baseline_scores)


Baseline F1 scores
-----------------------------
SVC: 0.79
RandomForestClassifier: 0.74
GaussianNB: 0.76
KNeighborsClassifier: 0.75
MLPClassifier: 0.77



# Chocolate Hyperparameter Search

In [19]:
N_RUNS = 30
CLEAR_DB = True

# Every search is fitted on the training split, scored on the validation
# split, and uses the same number of trials and the same clear-database flag.
search_splits = (X_train, y_train, X_val, y_val)
search_options = dict(n_runs=N_RUNS, clear_db=CLEAR_DB)

svc_db = run_chocolate(SVC, svc_space, *search_splits, **search_options)

rfc_db = run_chocolate(RandomForestClassifier, rfc_space, *search_splits, **search_options)

gnb_db = run_chocolate(GaussianNB, gnb_space, *search_splits, **search_options)

knn_db = run_chocolate(KNeighborsClassifier, knn_space, *search_splits, **search_options)

mlp_db = run_chocolate(MLPClassifier, mlp_space, *search_splits, **search_options)

In [20]:
# Five lowest-loss SVC trials, best first.
getResults('SVC').sort_values(by='_loss').head(n=5)

Unnamed: 0_level_0,_loss,C,gamma,kernel,tol
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
24,-0.831579,0.001,auto,linear,6.7e-05
4,-0.831579,0.004944,auto,sigmoid,8.9e-05
12,-0.831579,0.001,auto,linear,1e-05
6,-0.831579,0.004742,auto,sigmoid,0.009608
10,-0.831579,0.001,auto,linear,0.01


In [21]:
# Five lowest-loss random-forest trials, best first.
getResults('RandomForestClassifier').sort_values(by='_loss').head(n=5)

Unnamed: 0_level_0,_loss,max_depth,min_samples_leaf,n_estimators
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-0.832432,6,8,275
1,-0.832432,4,4,100
18,-0.832432,4,8,25
27,-0.826087,6,8,100
26,-0.826087,4,6,500


In [22]:
# Five lowest-loss Gaussian naive Bayes trials, best first.
getResults('GaussianNB').sort_values(by='_loss').head(n=5)

Unnamed: 0_level_0,_loss,var_smoothing
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.821622,1.002888e-12
27,-0.821622,6.023558e-09
26,-0.821622,5.555348e-08
25,-0.821622,8.58049e-09
24,-0.821622,5.102313e-10


In [23]:
# Five lowest-loss k-nearest-neighbours trials, best first.
getResults('KNeighborsClassifier').sort_values(by='_loss').head(n=5)

Unnamed: 0_level_0,_loss,leaf_size,n_neighbors,p,weights
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23,-0.804348,15,5,1,uniform
7,-0.804348,155,5,1,uniform
15,-0.797814,15,9,3,uniform
28,-0.79558,155,9,1,uniform
22,-0.791209,295,9,3,uniform


In [24]:
# Five lowest-loss MLP trials, best first.
getResults('MLPClassifier').sort_values(by='_loss').head(n=5)

Unnamed: 0_level_0,_loss,activation,hidden_layer_sizes,learning_rate_init
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17,-0.83871,relu,"(3,)",0.099997
28,-0.823529,logistic,"(2, 4)",0.001326
8,-0.822222,relu,"(7, 5)",0.001135
19,-0.810811,relu,"(7, 7)",0.099997
14,-0.80663,relu,"(6, 5)",0.099997


In [25]:
# Hyperparameters of the lowest-loss MLP trial, as a keyword-argument dict.
getBestParams('MLPClassifier')

{'activation': 'relu',
 'hidden_layer_sizes': (3,),
 'learning_rate_init': 0.09999746718847377}

In [26]:
# Re-fit each classifier with its best hyperparameters and score it on the
# test split. f1_score returns the negated score, hence the leading minus.
svc_f1_hpo = -f1_score(X_train, y_train, X_test, y_test, SVC, **getBestParams('SVC'))

# The random-forest settings may come back as floats (e.g. 6.0), and
# scikit-learn expects integers for all three of them, max_depth included.
best_rfc = getBestParams('RandomForestClassifier')
best_rfc.update({
    name: int(best_rfc[name])
    for name in ('max_depth', 'min_samples_leaf', 'n_estimators')
    if name in best_rfc
})
rfc_f1_hpo = -f1_score(X_train, y_train, X_test, y_test, RandomForestClassifier, **best_rfc)

gnb_f1_hpo = -f1_score(X_train, y_train, X_test, y_test, GaussianNB, **getBestParams('GaussianNB'))

knn_f1_hpo = -f1_score(X_train, y_train, X_test, y_test, KNeighborsClassifier, **getBestParams('KNeighborsClassifier'))

mlp_f1_hpo = -f1_score(X_train, y_train, X_test, y_test, MLPClassifier, **getBestParams('MLPClassifier'))

In [27]:
# Show the default-settings scores again, for comparison with the tuned ones.
baseline_scores = (svc_f1, rfc_f1, gnb_f1, knn_f1, mlp_f1)
baseline_report = """
Baseline F1 scores
-----------------------------
SVC: %.2f
RandomForestClassifier: %.2f
GaussianNB: %.2f
KNeighborsClassifier: %.2f
MLPClassifier: %.2f
"""
print(baseline_report % baseline_scores)


Baseline F1 scores
-----------------------------
SVC: 0.79
RandomForestClassifier: 0.74
GaussianNB: 0.76
KNeighborsClassifier: 0.75
MLPClassifier: 0.77



In [28]:
# Report the F1 of each classifier after tuning, to two decimals.
hpo_scores = (svc_f1_hpo, rfc_f1_hpo, gnb_f1_hpo, knn_f1_hpo, mlp_f1_hpo)
hpo_report = """
After Hyperparameter Optimization
----------------------------------
SVC: %.2f
RandomForestClassifier: %.2f
GaussianNB: %.2f
KNeighborsClassifier: %.2f
MLPClassifier: %.2f
"""
print(hpo_report % hpo_scores)


After Hyperparameter Optimization
----------------------------------
SVC: 0.77
RandomForestClassifier: 0.80
GaussianNB: 0.76
KNeighborsClassifier: 0.78
MLPClassifier: 0.79

