In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import lightgbm as lgb
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_digits, fetch_covtype
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score
import itertools
from sklearn.preprocessing import label_binarize

# Data & Parameter


### Digits dataset

In [2]:
digits_data = load_digits()
X = digits_data.data
y = digits_data.target
df = pd.DataFrame(X, columns=digits_data.feature_names)
df['target'] = y
df.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_test_bin = label_binarize(y_test, classes=np.unique(y))

### Kaggle dataset

In [4]:
features = pd.read_csv('application_train.csv')
features = features.sample(n=16000, random_state=42)
features = features.select_dtypes('number')
labels = np.array(features['TARGET'].astype(np.int32)).reshape((-1, ))
features = features.drop(columns=['TARGET', 'SK_ID_CURR'])
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=6000, random_state=50)
features.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
245895,2,207000.0,465457.5,52641.0,418500.0,0.00963,-13297,-762,-637.0,-4307,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
98194,0,247500.0,1281712.5,48946.5,1179000.0,0.006852,-14778,-1141,-1610.0,-4546,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0
36463,0,202500.0,495000.0,39109.5,495000.0,0.035792,-17907,-639,-2507.0,-1461,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0
249923,0,247500.0,254700.0,24939.0,225000.0,0.04622,-19626,-6982,-11167.0,-3158,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
158389,0,112500.0,308133.0,15862.5,234000.0,0.01885,-20327,-1105,-7299.0,-494,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0


### Search Space

In [None]:
param_distributions_dt = {
    'max_depth': list(range(3, 16)),
    'min_samples_split': list(range(2, 21)),
    'min_samples_leaf': list(range(1, 11)),
    'criterion': ['gini', 'entropy']
}

param_distributions_lgb = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': (0.005, 0.5),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1),
    'colsample_bytree': (0.6, 1),
    'subsample': (0.5, 1),
    'is_unbalance': [True, False]
}

param_distributions_svm = {
    'C': (0.0001, 1000),
    'gamma': (0.000001, 10),
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'class_weight': [None, 'balanced'],
    'shrinking': [True, False],
    'tol': (0.00001, 0.01),
    'probability': [True, False]
}

param_distributions_obj = {
    "x1": (0, 10),
    "x2": (0, 10),
    "x3": [1,2,3],
}

# Random Search

In [None]:
# digits - decision tree
n_iter_search = 20
dt_results = []

dt_clf = DecisionTreeClassifier(random_state=42)

dt_clf.fit(X_train, y_train)
y_pred_default = dt_clf.predict(X_test)
y_pred_default_proba = dt_clf.predict_proba(X_test)
default_accuracy = accuracy_score(y_test, y_pred_default)
default_roc_auc = roc_auc_score(y_test_bin, y_pred_default_proba, average='weighted', multi_class='ovr')

best_accuracy = 0
best_roc_auc = 0

best_params = {}

for i in range(n_iter_search):
    sampled_params = {
        'max_depth': random.choice(param_distributions_dt['max_depth']),
        'min_samples_split': random.choice(param_distributions_dt['min_samples_split']),
        'min_samples_leaf': random.choice(param_distributions_dt['min_samples_leaf']),
        'criterion': random.choice(param_distributions_dt['criterion']),
    }

    dt_clf = DecisionTreeClassifier(
        max_depth=sampled_params['max_depth'],
        min_samples_split=sampled_params['min_samples_split'],
        min_samples_leaf=sampled_params['min_samples_leaf'],
        criterion=sampled_params['criterion'],
        random_state=42
    )

    dt_clf.fit(X_train, y_train)
    y_pred = dt_clf.predict(X_test)
    y_pred_proba = dt_clf.predict_proba(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test_bin, y_pred_proba, average='weighted', multi_class='ovr')
    # print(i, best_accuracy)

    if accuracy > best_accuracy or (accuracy == best_accuracy and roc_auc > best_roc_auc):
        best_accuracy = accuracy
        best_roc_auc = roc_auc
        best_params = sampled_params

    print(i, best_accuracy)

# Store results
dt_results.append({
    'dataset': 'digits',
    'best_params': best_params,
    'accuracy': best_accuracy,
    'roc_auc': best_roc_auc,
    'default_accuracy': default_accuracy,
    'default_roc_auc': default_roc_auc,
    'n_iter': n_iter_search
})

# Display the results for Decision Tree Randomized Search
print("\nDecisionTreeClassifier Manual Randomized Search Results:")
for result in dt_results:
    print(f"Dataset: {result['dataset']}")
    print(f"Best Hyperparameters: {result['best_params']}")
    print(f"Tuned Accuracy: {result['accuracy']}")
    print(f"Tuned ROC AUC: {result['roc_auc']}")
    print(f"Default Accuracy: {result['default_accuracy']}")
    print(f"Default ROC AUC: {result['default_roc_auc']}")
    print(f"Number of Iterations: {result['n_iter']}")
    print()

0 0.8305555555555556
1 0.8305555555555556
2 0.8305555555555556
3 0.8722222222222222
4 0.8722222222222222
5 0.8722222222222222
6 0.8722222222222222
7 0.8722222222222222
8 0.8722222222222222
9 0.875
10 0.875
11 0.875
12 0.875
13 0.875
14 0.875
15 0.875
16 0.875
17 0.875
18 0.875
19 0.875

DecisionTreeClassifier Manual Randomized Search Results:
Dataset: digits
Best Hyperparameters: {'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 5, 'criterion': 'entropy'}
Tuned Accuracy: 0.875
Tuned ROC AUC: 0.961669239493184
Default Accuracy: 0.8416666666666667
Default ROC AUC: 0.9117326702800151
Number of Iterations: 20



DecisionTreeClassifier Manual Randomized Search Results:\
Dataset: digits\
Best Hyperparameters: {'max_depth': 13, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy'}\
Tuned Accuracy: 0.8888888888888888\
Tuned ROC AUC: 0.9434490095095995\
Default Accuracy: 0.8416666666666667\
Default ROC AUC: 0.9117326702800151\
Number of Iterations: 100

In [None]:
# kaggel dataset - decision tree
n_iter_search = 100
dt_results = []

dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(train_features, train_labels)

y_pred_default = dt_clf.predict(test_features)
y_pred_default_proba = dt_clf.predict_proba(test_features)[:, 1]
default_accuracy = accuracy_score(test_labels, y_pred_default)
default_roc_auc = roc_auc_score(test_labels, y_pred_default_proba)

best_accuracy = 0
best_roc_auc = 0
best_params = {}

for i in range(n_iter_search):
    sampled_params = {
        'max_depth': random.choice(param_distributions_dt['max_depth']),
        'min_samples_split': random.choice(param_distributions_dt['min_samples_split']),
        'min_samples_leaf': random.choice(param_distributions_dt['min_samples_leaf']),
        'criterion': random.choice(param_distributions_dt['criterion']),
    }

    dt_clf = DecisionTreeClassifier(
        max_depth=sampled_params['max_depth'],
        min_samples_split=sampled_params['min_samples_split'],
        min_samples_leaf=sampled_params['min_samples_leaf'],
        criterion=sampled_params['criterion'],
        random_state=42
    )

    dt_clf.fit(train_features, train_labels)
    y_pred = dt_clf.predict(test_features)
    y_pred_proba = dt_clf.predict_proba(test_features)[:, 1]
    accuracy = accuracy_score(test_labels, y_pred)
    roc_auc = roc_auc_score(test_labels, y_pred_proba)

    if accuracy > best_accuracy or (accuracy == best_accuracy and roc_auc > best_roc_auc):
        best_accuracy = accuracy
        best_roc_auc = roc_auc
        best_params = sampled_params

# Store results
dt_results.append({
    'dataset': 'application_train.csv',
    'best_params': best_params,
    'accuracy': best_accuracy,
    'roc_auc': best_roc_auc,
    'default_accuracy': default_accuracy,
    'default_roc_auc': default_roc_auc,
    'n_iter': n_iter_search
})

# Display the results for Decision Tree Randomized Search
print("\nDecisionTreeClassifier Manual Randomized Search Results:")
for result in dt_results:
    print(f"Dataset: {result['dataset']}")
    print(f"Best Hyperparameters: {result['best_params']}")
    print(f"Tuned Accuracy: {result['accuracy']}")
    print(f"Tuned ROC AUC: {result['roc_auc']}")
    print(f"Default Accuracy: {result['default_accuracy']}")
    print(f"Default ROC AUC: {result['default_roc_auc']}")
    print(f"Number of Iterations: {result['n_iter']}")
    print()


DecisionTreeClassifier Manual Randomized Search Results:
Dataset: application_train.csv
Best Hyperparameters: {'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Tuned Accuracy: 0.919
Tuned ROC AUC: 0.6923243267044903
Default Accuracy: 0.8478333333333333
Default ROC AUC: 0.5335162571591058
Number of Iterations: 100



DecisionTreeClassifier Manual Randomized Search Results:\
Dataset: application_train.csv\
Best Hyperparameters: {'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}\
Tuned Accuracy: 0.919\
Tuned ROC AUC: 0.6923243267044903\
Default Accuracy: 0.8478333333333333\
Default ROC AUC: 0.5335162571591058\
Number of Iterations: 100


In [None]:
# kaggle dataset - lgb
n_iter_search = 100
lgb_results = []

lgb_clf = lgb.LGBMClassifier(random_state=42, force_col_wise=True, verbose=-1)

lgb_clf.fit(train_features, train_labels)
y_pred_default = lgb_clf.predict(test_features)
default_accuracy = accuracy_score(test_labels, y_pred_default)
default_roc_auc = roc_auc_score(test_labels, lgb_clf.predict_proba(test_features)[:, 1])

best_accuracy = 0
best_roc_auc = 0
best_params = {}

for i in range(n_iter_search):
    sampled_params = {
        'boosting_type': random.choice(param_distributions_lgb['boosting_type']),
        'num_leaves': random.choice(param_distributions_lgb['num_leaves']),
        'learning_rate': random.uniform(*param_distributions_lgb['learning_rate']),
        'subsample_for_bin': random.choice(param_distributions_lgb['subsample_for_bin']),
        'min_child_samples': random.choice(param_distributions_lgb['min_child_samples']),
        'reg_alpha': random.uniform(*param_distributions_lgb['reg_alpha']),
        'reg_lambda': random.uniform(*param_distributions_lgb['reg_lambda']),
        'colsample_bytree': random.uniform(*param_distributions_lgb['colsample_bytree']),
        'is_unbalance': random.choice(param_distributions_lgb['is_unbalance'])
    }

    # if sampled_params['boosting_type'] == 'goss':
    #     sampled_params['subsample'] = 1.0
    # else:
    #     sampled_params['subsample'] = random.choice(param_grid['subsample'])

    lgb_clf = lgb.LGBMClassifier(
        boosting_type=sampled_params['boosting_type'],
        num_leaves=sampled_params['num_leaves'],
        learning_rate=sampled_params['learning_rate'],
        subsample_for_bin=sampled_params['subsample_for_bin'],
        min_child_samples=sampled_params['min_child_samples'],
        reg_alpha=sampled_params['reg_alpha'],
        reg_lambda=sampled_params['reg_lambda'],
        colsample_bytree=sampled_params['colsample_bytree'],
        # subsample=sampled_params['subsample'],
        is_unbalance=sampled_params['is_unbalance'],
        random_state=42,
        force_col_wise=True,
        verbose=-1
    )

    lgb_clf.fit(train_features, train_labels)
    y_pred = lgb_clf.predict(test_features)
    accuracy = accuracy_score(test_labels, y_pred)
    roc_auc = roc_auc_score(test_labels, lgb_clf.predict_proba(test_features)[:, 1])

    if accuracy > best_accuracy or (accuracy == best_accuracy and roc_auc > best_roc_auc):
        best_accuracy = accuracy
        best_roc_auc = roc_auc
        best_params = sampled_params

lgb_results.append({
    'dataset': 'application_train.csv',
    'best_params': best_params,
    'accuracy': best_accuracy,
    'roc_auc': best_roc_auc,
    'default_accuracy': default_accuracy,
    'default_roc_auc': default_roc_auc,
    'n_iter': n_iter_search
})

# Display the results for LightGBM Randomized Search
print("\nLightGBM Manual Randomized Search Results:")
for result in lgb_results:
    print(f"Dataset: {result['dataset']}")
    print(f"Best Hyperparameters: {result['best_params']}")
    print(f"Tuned Accuracy: {result['accuracy']}")
    print(f"Tuned ROC AUC: {result['roc_auc']}")
    print(f"Default Accuracy: {result['default_accuracy']}")
    print(f"Default ROC AUC: {result['default_roc_auc']}")
    print(f"Number of Iterations: {result['n_iter']}")
    print()


LightGBM Manual Randomized Search Results:
Dataset: application_train.csv
Best Hyperparameters: {'boosting_type': 'gbdt', 'num_leaves': 35, 'learning_rate': 0.02210978994109018, 'subsample_for_bin': 40000, 'min_child_samples': 455, 'reg_alpha': 0.8680259780481667, 'reg_lambda': 0.2541190497499861, 'colsample_bytree': 0.7688300660703605, 'is_unbalance': False}
Tuned Accuracy: 0.919
Tuned ROC AUC: 0.7409795641770816
Default Accuracy: 0.916
Default ROC AUC: 0.7119893842982546
Number of Iterations: 100



LightGBM Manual Randomized Search Results:\
Dataset: application_train.csv\
Best Hyperparameters: {'boosting_type': 'gbdt', 'num_leaves': 35, 'learning_rate': 0.02210978994109018, 'subsample_for_bin': 40000, 'min_child_samples': 455, 'reg_alpha': 0.8680259780481667, 'reg_lambda': 0.2541190497499861, 'colsample_bytree': 0.7688300660703605, 'is_unbalance': False}\
Tuned Accuracy: 0.919\
Tuned ROC AUC: 0.7409795641770816\
Default Accuracy: 0.916\
Default ROC AUC: 0.7119893842982546\
Number of Iterations: 100

In [None]:
param_distributions_obj = {
    "x1": (0, 10),
    "x2": (0, 10),
    "x3": [1,2,3],
}

def objective_function(params):
    x1, x2, x3 = params['x1'], params['x2'], params['x3']
    return -((x1 - 3)**2 + (x2 - 5)**2 - x3 + np.random.normal(0, 0.1))

n_iter = 20
all_params = []
all_results = []

best_params = None
best_value = float('inf')

for i in range(n_iter):
    sampled_params = {
        'x1': random.uniform(0, 10),
        'x2': random.uniform(0, 10),
        'x3': random.choice([1, 2, 3])
    }

    value = objective_function(sampled_params)

    all_params.append(sampled_params)
    all_results.append(value)

    if value < best_value:
        best_value = value
        best_params = sampled_params

for i in range(n_iter):
    print(f"Iteration {i+1}: Params = {all_params[i]}, Objective Value = {all_results[i]}")


Iteration 1: Params = {'x1': 8.014669648298035, 'x2': 1.4997416921347784, 'x3': 2}, Objective Value = -35.39358931490635
Iteration 2: Params = {'x1': 8.527242568781134, 'x2': 2.5260501283287775, 'x3': 1}, Objective Value = -35.764081344819395
Iteration 3: Params = {'x1': 6.7358984612174595, 'x2': 0.6000051146780694, 'x3': 1}, Objective Value = -32.477607180753424
Iteration 4: Params = {'x1': 3.050611366766721, 'x2': 4.368568072912441, 'x3': 1}, Objective Value = 0.6390531898387992
Iteration 5: Params = {'x1': 5.607597671725423, 'x2': 1.4520640004443652, 'x3': 3}, Objective Value = -16.387794883484492
Iteration 6: Params = {'x1': 8.997413325565557, 'x2': 9.152722120966654, 'x3': 2}, Objective Value = -51.131505659550974
Iteration 7: Params = {'x1': 0.06916840400217006, 'x2': 0.5894038049574968, 'x3': 2}, Objective Value = -26.046825652228225
Iteration 8: Params = {'x1': 3.4972472248647737, 'x2': 8.994807021731011, 'x3': 3}, Objective Value = -13.14529466789477
Iteration 9: Params = {'x1

# Grid Search

In [None]:
# digits - decision tree
keys, values = zip(*param_distributions_dt.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

dt_results = []

dt_clf = DecisionTreeClassifier(random_state=42)

dt_clf.fit(X_train, y_train)
y_pred_default = dt_clf.predict(X_test)
y_pred_default_proba = dt_clf.predict_proba(X_test)
default_accuracy = accuracy_score(y_test, y_pred_default)
default_roc_auc = roc_auc_score(y_test_bin, y_pred_default_proba, average='weighted', multi_class='ovr')

best_accuracy = 0
best_roc_auc = 0
best_params = {}

for params in param_combinations:
    dt_clf = DecisionTreeClassifier(
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        criterion=params['criterion'],
        random_state=42
    )

    dt_clf.fit(X_train, y_train)
    y_pred = dt_clf.predict(X_test)
    y_pred_proba = dt_clf.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test_bin, y_pred_proba, average='weighted', multi_class='ovr')

    if accuracy > best_accuracy or (accuracy == best_accuracy and roc_auc > best_roc_auc):
        best_accuracy = accuracy
        best_roc_auc = roc_auc
        best_params = params

dt_results.append({
    'dataset': 'digits',
    'best_params': best_params,
    'accuracy': best_accuracy,
    'roc_auc': best_roc_auc,
    'default_accuracy': default_accuracy,
    'default_roc_auc': default_roc_auc,
    'n_iter': len(param_combinations)
})

print("\nDecisionTreeClassifier Manual Grid Search Results:")
for result in dt_results:
    print(f"Dataset: {result['dataset']}")
    print(f"Best Hyperparameters: {result['best_params']}")
    print(f"Tuned Accuracy: {result['accuracy']}")
    print(f"Tuned ROC AUC: {result['roc_auc']}")
    print(f"Default Accuracy: {result['default_accuracy']}")
    print(f"Default ROC AUC: {result['default_roc_auc']}")
    print(f"Number of Iterations: {result['n_iter']}")
    print()

DecisionTreeClassifier Manual Grid Search Results:\
Dataset: digits\
Best Hyperparameters: {'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'entropy'}\
Tuned Accuracy: 0.8944444444444445\
Tuned ROC AUC: 0.9447431743279776\
Default Accuracy: 0.8416666666666667\
Default ROC AUC: 0.9117326702800151\
Number of Iterations: 4940

# Bayesian Optimization

In [None]:
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
class bayesian_optimization():
    def __init__(self, objective, space, iterations=30, explore='ei', min_start_size=10):
        self.objective = objective
        self.space = space
        self.feature_name = list(space.keys())
        self.iterations = iterations
        self.model = GaussianProcessRegressor(kernel=RBF(), alpha=1e-6, normalize_y=True)
        self.explore = self.exploration_strategy(explore)
        self.min_start_size = min_start_size
        self.X = []
        self.Y = []
        self.preprocess()

    def preprocess(self):
        self.feature_num = len(self.space)
        self.feature_type = {}
        for feature, value in self.space.items():
            if isinstance(value, list):
                self.feature_type[feature] = 'categorical'
            else:
                assert(isinstance(value, tuple))
                self.feature_type[feature] = 'continuous'

    def feature_dict_to_array(self, feature_dicts):
        # print(feature_dicts)
        feature_array = np.zeros((len(feature_dicts), self.feature_num))
        for i, feature in enumerate(self.feature_name):
            for j, feature_dict in enumerate(feature_dicts):
                feature_array[j, i] = feature_dict[feature]
        return feature_array

    def feature_array_to_dict(self, feature_array):
        feature_dict = []
        for i in range(feature_array.shape[0]):
            feature_dict.append(dict(zip(self.feature_name, feature_array[i])))
        return feature_dict


    def random_sample(self, sample_size=5):
        from tqdm import tqdm
        candidates = {}
        # for feature, value in tqdm(self.space.items()):
        for feature, value in self.space.items():
            if self.feature_type[feature] == 'categorical':
                candidates[feature] = np.random.choice(value, sample_size)
            else:
                assert(self.feature_type[feature] == 'continuous')
                if value[1] > 1:
                    candidates[feature] = np.random.randint(value[0], value[1], sample_size)
                else:
                    candidates[feature] = np.random.uniform(value[0], value[1], sample_size)
        ret = []
        for i in range(sample_size):
            ret.append({key: candidates[key][i] for key in candidates})
        return ret

    def exploration_strategy(self, explore='ei'):
        if explore == 'ei':
            def ei(x, gp, y_min):
                x = np.array(x)
                mu, sigma = gp.predict(x, return_std=True)
                sigma += 1e-9
                improvement = y_min - mu
                z = np.array(improvement) / np.array(sigma)
                return improvement * norm.cdf(z) + sigma * norm.pdf(z)
            return ei
        elif explore == 'greedy':
            def greedy(x, gp, _):
                mu, _ = gp.predict(x, return_std=True)
                return -mu
            return greedy
        elif explore == 'ucb':
            def ucb(x, gp, _):
                mu, sigma = gp.predict(x, return_std=True)
                return mu + 1.96 * sigma
            return ucb
        else:
            raise ValueError("Unsupported exploration strategy. Use 'ei', 'greedy', or 'ucb'.")

    def get_next_point(self, sample_size=5, return_size=1):
        candidates = self.random_sample(sample_size)
        candidates = self.feature_dict_to_array(candidates)
        acq_values = self.explore(candidates, self.model, min(self.Y))
        best_indices = np.argsort(acq_values)[-return_size:]
        return candidates[best_indices]

    def update_model(self):
        self.model.fit(self.feature_dict_to_array(self.X), np.array(self.Y))

    def run(self):
        from tqdm import tqdm
        for i in tqdm(range(self.iterations)):
            # print(f"Iteration {i+1}/{self.iterations}")
            if len(self.X) < self.min_start_size:
                x_next = self.random_sample(self.min_start_size)
            else:
                x_next = self.get_next_point()
                x_next = self.feature_array_to_dict(x_next)

            y_next = []
            for x in x_next:
                y_next.append(self.objective(x))
            # print(y_next)
            self.X.extend(x_next)
            self.Y.extend(y_next)
            self.update_model()
            print(f"iter{i+1}, Best value so far: {min(self.Y)}")

        best_idx = np.argmin(self.Y)
        return {"best_x": self.X[best_idx], "best_y": self.Y[best_idx]}


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_digits, fetch_covtype
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
import itertools

from sklearn.svm import SVC

param_distributions_svm = {
    'C': (0.0001, 1000),
    'gamma': (0.000001, 10),
    'kernel': [0, 1, 2],
    'class_weight': [0, 1],
    'shrinking': [0, 1],
    'tol': (0.00001, 0.01),
    'probability': [0, 1]
}

# from sklearn.datasets import fetch_20newsgroups
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

kernel_array = ['poly', 'rbf', 'sigmoid']
class_weight_array = [None, 'balanced']
shrinking_array = [True, False]
probability_array = [True, False]


def objective_svm(params):
    params2 = params.copy()
    params2['kernel'] = kernel_array[int(params['kernel'])]
    params2['C'] = float(params['C'])
    params2['class_weight'] = class_weight_array[int(params['class_weight'])]
    params2['shrinking'] = shrinking_array[int(params['shrinking'])]
    params2['probability'] = probability_array[int(params['probability'])]

    model = SVC(**params2)
    # model.fit(train_features, train_labels)
    # y_pred = model.predict(test_features)
    # accuracy = accuracy_score(test_labels, y_pred)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy

bo_svm = bayesian_optimization(
    objective=objective_svm,
    space=param_distributions_svm,
    iterations=25,
    explore='ei',
    min_start_size=1,
)

result_svm = bo_svm.run()
print("SVM最佳超参数配置:", result_svm["best_x"])
print("SVM最佳目标值:", result_svm["best_y"])


  0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
param_distributions_lgb = {
    # 'boosting_type': ['gbdt', 'goss', 'dart'],
    'boosting_type': [0, 1, 2],
    'num_leaves': (20, 150),
    'learning_rate': (0.005, 0.5),
    'subsample_for_bin': (20000, 300000),
    'min_child_samples': (20, 500),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1),
    'colsample_bytree': (0.6, 1),
    'subsample': (0.5, 1),
    # 'is_unbalance': [True, False]
    'is_unbalance': [0, 1]
}
type_array = ['gbdt', 'goss', 'dart']
unbalance_array = [True, False]
def objective_lgb(params):
    params2 = params.copy()
    params2['boosting_type'] = type_array[int(params['boosting_type'])]
    params2['is_unbalance'] = unbalance_array[int(params['is_unbalance'])]
    params2['num_leaves'] = int(params['num_leaves'])
    params2['subsample_for_bin'] = int(params['subsample_for_bin'])
    params2['min_child_samples'] = int(params['min_child_samples'])
    model = lgb.LGBMClassifier(**params2, random_state=42, force_col_wise=True, verbose=-1)
    model.fit(train_features, train_labels)
    y_pred = model.predict(test_features)
    return -accuracy_score(test_labels, y_pred)

bo = bayesian_optimization(
    objective=objective_lgb,
    space=param_distributions_lgb,
    iterations=25,
    explore='ei',
    min_start_size=5,
)

result = bo.run()

# 打印结果
print("最佳超参数配置:", result["best_x"])
print("最佳目标值:", result["best_y"])

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


best config: [2.97526837 5.03082013]
best result: -9.02749861465481e-05


In [None]:
# digits - decision tree

In [None]:
# kaggle dataset - decision tree

In [None]:
# kaggle dataset - gbm

# Q-Learning

In [None]:
class q_learning():
    def __init__(self, alpha, gamma, epsilon, proportional_factor, max_iterations,
                 space, model, X_train, y_train, X_test, y_test):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.proportional_factor = proportional_factor
        self.max_iterations = max_iterations
        self.space = space
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.actions = {key: ["increase", "stay", "decrease"] for key in self.space.keys()}

    def state_selection(self):
        state = {}
        for key, values in self.space.items():
            if type(values) == list:
                state[key] = random.choice(values)
            else:
                state[key] = random.uniform(values[0], values[1])
        return state

    def exploration_strategy(self, state):
        action_set = {}
        for key in self.space.keys():
            if random.uniform(0, 1) < self.epsilon:
                action_set[key] = random.choice(self.actions[key])
            else:
                q_values = self.q_table.get(tuple(state.items()), {}).get(key, {})
                action_set[key] = max(q_values, key=q_values.get, default="stay")
            self.epsilon *= 0.999
        return action_set

    def apply_action(self, state, action_set):
        new_state = state.copy()

        for key, action in action_set.items():
            if type(self.space[key]) == list:
                value_list = self.space[key]
                current_index = value_list.index(new_state[key])

                if action == "increase":
                    new_index = min(current_index + 1, len(value_list) - 1)
                elif action == "decrease":
                    new_index = max(current_index - 1, 0)
                else:
                    new_index = current_index

                new_state[key] = value_list[new_index]

            else:
                min_val, max_val = self.space[key]
                current_value = new_state[key]
                self.proportional_factor *= 0.99

                if action == "increase":
                    new_state[key] = min((1 + self.proportional_factor) * current_value, max_val)
                elif action == "decrease":
                    new_state[key] = max((1 - self.proportional_factor) * current_value, min_val)
                else:
                    new_state[key] = current_value
        return new_state

    def get_reward(self, state):
        params = {k: v for k, v in state.items()}

        if self.model == "dt":
            model = DecisionTreeClassifier(**params, random_state=42)
            model.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)
            return accuracy_score(self.y_test, y_pred)
        else:
            return -((state['x1'] - 3)**2 + (state['x2'] - 5)**2  - (state['x3']) + np.random.normal(0, 0.1))

    def update_q_table(self, state, action_set, reward, new_state):
        state_key = tuple(state.items())
        new_state_key = tuple(new_state.items())

        if state_key not in self.q_table:
            self.q_table[state_key] = {}
        for key, action in action_set.items():
            if key not in self.q_table[state_key]:
                self.q_table[state_key][key] = {}
            old_q_value = self.q_table[state_key][key].get(action, 0)
            future_q_value = max(
                self.q_table.get(new_state_key, {}).get(key, {}).values(), default=0
            )
            new_q_value = old_q_value + self.alpha * (reward + self.gamma * future_q_value - old_q_value)
            self.q_table[state_key][key][action] = new_q_value

    def run(self):
        for iteration in range(self.max_iterations):
            state = {key: random.choice(values) for key, values in self.space.items()}

            for _ in range(100):
                action_set = self.exploration_strategy(state)
                new_state = self.apply_action(state, action_set)
                reward = self.get_reward(new_state)
                self.update_q_table(state, action_set, reward, new_state)
                state = new_state
        print(len(self.q_table))

    def get_best_hyperparameters(self):
        self.run()
        best_state = max(
            (s for s in self.q_table if self.q_table[s]),
            key=lambda s: sum(max(self.q_table[s][k].values(), default=0) for k in self.q_table[s]),
            default=None
        )
        return dict(best_state)

In [None]:
# obj function
params = []
results = []
for _ in range(10):
    ql = q_learning(
        alpha = 0.1,
        gamma = 0.9,
        epsilon = 0.9,
        proportional_factor = 0.1,
        max_iterations = 20,
        space = param_distributions_obj,
        model = "obj",
        X_train = X_train,
        y_train = y_train,
        X_test = X_test,
        y_test = y_test
    )
    result = ql.get_best_hyperparameters()
    loss = -((result['x1'] - 3)**2 + (result['x2'] - 5)**2  - (result['x3']) + np.random.normal(0, 0.1))

    params.append(result)
    results.append(loss)

print(params)
print(results)
print(max(results))

251
307
293
193
173
310
242
297
186
208
[{'x1': 0.0, 'x2': 9.660738000321029, 'x3': 2}, {'x1': 0.0, 'x2': 6.2129964532848785, 'x3': 3}, {'x1': 4.529635585505561, 'x2': 6.76942991422106, 'x3': 3}, {'x1': 0.0, 'x2': 9.999999999682311, 'x3': 2}, {'x1': 0.0, 'x2': 8.31837142031841, 'x3': 2}, {'x1': 0.0, 'x2': 9.844845852239574, 'x3': 2}, {'x1': 0.0, 'x2': 7.0780149854224605, 'x3': 3}, {'x1': 7.998077289440447, 'x2': 5.03982081954357, 'x3': 2}, {'x1': 0.0, 'x2': 9.999999998209683, 'x3': 2}, {'x1': 0.0, 'x2': 9.343649606355054, 'x3': 3}]
[-28.655266166704706, -7.360572355462452, -2.496523487789181, -32.05893780729479, -18.104293723430466, -30.451764081070646, -10.274507617817937, -22.90105920367276, -31.94459964484592, -24.75964627251292]
-2.496523487789181


In [None]:
dt_result = [0.8722222222222222, 0.8694444444444445, 0.8694444444444445, 0.8722222222222222, 0.875, 0.8888888888888888, 0.8888888888888888, 0.875, 0.8361111111111111, 0.8777777777777778]

mean = np.mean(dt_result)
minimum = np.min(dt_result)
maximum = np.max(dt_result)
std_dev = np.std(dt_result)

print("Mean:", mean)
print("Minimum:", minimum)
print("Maximum:", maximum)
print("Standard Deviation:", std_dev)

Mean: 0.8724999999999999
Minimum: 0.8361111111111111
Maximum: 0.8888888888888888
Standard Deviation: 0.013858299648073781


In [None]:
obj_result = [-28.655266166704706, -7.360572355462452, -2.496523487789181, -32.05893780729479, -18.104293723430466, -30.451764081070646, -10.274507617817937, -22.90105920367276, -31.94459964484592, -24.75964627251292]
mean = np.mean(obj_result)
minimum = np.min(obj_result)
maximum = np.max(obj_result)
std_dev = np.std(obj_result)

print("Mean:", mean)
print("Minimum:", minimum)
print("Maximum:", maximum)
print("Standard Deviation:", std_dev)

Mean: -20.90071703606018
Minimum: -32.05893780729479
Maximum: -2.496523487789181
Standard Deviation: 10.29323873933181


In [None]:
params = {
    'max_depth': 3,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'criterion': 'gini'
}

model = DecisionTreeClassifier(**params, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.4722222222222222
