In [1]:
!pip install -U scikit-learn xgboost albumentations bayesian-optimization

Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 9.8 kB/s 
Collecting albumentations
  Downloading albumentations-1.1.0-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 51.1 MB/s 
[?25hCollecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Collecting opencv-python-headless>=4.1.1
  Downloading opencv_python_headless-4.5.4.60-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.6 MB)
[K     |████████████████████████████████| 47.6 MB 1.4 MB/s 
Collecting qudida>=0.0.4
  Downloading qudida-0.0.4-py3-none-any.whl (3.5 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=ad13a95d1620274730c9bd5f64341d1991bae596b9e1a92da5c95227

In [3]:
import pandas as pd
import numpy as np
import os
import json
import pickle

import matplotlib.pyplot as plt

from sklearn.model_selection import ParameterSampler, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import sys
sys.path.append('./drive/MyDrive/spotify/notebooks/')
from utils import *

# Directory holding the pickled feature / label matrices.
ROOT_PATH = "./drive/MyDrive/spotify/training/loud/10000/7/genres/"

# Column permutation for the label matrices: source column i is written to
# destination column class_order[i] by the loading cells below.
class_order = [
    5, 20, 6, 30, 25,
    26, 0, 1, 2, 27,
    21, 15, 16, 17, 3,
    22, 31, 7, 10, 11,
    12, 13, 8, 23, 32,
    28, 14, 29, 4, 18,
    33, 9, 24, 34, 19,
]

In [8]:
# Seeded generator used for the row shuffles of the train and test matrices.
rng = np.random.RandomState(seed=42)

In [9]:
# Load the pickled training features and (un-permuted) labels.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open(ROOT_PATH + "X_train_all.pkl", 'rb') as f:
    X_train_all = pickle.load(f)
with open(ROOT_PATH + "y_train_all.pkl", 'rb') as f:
    y_train_all_u = pickle.load(f)

# Permute the label columns: source column i lands in column class_order[i].
y_train_all = np.empty(y_train_all_u.shape)
y_train_all[:, class_order] = y_train_all_u[:, :len(class_order)]

# Shuffle features and labels together by stacking them side by side,
# shuffling the rows in place, then splitting off the last 35 label columns.
Xy_train_all = np.hstack((X_train_all, y_train_all))
rng.shuffle(Xy_train_all)
X_train_all, y_train_all = Xy_train_all[:, :-35], Xy_train_all[:, -35:]

In [10]:
# Load the pickled test features and (un-permuted) labels.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open(ROOT_PATH + "X_test_all.pkl", 'rb') as f:
    X_test_all = pickle.load(f)
with open(ROOT_PATH + "y_test_all.pkl", 'rb') as f:
    y_test_all_u = pickle.load(f)

# Permute the label columns: source column i lands in column class_order[i].
y_test_all = np.empty(y_test_all_u.shape)
y_test_all[:, class_order] = y_test_all_u[:, :len(class_order)]

# Shuffle features and labels together by stacking them side by side,
# shuffling the rows in place, then splitting off the last 35 label columns.
Xy_test_all = np.hstack((X_test_all, y_test_all))
rng.shuffle(Xy_test_all)
X_test_all, y_test_all = Xy_test_all[:, :-35], Xy_test_all[:, -35:]

In [12]:
# dtrains[k][l]: training DMatrix for label l (0..4) of label group k (0..6).
# Group k covers label columns 5k .. 5k+4.
dtrains = []
for k in range(7):
    group_cols = slice(k * 5, (k + 1) * 5)

    # Keep only the rows that have at least one positive label in this group.
    idx_train = (y_train_all[:, group_cols] == 1).any(axis=1)
    X_train = X_train_all[idx_train]
    y_train = y_train_all[idx_train, group_cols]

    # One binary (label == 1) target per column of the group.
    dtrains.append([
        xgb.DMatrix(data=X_train, label=y_train[:, l] == 1)
        for l in range(5)
    ])

In [11]:
# dtests[k][l]: test DMatrix for label l (0..4) of label group k (0..6).
# Group k covers label columns 5k .. 5k+4.
dtests = []
for k in range(7):
    group_cols = slice(k * 5, (k + 1) * 5)

    # Keep only the rows that have at least one positive label in this group.
    idx_test = (y_test_all[:, group_cols] == 1).any(axis=1)
    X_test = X_test_all[idx_test]
    y_test = y_test_all[idx_test, group_cols]

    # One binary (label == 1) target per column of the group.
    dtests.append([
        xgb.DMatrix(data=X_test, label=y_test[:, l] == 1)
        for l in range(5)
    ])

In [13]:
# evals[k][l]: watch-list for xgb.train, pairing the train and test matrices
# of the same (k, l) classifier. 'test' comes last in each list.
evals = [
    [[(dtrain, 'train'), (dtest, 'test')] for dtrain, dtest in zip(train_row, test_row)]
    for train_row, test_row in zip(dtrains, dtests)
]

In [14]:
# Booster settings shared by every run; the tuned values are merged on top.
fixed_params = {
    'tree_method': 'gpu_hist',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}

# Names of the tuned hyper-parameters.
param_cols = [
    'colsample_bytree',
    'gamma',
    'learning_rate',
    'max_depth',
    'min_child_weight',
    'subsample',
]

In [22]:
# Re-create the generator with the original seed and discard 30 draws.
# NOTE(review): presumably this fast-forwards the stream past the seeds that
# earlier runs of the optimisation loop consumed (one draw per (k, l) pair,
# 6 groups x 5 labels = 30, and that loop now starts at k = 6) - confirm
# before changing either cell.
rng = np.random.RandomState(42)
for _ in range(30):
    rng.randint(1e6)

In [23]:
# (low, high) search bounds for every hyper-parameter the optimiser tunes.
param_grid = {
    'learning_rate': (0.05, 0.2),
    'max_depth': (2, 12),
    'min_child_weight': (1, 8),
    'colsample_bytree': (0.4, 1.0),
    'gamma': (0, 10),
    'subsample': (0.4, 1),
}

for k in range(6, 7):
    for l in range(5):
        def xgb_cv_wrapper(colsample_bytree, gamma, learning_rate, max_depth, min_child_weight, subsample):
            """Objective for the optimiser: the negated result of xgboost_cv_single.

            Reads the current loop values of k and l to select the training
            matrix. max_depth is truncated to an int; the remaining values are
            passed through unchanged on top of fixed_params.
            """
            params = dict(
                fixed_params,
                colsample_bytree=colsample_bytree,
                gamma=gamma,
                learning_rate=learning_rate,
                max_depth=int(max_depth),
                min_child_weight=min_child_weight,
                subsample=subsample,
            )
            # NOTE(review): xgboost_cv_single comes from the star-imported utils
            # module; presumably it returns a cross-validated logloss - confirm.
            cv_score = xgboost_cv_single(
                train=dtrains[k][l],
                params=params,
                num_rounds=100,
                metric_name='logloss',
            )
            # The optimiser maximises its target, so the score is negated.
            return -cv_score

        log_path = f"./drive/MyDrive/spotify/results_bayes_simple_test/genres/logs_{k}_{l}.json"

        # Each (k, l) search draws its own seed from the shared generator.
        optimizer = BayesianOptimization(
            f=xgb_cv_wrapper,
            pbounds=param_grid,
            random_state=rng.randint(1e6),
            verbose=2,
        )

        # Record every optimisation step in a per-classifier log file.
        logger = JSONLogger(path=log_path)
        optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

        optimizer.maximize(init_points=5, n_iter=40)

In [46]:
# Refit the top search candidates of each (k, l) classifier, keep the one with
# the lowest test logloss, and save its learning curves and model.
#
# NOTE(review): the 'test' DMatrix drives both early stopping and the choice
# between candidates, so the resulting test logloss is optimistically biased.
n = 3  # number of best-scoring search steps to refit per classifier
results_dir = "./drive/MyDrive/spotify/results_bayes_simple_test/genres"

# savefig() and save_model() do not create missing directories.
os.makedirs(f"{results_dir}/curves", exist_ok=True)
os.makedirs(f"{results_dir}/models", exist_ok=True)

for k in range(4, 6):
    for l in range(5):
        # The optimisation log holds one JSON object per line.
        with open(f"{results_dir}/logs_{k}_{l}.json", 'r') as f:
            res = [json.loads(row) for row in f if row.strip()]
        if not res:
            raise ValueError(f"no optimisation steps logged for k={k}, l={l}")

        res = pd.DataFrame(res)
        res = res.sort_values('target', ascending=False)
        top_params = res.head(n)['params'].to_list()

        evals_result = [{} for _ in top_params]
        best_model, best_score = None, np.inf
        train_steps, test_steps = None, None

        for i, temp_params in enumerate(top_params):
            params = {**fixed_params, **temp_params}
            # The search treats max_depth as continuous; the booster needs an int.
            params['max_depth'] = int(params['max_depth'])
            # Refit with a learning rate 1.5x smaller than the searched value.
            params['learning_rate'] = params['learning_rate'] / 1.5
            bst = xgb.train(
                params,
                dtrains[k][l],
                num_boost_round=10000,
                early_stopping_rounds=5,
                evals=evals[k][l],
                evals_result=evals_result[i],
                verbose_eval=False,
            )

            # With early stopping, both the returned booster and the last entry
            # of the eval history belong to the final round, which lies past the
            # best one. Compare candidates on the best round instead and keep
            # only the trees up to and including it.
            if bst.best_score < best_score:
                best_score = bst.best_score
                best_model = bst[: bst.best_iteration + 1]
                train_steps = evals_result[i]['train']['logloss']
                test_steps = evals_result[i]['test']['logloss']

        if best_model is None:
            # Only reachable when no candidate produced a comparable score.
            raise RuntimeError(f"no usable model for k={k}, l={l}")

        x = range(len(train_steps))
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.plot(x, train_steps, label='train')
        ax.plot(x, test_steps, label='test')
        ax.set(xlabel='boosting round', ylabel='logloss', title=f"group {k}, label {l}")
        fig.legend()
        fig.savefig(f"{results_dir}/curves/{k}_{l}.png", bbox_inches='tight', dpi=150)
        # Close this specific figure so figures do not pile up across iterations.
        plt.close(fig)

        best_model.save_model(f"{results_dir}/models/model_{k}_{l}.json")
