In [16]:
import optuna
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from dask.distributed import wait
import time

In [131]:
# Tear down any Dask client/cluster left over from an earlier run before a new
# one is created. On a fresh kernel neither name exists yet, so look them up
# defensively instead of failing with NameError. The client is closed before
# the cluster it is connected to.
for _leftover_name in ("client", "cluster"):
    _leftover = globals().get(_leftover_name)
    if _leftover is not None:
        _leftover.close()
del _leftover_name, _leftover

In [141]:
def xgb_hpo(n_trials=50):
    """Run the sequential (non-Dask) Optuna search and print the best score.

    A new study that maximizes the value returned by the notebook-level
    ``objective`` is created, optimized directly via ``study.optimize``, and
    its ``best_value`` is printed. Nothing is returned.

    Args:
        n_trials: Number of trials to run. Defaults to 50.
    """
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    print(study.best_value)




In [142]:
# Time the sequential baseline: xgb_hpo() calls study.optimize directly, with
# no Dask client involved.
%time xgb_hpo()

[I 2024-04-20 21:00:42,483] A new study created in memory with name: no-name-a044cc00-bf47-4cdb-8e42-eb653fa7ccae
[I 2024-04-20 21:00:42,630] Trial 0 finished with value: 0.6274025772395591 and parameters: {'lambda': 25.71617114818157, 'alpha': 41.42960811496113, 'colsample_bytree': 0.24922510700116957, 'max_depth': 2, 'min_child_weight': 3.31947921876954e-07, 'learning_rate': 0.022869705005183524, 'gamma': 1.179576060938227e-05}. Best is trial 0 with value: 0.6274025772395591.
[I 2024-04-20 21:00:43,016] Trial 1 finished with value: 0.6274025772395591 and parameters: {'lambda': 0.0002707385386426921, 'alpha': 1.1247979211013816e-05, 'colsample_bytree': 0.7695772364671551, 'max_depth': 5, 'min_child_weight': 0.04053942012956267, 'learning_rate': 2.6748701742614733e-05, 'gamma': 0.6123366831156039}. Best is trial 0 with value: 0.6274025772395591.
[I 2024-04-20 21:00:43,130] Trial 2 finished with value: 0.6274025772395591 and parameters: {'lambda': 6.860510295645791, 'alpha': 5.320353128

0.9666666666666668
CPU times: total: 18.9 s
Wall time: 25.1 s


In [132]:
import dask
from dask.distributed import Scheduler, Client, LocalCluster
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
# Start a local Dask cluster with exactly one worker running one thread, then
# connect a client to it.
cluster = LocalCluster(
    n_workers=1,
    threads_per_worker=1,
)
client = Client(cluster)

# Last expression of the cell: display the client summary.
client


Port 8787 is already in use.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 50467 instead



0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:50467/status,

0,1
Dashboard: http://127.0.0.1:50467/status,Workers: 1
Total threads: 1,Total memory: 15.89 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:50468,Workers: 1
Dashboard: http://127.0.0.1:50467/status,Total threads: 1
Started: Just now,Total memory: 15.89 GiB

0,1
Comm: tcp://127.0.0.1:50476,Total threads: 1
Dashboard: http://127.0.0.1:50477/status,Memory: 15.89 GiB
Nanny: tcp://127.0.0.1:50471,
Local directory: C:\Users\dilip\AppData\Local\Temp\dask-scratch-space\worker-i_lhinsu,Local directory: C:\Users\dilip\AppData\Local\Temp\dask-scratch-space\worker-i_lhinsu


In [133]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, cross_val_score
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of an XGBoost classifier.

    Loads the scikit-learn breast-cancer dataset, draws seven hyperparameters
    from ``trial``, builds an ``XGBClassifier`` with 10 estimators, and scores
    it with shuffled 5-fold cross-validation (``random_state=0``).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the per-fold accuracy scores.
    """
    features, labels = load_breast_cancer(return_X_y=True)

    # Sample the tunable hyperparameters one by one; the call order is fixed.
    # L2 regularization weight.
    l2_weight = trial.suggest_float("lambda", 1e-8, 100.0, log=True)
    # L1 regularization weight.
    l1_weight = trial.suggest_float("alpha", 1e-8, 100.0, log=True)
    # Column sampling applied per tree.
    column_fraction = trial.suggest_float("colsample_bytree", 0.2, 1.0)
    tree_depth = trial.suggest_int("max_depth", 2, 10, step=1)
    # Minimum child weight; larger values make the tree more conservative.
    child_weight = trial.suggest_float("min_child_weight", 1e-8, 100, log=True)
    step_size = trial.suggest_float("learning_rate", 1e-8, 1.0, log=True)
    # Controls how selective the algorithm is.
    split_threshold = trial.suggest_float("gamma", 1e-8, 1.0, log=True)

    # "lambda" is a Python keyword, so the settings are passed as an unpacked dict.
    model_kwargs = {
        "n_estimators": 10,
        "verbosity": 0,
        "lambda": l2_weight,
        "alpha": l1_weight,
        "colsample_bytree": column_fraction,
        "max_depth": tree_depth,
        "min_child_weight": child_weight,
        "learning_rate": step_size,
        "gamma": split_threshold,
        "grow_policy": "depthwise",
        "eval_metric": "logloss",
    }
    classifier = xgb.XGBClassifier(**model_kwargs)

    splitter = KFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = cross_val_score(
        classifier, features, labels, cv=splitter, scoring="accuracy"
    )
    return fold_scores.mean()

In [134]:
# Optuna storage used by the Dask-driven study: an in-memory backend wrapped in
# DaskStorage, which is the object handed to create_study below.
backend_storage = optuna.storages.InMemoryStorage()
dask_storage = optuna.integration.DaskStorage(storage=backend_storage)


def xgb_hpo_dask(n_trials=50):
    """Run the Optuna search as single-trial tasks on the Dask cluster.

    Creates a study backed by the module-level ``dask_storage``, submits
    ``n_trials`` tasks through the notebook-level ``client`` (each task calls
    ``study.optimize`` for exactly one trial of ``objective``), waits for all
    of them, prints the best value, and returns the study.

    Args:
        n_trials: Number of single-trial tasks to submit. Defaults to 50.

    Returns:
        The Optuna study that was optimized.
    """
    # NOTE(review): `objective` as written in this notebook never calls
    # trial.report(), so the MedianPruner presumably has nothing to act on - confirm.
    study_dask = optuna.create_study(
        direction='maximize',
        storage=dask_storage,
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(),
    )
    # pure=False: presumably keeps the otherwise identical submissions from
    # being collapsed into one task - see the Client.submit documentation.
    futures = [
        client.submit(study_dask.optimize, objective, n_trials=1, pure=False)
        for _ in range(n_trials)
    ]
    # Block until every submitted task has finished before reading the result.
    wait(futures)
    print(study_dask.best_value)
    return study_dask


DaskStorage is experimental (supported from v3.1.0). The interface can change in the future.



In [135]:
from dask.distributed import performance_report
# Record a Dask performance report (written to dask-report-1-1.html) while
# timing the Dask-driven search. `study` keeps the returned Optuna study for
# the plotting cells that follow.
with performance_report(filename="dask-report-1-1.html"):
    %time study = xgb_hpo_dask()

[I 2024-04-20 20:55:41,850] A new study created in memory with name: no-name-589cbf80-bc17-4f3f-a2b1-1aac1c3f25d0

DaskStorage is experimental (supported from v3.1.0). The interface can change in the future.



0.9649122807017545
CPU times: total: 8.53 s
Wall time: 39.3 s


In [24]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_rank
from optuna.visualization import plot_slice
from optuna.visualization import plot_timeline

In [136]:
# Optuna rank plot for the study returned by xgb_hpo_dask().
plot_rank(study)


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.



In [137]:
# Optimization-history plot for the study returned by xgb_hpo_dask().
plot_optimization_history(study)

In [138]:
# Parallel-coordinate plot for the study returned by xgb_hpo_dask().
plot_parallel_coordinate(study)

In [139]:
# Parameter-importance plot for the study returned by xgb_hpo_dask().
plot_param_importances(study)

In [140]:
# Release the Dask resources once the study and plots are done: the client is
# closed first, then the cluster it was connected to.
for _dask_handle in (client, cluster):
    _dask_handle.close()
del _dask_handle

In [63]:
import dask.array as da
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler
# Build a 10000 x 1000 random Dask array split into 1000 x 1000 chunks, take
# its QR factorization, and multiply the factors back together.
a = da.random.random(
    size=(10000, 1000),
    chunks=(1000, 1000),
)
q, r = da.linalg.qr(a)
a2 = q.dot(r)

# Compute the product while all three profilers are active; the ResourceProfiler
# is configured with dt=0.25. `prof`, `rprof` and `cprof` stay bound afterwards
# so they can be passed to visualize().
# NOTE(review): dask.diagnostics profilers presumably only observe the local
# schedulers - confirm no distributed client is active when this cell runs.
with Profiler() as prof:
    with ResourceProfiler(dt=0.25) as rprof:
        with CacheProfiler() as cprof:
            out = a2.compute()

In [64]:
from dask.diagnostics import visualize
# Render the results collected by the three profilers (Profiler,
# ResourceProfiler, CacheProfiler) from the QR computation cell.
visualize([prof, rprof, cprof])

-----------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
%time
data = pd.read_csv("Parking_Violations.csv")
data.head()

In [26]:
import dask.dataframe as dd
%time 
df = dd.read_csv('Parking_Violations.csv')
df = df.iloc[500:5000,:]

CPU times: total: 0 ns
Wall time: 0 ns


NotImplementedError: 'DataFrame.iloc' only supports selecting columns. It must be used like 'df.iloc[:, column_indexer]'.