In [3]:
import dask.array as da
import xgboost as xgb
from dask_ml.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.datasets import fetch_covtype
from dask_ml.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from dask.distributed import LocalCluster

### Task 1
Uruchom przykład Incremental learning z punktu 2.1 dobierając parametry tak, aby ilość danych do przeliczenia była większa niż sumaryczna ilość pamięci RAM workerów. Obserwuj daszboard i w razie niepowodzenia dostosuj wielkość i ilość chunków tak, aby obliczenia się wykonały na tych samych parametrach workerów. Zobacz jak wygląda struktura pamięci na workerach, czy nie dochodzi do zrzucania pamięci na dysk (zapewne będzie on wąskim gardłem, więc w menedżerze będzie widać jego mocne obciążenie). Zastanów się czy można to jakoś zoptymalizować przy dostępnych workerach i wykonaj kilka eksperymentów szukając większej wydajności i krótszego czasu wykonania całego zadania.

In [2]:
cov = fetch_covtype()
X, y = cov.data, cov.target

le = LabelEncoder()
y = le.fit_transform(y)

In [3]:
X = da.from_array(X, chunks=(X.shape[0] // 64, X.shape[1]))
y = da.from_array(y, chunks=(y.shape[0] // 64,))

In [4]:
params = {'objective': 'multi:softmax',
          'max_depth': 4, 'eta': 0.01, 'subsample': 0.5,
          'min_child_weight': 0.5,
          'num_class': 7}

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=94)

predictions = {}
models = []

with LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='1GB') as cluster:
    display(cluster)
    with cluster.get_client() as client:
        for i, (train, test) in enumerate(cv.split(X, y)):
            X_train = X[train, :-1]
            X_test = X[test, :-1]
            y_train = y[train]
            y_test = y[test]

            d_train = xgb.dask.DaskDMatrix(client, X_train, y_train, enable_categorical=True)
            model = xgb.dask.train(client, params=params, dtrain=d_train)
            predictions[f'fold_{i}'] = xgb.dask.predict(client, model, X_test)
            models.append(model)

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 3.73 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:50887,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 3.73 GiB

0,1
Comm: tcp://127.0.0.1:50908,Total threads: 1
Dashboard: http://127.0.0.1:50914/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:50890,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-n7s_f81_,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-n7s_f81_

0,1
Comm: tcp://127.0.0.1:50907,Total threads: 1
Dashboard: http://127.0.0.1:50910/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:50892,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-22udvda8,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-22udvda8

0,1
Comm: tcp://127.0.0.1:50906,Total threads: 1
Dashboard: http://127.0.0.1:50912/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:50894,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-5da6ynu5,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-5da6ynu5

0,1
Comm: tcp://127.0.0.1:50909,Total threads: 1
Dashboard: http://127.0.0.1:50916/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:50896,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-s153n_da,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-s153n_da


  p = blockwise(
  p = blockwise(
Windows is not officially supported for dask/xgboost, contribution are welcomed.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
Windows is not officially supported for dask/xgboost, contribution are welcomed.
Windows is not officially supported for dask/xgboost, contribution are welcomed.
  p = blockwise(
  p = blockwise(
Windows is not officially supported for dask/xgboost, contribution are welcomed.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
Windows is not officially supported for dask/xgboost, contribution are w

In [5]:
predictions

{'fold_0': dask.array<mapped_predict, shape=(116203,), dtype=float32, chunksize=(9078,), chunktype=numpy.ndarray>,
 'fold_1': dask.array<mapped_predict, shape=(116203,), dtype=float32, chunksize=(9078,), chunktype=numpy.ndarray>,
 'fold_2': dask.array<mapped_predict, shape=(116202,), dtype=float32, chunksize=(9078,), chunktype=numpy.ndarray>,
 'fold_3': dask.array<mapped_predict, shape=(116202,), dtype=float32, chunksize=(9078,), chunktype=numpy.ndarray>,
 'fold_4': dask.array<mapped_predict, shape=(116202,), dtype=float32, chunksize=(9078,), chunktype=numpy.ndarray>}

### Task 2
Dokonaj serializacji modelu z zadania 1 na dysk i następnie go wczytaj ponownie tak, aby można było uruchomić na nim predykcję dla tablic X_test oraz y_test (dla użycia miar klasyfikacji) i wyświetl macierz klasyfikacji (confusion matrix).

In [9]:
model_zero = models[0]['booster']
model_zero

<xgboost.core.Booster at 0x20a67085dc0>

In [10]:
model_zero.save_model('xgb_model.json')

In [11]:
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgb_model.json')

In [13]:
y_pred = loaded_model.predict(X_test)

In [14]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[41370, 10402,     0,     0,     0,   422],
       [20278, 25278,   911,     0,   145,    90],
       [    0,  2000,  5556,     0,   140,     0],
       [    0,   222,    15,     0,     0,     0],
       [    0,   922,  1349,     0,   245,     0],
       [ 3533,    26,     0,     0,     0,  3298]])

### Task 3
Korzystając z danych stworzonych w zadaniu 1 uruchom poszukiwanie optymalnych parametrów modelu tak jak zostało to zaprezentowane w przykładzie 5. Ta metoda powinna sama wybierać modele obiecujące i trenować je na większej liczbie danych porzucając jednocześnie modele, które nie rokują. Sprawdź jak wyglądają najlepsze wyliczone parametry vs. te użyte w zadaniu 1 i ewentualnie dopasuj próbkę danych jeżeli jej inicjalna wielkość nie pozwala na wykonanie zadania (zwróć uwagę na ilość i wielkość chunków w przykładzie 3 oraz 5, w tym drugim jest ich znacznie więcej, co przyspiesza poszukiwanie optymalnych parametrów).

In [4]:
cov = fetch_covtype()
X, y = cov.data, cov.target

le = LabelEncoder()
y = le.fit_transform(y)

X = da.from_array(X, chunks=(X.shape[0] // 2048, X.shape[1]))
y = da.from_array(y, chunks=(y.shape[0] // 2048,))

In [5]:
params = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1]
}

XGBClassifier nie działa z IncrementalSearchCV? "The estimator must implement partial_fit, set_params, and work well with clone."

In [6]:
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

with LocalCluster(n_workers=4, threads_per_worker=2, memory_limit='2GB') as cluster:
    display(cluster)
    with cluster.get_client() as client:
        search = GridSearchCV(model, params, cv=3)
        search.fit(X, y)
        
        print("Best params:", search.best_params_)

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 7.45 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:51756,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 7.45 GiB

0,1
Comm: tcp://127.0.0.1:51775,Total threads: 2
Dashboard: http://127.0.0.1:51780/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:51759,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-3omhb161,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-3omhb161

0,1
Comm: tcp://127.0.0.1:51776,Total threads: 2
Dashboard: http://127.0.0.1:51781/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:51761,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-1h0pusrg,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-1h0pusrg

0,1
Comm: tcp://127.0.0.1:51777,Total threads: 2
Dashboard: http://127.0.0.1:51779/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:51763,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-_t0tir02,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-_t0tir02

0,1
Comm: tcp://127.0.0.1:51778,Total threads: 2
Dashboard: http://127.0.0.1:51785/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:51765,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-5xl9x0hz,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-5xl9x0hz


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


KeyboardInterrupt: 