In [1]:
import dask
import dask.array as da
import joblib
import numpy as np
from dask.distributed import Client, LocalCluster
from dask_ml.datasets import make_classification
from dask_ml.model_selection import IncrementalSearchCV
from dask_ml.model_selection import train_test_split
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# Start a local Dask cluster: 4 workers with 2 threads each, memory_limit of
# "8GB", dashboard served on port 8234.
cluster = LocalCluster(n_workers=4, threads_per_worker=2, memory_limit="8GB", dashboard_address=':8234')
client = Client(cluster)

# Task 1: incremental training of a linear classifier on a Dask array.
# A single fixed seed is threaded through data generation, the train/test
# split, the SGD estimator and the Incremental wrapper so that a fresh
# "Restart & Run All" yields the same data and the same scores.
RANDOM_STATE = 0

n_samples, n_features = 1000000, 500
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    chunks=n_samples // 64,  # 64 row-blocks of equal size
    flip_y=0.2,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# partial_fit needs the full label set up front, because a single block is
# not guaranteed to contain every class.
classes = da.unique(y_train).compute()

est = SGDClassifier(loss='log_loss', penalty='l2', tol=1e-3, random_state=RANDOM_STATE)
inc = Incremental(est, scoring='accuracy', random_state=RANDOM_STATE)
inc.fit(X_train, y_train, classes=classes)
train_score = inc.score(X_train, y_train)
test_score = inc.score(X_test, y_test)

print(f"Train Score: {train_score}, Test Score: {test_score}")



Train Score: 0.5150149783103399, Test Score: 0.5156649872040947


In [3]:
# Task 2

# Helpers for serializing the model to disk and loading it back
def save_model(model, path):
    """Write ``model`` to the file at ``path`` using ``joblib.dump``.

    Parameters
    ----------
    model : object
        The object to serialize (here: the fitted estimator).
    path : str or path-like
        Destination file name.

    Returns
    -------
    None
        The value returned by ``joblib.dump`` is intentionally discarded.
    """
    joblib.dump(value=model, filename=path)

def load_model(path):
    """Read back an object previously stored with ``joblib.dump``.

    Parameters
    ----------
    path : str or path-like
        File to read.

    Returns
    -------
    object
        Whatever ``joblib.load`` reconstructs from ``path``.

    Notes
    -----
    joblib files are pickle-based, so only load files from a trusted source.
    """
    restored = joblib.load(filename=path)
    return restored

# Persist the fitted model to disk.
save_model(inc, "incremental_model.joblib")

# Load the model back and predict on the held-out split.
model = load_model("incremental_model.joblib")
y_pred = model.predict(X_test)

# Materialize the true labels and the predictions with ONE dask.compute call:
# two separate .compute() calls evaluate each graph independently, whereas a
# joint call lets the scheduler run any upstream tasks they share only once.
y_true_np, y_pred_np = dask.compute(y_test, y_pred)
conf_matrix = confusion_matrix(y_true_np, y_pred_np)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[25116 24789]
 [23660 26467]]


In [4]:
# Task 3: search for the best hyper-parameters

# Search space: 1000 log-spaced alphas in [1e-2, 1e1], 1000 evenly spaced
# l1_ratio values in [0, 1], and averaging switched on or off.
params = {
    'alpha': np.logspace(-2, 1, num=1000),
    'l1_ratio': np.linspace(0, 1, num=1000),
    'average': [True, False]
}

# Configure IncrementalSearchCV, starting from 1000 sampled parameter sets.
# NOTE(review): the recorded output of this cell shows a `decay_rate`
# deprecation warning that suggests InverseDecaySearchCV or decay_rate=None;
# confirm which behavior is wanted before silencing it.
search = IncrementalSearchCV(SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0), params, random_state=0, n_initial_parameters=1000)
search.fit(X_train, y_train, classes=classes)

print("Best Score:", search.best_score_)
print("Best Parameters:", search.best_params_)

# Shut Dask down once the work is finished. The LocalCluster was created
# explicitly and handed to the Client, so closing the client alone leaves the
# workers and the dashboard port (8234) alive; close the cluster as well.
client.close()
cluster.close()


    * Use InverseDecaySearchCV to use `decay_rate`
    * Specify decay_rate=None


  warn(


Best Score: 0.5730113636363636
Best Parameters: {'l1_ratio': 0.3983983983983984, 'average': False, 'alpha': 0.03447764054734464}
