# Scaling scikit‑learn with Dask‑ML + joblib

This notebook scales the previous example with **Dask-ML** and `joblib`'s Dask backend.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from dask.distributed import Client
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LinearRegression as DaskLinearRegression
from joblib import parallel_backend

# Local client (works on single machine; you can also connect to a remote scheduler)
client = Client()
print(client)

# synthetic data (same pattern)
rng = np.random.default_rng(0)
X = (50 * rng.random((100000, 1))).astype(np.float32)
y = (0.3 * X + 1.0 + rng.normal(0, 1, size=X.shape)).astype(np.float32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = DaskLinearRegression()

with parallel_backend("dask"):
    model.fit(X_train, y_train)

print("R2 train:", model.score(X_train, y_train))
print("R2 test:", model.score(X_test, y_test))

# Plot a small sample for visualization
idx = np.random.choice(len(X_test), size=200, replace=False)
X_s, y_s = X_test[idx], y_test[idx]
y_hat = model.predict(X_s)

plt.figure()
plt.scatter(X_s, y_s, s=12)
plt.scatter(X_s, y_hat, s=12)
plt.title("Dask-ML Linear Regression (sampled view)")
plt.show()