In [1]:
# 1. GradientBoostingClassifier

In [15]:
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [16]:
x, y = make_classification(n_samples=1000, n_features = 20, n_informative=15, n_redundant=5, random_state=7)

In [17]:
model = GradientBoostingClassifier()

In [18]:
param_grid = {
    'n_estimators' : [10, 50, 100, 500],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample' : [0.5, 0.7, 1.0],
    'max_depth' : [3, 7, 9]
}

In [19]:
cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=1)

In [20]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs =-1, cv=cv, scoring='accuracy')

In [21]:
grid_result = grid_search.fit(x,y)

In [22]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.944667 using {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}


------------------

# Using XGBoost

In [29]:
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [30]:
x ,y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=42)

In [31]:
dtrain = xgb.DMatrix(x_train,label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [32]:
params = {
    'objective' : 'binary:logistic',
    'max_depth' : 3,
    'learning_rate': 0.1,
    'n_estimators' : 100
}

In [33]:
model = xgb.train(params, dtrain, num_boost_round=100)

Parameters: { "n_estimators" } are not used.



In [41]:
y_pred_proba = model.predict(dtest)  # Get predicted probabilities
y_pred = (y_pred_proba >= 0.5).astype(int) 

In [42]:
from sklearn.metrics import accuracy_score

In [44]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%", accuracy * 100)

Accuracy: %.2f%% 89.33333333333333


---------------------

# Using `LightGBM`
`LightGBM` is designed for efficiency and speed.

In [69]:
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [70]:
x, y = make_classification(n_samples=10000, n_features=20,n_informative=15,n_redundant=5, random_state=7)

In [71]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=7, test_size=0.3)

In [72]:
ltrain= lgb.Dataset(x_train, label=y_train)
ltest = lgb.Dataset(x_test, label=y_test)

In [73]:
params = {
    'objective' : 'binary',
    'metrics' : 'binary_logloss',
    'learning_rate' : 0.1,
    'num_leaves' : 31,
    'n_estimators' : 100
}

In [74]:
model = lgb.train(params, ltrain, num_boost_round=100)



[LightGBM] [Info] Number of positive: 3513, number of negative: 3487
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501857 -> initscore=0.007429
[LightGBM] [Info] Start training from score 0.007429


In [78]:
y_pred_prob = model. predict(x_test)

In [79]:
y_pred = (y_pred_prob >= 0.5).astype(int)


In [80]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9816666666666667

________________

# CatBoost

In [82]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting plotly (from catboost)
  Downloading plotly-5.23.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tenacity>=6.2.0 (from plotly->catboost)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp312-cp312-manylinux2014_x86_64.whl (98.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.1/98.1 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading plotly-5.23.0-py3-none-any.whl (17.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[?25hDownloading tenacity-9.0.0-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, plotly, catboost
Successfully installed catboost-1.2.5 plotly-5.23.0 tenacity-9.0.0

[1m[[0m[34;49mnotice[0m[1

In [83]:
import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [86]:
data = load_breast_cancer()
x,y = data.data, data.target

In [87]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.3, random_state=42)

In [88]:
model = cb.CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3)

In [89]:
model.fit(x_train, y_train)

0:	learn: 0.5710005	total: 50.2ms	remaining: 4.97s
1:	learn: 0.4675621	total: 52.2ms	remaining: 2.56s
2:	learn: 0.4050141	total: 53.4ms	remaining: 1.73s
3:	learn: 0.3480606	total: 54.4ms	remaining: 1.3s
4:	learn: 0.2935011	total: 55.4ms	remaining: 1.05s
5:	learn: 0.2611596	total: 56.9ms	remaining: 892ms
6:	learn: 0.2213728	total: 58ms	remaining: 771ms
7:	learn: 0.2040785	total: 59.2ms	remaining: 681ms
8:	learn: 0.1936829	total: 60.2ms	remaining: 609ms
9:	learn: 0.1749248	total: 61.6ms	remaining: 554ms
10:	learn: 0.1539593	total: 62.8ms	remaining: 508ms
11:	learn: 0.1404436	total: 64.1ms	remaining: 470ms
12:	learn: 0.1277966	total: 65.2ms	remaining: 436ms
13:	learn: 0.1178603	total: 66.2ms	remaining: 407ms
14:	learn: 0.1104187	total: 67.2ms	remaining: 381ms
15:	learn: 0.1052006	total: 68.4ms	remaining: 359ms
16:	learn: 0.0991373	total: 69.4ms	remaining: 339ms
17:	learn: 0.0913972	total: 70.9ms	remaining: 323ms
18:	learn: 0.0847366	total: 72ms	remaining: 307ms
19:	learn: 0.0812191	total:

<catboost.core.CatBoostClassifier at 0x759c39ff9e20>

In [90]:
y_pred = model.predict(x_test)

In [92]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9649122807017544

--------------------------------

# Dask-XGBoost

In [98]:
!pip install "dask[distributed]" --upgrade

Collecting distributed==2024.7.1 (from dask[distributed])
  Downloading distributed-2024.7.1-py3-none-any.whl.metadata (3.4 kB)
Collecting msgpack>=1.0.0 (from distributed==2024.7.1->dask[distributed])
  Downloading msgpack-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting sortedcontainers>=2.0.5 (from distributed==2024.7.1->dask[distributed])
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting tblib>=1.6.0 (from distributed==2024.7.1->dask[distributed])
  Downloading tblib-3.0.0-py3-none-any.whl.metadata (25 kB)
Collecting zict>=3.0.0 (from distributed==2024.7.1->dask[distributed])
  Downloading zict-3.0.0-py2.py3-none-any.whl.metadata (899 bytes)
Downloading distributed-2024.7.1-py3-none-any.whl (1.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hDownloading msgpack-1.0.8-cp312-cp312-ma

In [99]:
import xgboost as xgb
import dask.dataframe as dd
from dask.distributed import Client

In [100]:
df = dd.read_csv('dataset/stroke/healthcare-dataset-stroke-data.csv')

In [101]:
x = df.iloc[:,:-1]
y = df.iloc[:, -1]

In [129]:
# Split the data
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Create a Dask DMatrix
dtrain = xgb.dask.DaskDMatrix(client, X, y)

# Train the model
output = xgb.dask.train(client, {'objective': 'binary:logistic'}, dtrain, num_boost_round=100)

# Get the best model
booster = output['booster']

NameError: name 'client' is not defined

----------------

# Using Optuna For hyperparameter tuning

In [105]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.31-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Downloading greenlet-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [38;2;114;156;31m━━━

In [106]:
import optuna
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [109]:
data = load_breast_cancer()
x, y = data.data, data.target
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=4)

In [118]:
# Defining an objective function for optuna
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 2, 10)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)

    model = xgb.XGBClassifier(max_depth=max_depth,n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
    model.fit(x_train,y_train)
    score = model.score(x_test,y_test)
    return score

In [119]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-07-31 05:09:12,845] A new study created in memory with name: no-name-5ae19c7d-fa7a-408a-8a54-8f074f301a50
[I 2024-07-31 05:09:12,978] Trial 0 finished with value: 0.9473684210526315 and parameters: {'max_depth': 5, 'n_estimators': 395, 'learning_rate': 0.16396328789814654}. Best is trial 0 with value: 0.9473684210526315.
[I 2024-07-31 05:09:13,057] Trial 1 finished with value: 0.9649122807017544 and parameters: {'max_depth': 5, 'n_estimators': 385, 'learning_rate': 0.2676136338546822}. Best is trial 1 with value: 0.9649122807017544.
[I 2024-07-31 05:09:13,138] Trial 2 finished with value: 0.9532163742690059 and parameters: {'max_depth': 8, 'n_estimators': 433, 'learning_rate': 0.28220574302398915}. Best is trial 1 with value: 0.9649122807017544.
[I 2024-07-31 05:09:13,226] Trial 3 finished with value: 0.9473684210526315 and parameters: {'max_depth': 7, 'n_estimators': 346, 'learning_rate': 0.13780112701763061}. Best is trial 1 with value: 0.9649122807017544.
[I 2024-07-31 05:09

In [123]:
best_params = study.best_params
best_params

{'max_depth': 5, 'n_estimators': 385, 'learning_rate': 0.2676136338546822}

In [125]:
y_pred = model.predict(x_test)

In [128]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9824561403508771