In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from xgboost import XGBRFClassifier

In [2]:
# build the synthetic classification dataset
def get_dataset():
    """Generate the synthetic classification problem used throughout the notebook.

    Returns:
        The ``(X, y)`` pair produced by ``make_classification`` for 1000
        samples and 20 features (15 informative, 5 redundant), seeded with
        ``random_state=7``.
    """
    settings = {
        "n_samples": 1000,
        "n_features": 20,
        "n_informative": 15,
        "n_redundant": 5,
        "random_state": 7,
    }
    features, labels = make_classification(**settings)
    return features, labels

In [11]:
# get a list of models to evaluate
def get_models(model):
    """Build one estimator per ``colsample_bynode`` ratio from 0.1 to 1.0.

    Args:
        model: Callable (for example an estimator class) that accepts the
            keyword arguments ``n_estimators``, ``subsample`` and
            ``colsample_bynode``.

    Returns:
        dict: Maps each ratio formatted with one decimal (``"0.1"`` ...
        ``"1.0"``), in increasing order, to the object returned by ``model``
        for ``n_estimators=100``, ``subsample=0.9`` and that ratio.
    """
    models = dict()
    # Derive the ratios from integers instead of a float-step arange: this
    # yields exact values (0.3 rather than 0.30000000000000004), plain Python
    # floats, and keeps the function independent of names imported elsewhere
    # in the notebook.
    for tenths in range(1, 11):
        v = tenths / 10
        key = "%.1f" % v
        models[key] = model(
            n_estimators=100, subsample=0.9, colsample_bynode=v
        )
    return models

In [4]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)`` with repeated stratified k-fold CV.

    The procedure uses 10 splits repeated 3 times (``random_state=1``),
    accuracy as the metric, and ``n_jobs=-1``.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature data.
        y: Target labels.

    Returns:
        The scores returned by ``cross_val_score``.
    """
    folds = RepeatedStratifiedKFold(random_state=1, n_repeats=3, n_splits=10)
    return cross_val_score(model, X, y, cv=folds, n_jobs=-1, scoring="accuracy")

In [12]:
from numpy import arange, mean, std

# Build the dataset and one XGBRFClassifier candidate per colsample_bynode ratio
X, y = get_dataset()
models = get_models(XGBRFClassifier)

# Cross-validate every candidate, keeping the raw scores and their labels
results = []
names = []
for name in models:
    model = models[name]
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # report mean and standard deviation as soon as each candidate finishes
    print(">%s %.3f (%.3f)" % (name, mean(scores), std(scores)))

>0.1 0.889 (0.032)
>0.2 0.891 (0.036)
>0.3 0.887 (0.032)
>0.4 0.886 (0.030)
>0.5 0.878 (0.033)
>0.6 0.874 (0.031)
>0.7 0.869 (0.027)
>0.8 0.867 (0.027)
>0.9 0.856 (0.023)
>1.0 0.846 (0.027)


In [9]:
# Re-check a single random-forest configuration (colsample_bynode=0.2)
# with the same repeated stratified 10-fold procedure
cv = RepeatedStratifiedKFold(random_state=1, n_repeats=3, n_splits=10)
model = XGBRFClassifier(
    colsample_bynode=0.2,
    subsample=0.9,
    n_estimators=100,
)
score = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring="accuracy")
mean(score)

0.8909999999999999

In [13]:
from numpy import arange, mean, std
# XGBClassifier must be in scope before get_models() is called below;
# importing it here keeps the cell runnable on a fresh kernel.
from xgboost import XGBClassifier

# define dataset
X, y = get_dataset()
# get the models to evaluate: one XGBClassifier per colsample_bynode ratio
models = get_models(XGBClassifier)
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # evaluate the model and collect the results
    scores = evaluate_model(model, X, y)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize performance along the way (mean and std of the CV scores)
    print(">%s %.3f (%.3f)" % (name, mean(scores), std(scores)))

>0.1 0.924 (0.028)
>0.2 0.919 (0.026)
>0.3 0.930 (0.029)
>0.4 0.927 (0.024)
>0.5 0.927 (0.031)
>0.6 0.930 (0.029)
>0.7 0.926 (0.029)
>0.8 0.928 (0.026)
>0.9 0.927 (0.030)
>1.0 0.928 (0.025)


In [14]:
from xgboost import XGBClassifier

# Re-check a single boosted configuration (colsample_bynode=0.3)
# with the same repeated stratified 10-fold procedure
cv = RepeatedStratifiedKFold(random_state=1, n_repeats=3, n_splits=10)
model = XGBClassifier(
    colsample_bynode=0.3,
    subsample=0.9,
    n_estimators=100,
)
score = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring="accuracy")
mean(score)

0.9303333333333333

In [16]:
from sklearn.model_selection import GridSearchCV

In [31]:
# Hyper-parameter grid for the search: 7 x 3 x 10 = 210 combinations
parametros = dict(
    n_estimators=[10, 50, 100, 150, 200, 300, 400],
    subsample=[0.9, 0.8, 0.7],
    colsample_bynode=arange(0.1, 1.1, 0.1),
)

In [21]:
# Grid search over `parametros` using the repeated stratified CV splitter `cv`.
# n_jobs=-1 runs the fits in parallel, matching every other cross-validation
# call in this notebook; without it all candidate fits run one after another.
modelo_gs = GridSearchCV(XGBClassifier(), param_grid=parametros,
                         cv = cv, scoring='accuracy', n_jobs=-1)
modelo_gs.fit(X, y)

In [33]:
# Parallel grid search over `parametros`, scored by accuracy on the `cv`
# splitter; verbose=1 prints the fold/candidate totals.
modelo_gs = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parametros,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
modelo_gs.fit(X, y)

Fitting 30 folds for each of 210 candidates, totalling 6300 fits


In [34]:
# Report the winning parameter combination and its mean CV accuracy (2 decimals)
best_accuracy = round(modelo_gs.best_score_, 2)
print(modelo_gs.best_params_, "\naccuracy: {}".format(best_accuracy))

{'colsample_bynode': 0.2, 'n_estimators': 150, 'subsample': 0.8} 
accuracy: 0.93


In [22]:
# Show the best grid-search parameters followed by the rounded best score
accuracy_line = "\naccuracy: {}".format(round(modelo_gs.best_score_, 2))
print(modelo_gs.best_params_, accuracy_line)

{'colsample_bynode': 0.2, 'n_estimators': 150, 'subsample': 0.8} 
accuracy: 0.93


In [23]:
from xgboost import XGBClassifier

# Confirm the grid-search winner (n_estimators=150, subsample=0.8,
# colsample_bynode=0.2) with the same repeated stratified 10-fold procedure
cv = RepeatedStratifiedKFold(random_state=1, n_repeats=3, n_splits=10)
model = XGBClassifier(
    colsample_bynode=0.2,
    subsample=0.8,
    n_estimators=150,
)
score = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring="accuracy")
mean(score)

0.934