# XGBoost + GridSearchCV

In [18]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

### Dataset

In [3]:
df = pd.read_csv("data.csv")
X = df.drop(columns=["id", "Unnamed: 32", "diagnosis"])
y = df["diagnosis"].map({'B': 0, 'M': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=8)

In [9]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

### Model + Grid Search CV

In [4]:
model = xgb.XGBClassifier()

pipeline = Pipeline([
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA()), 
    ('model', model)
])

param_grid = {
    'pca__n_components': [5, 10, 15, 20, 25, 30],
    'model__max_depth': [2, 3, 5, 7, 10],
    'model__n_estimators': [10, 100, 500],
}

grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc')

### Training

In [5]:
%%time

grid.fit(X_train, y_train)

CPU times: user 2min 37s, sys: 1.01 s, total: 2min 38s
Wall time: 1min


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standard_scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('model',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                             

### Evaluation

In [6]:
mean_score = grid.cv_results_["mean_test_score"][grid.best_index_]
std_score = grid.cv_results_["std_test_score"][grid.best_index_]

grid.best_params_, mean_score, std_score

print(f"Best parameters: {grid.best_params_}")
print(f"Mean CV score: {mean_score: .6f}")
print(f"Standard deviation of CV score: {std_score: .6f}")

Best parameters: {'model__max_depth': 2, 'model__n_estimators': 500, 'pca__n_components': 15}
Mean CV score:  0.988988
Standard deviation of CV score:  0.009521


In [12]:
model = xgb.XGBClassifier(**grid.best_params_)

pipeline = Pipeline([
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA()), 
    ('model', model)
])

In [19]:
pipeline.fit(X_train, y_train)
y_pred=pipeline.predict(X_test)
f1_score=f1_score(np.array(y_test),y_pred)
f1_score

Parameters: { "model__max_depth", "model__n_estimators", "pca__n_components" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




0.9428571428571428

In [20]:
y_proba = pipeline.predict_proba(X_test)[:,1]
roc_auc_score=roc_auc_score(np.array(y_test),y_proba)
roc_auc_score

0.987360057782593