In [2]:
# Boosting

# xgBoost (Extreme Gradient Boosting)

# xgB-Classifier

!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-macosx_10_15_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-macosx_10_15_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.0.4


In [4]:
import pandas as pd

# Load the dataset from the notebook's working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Travel.csv')

In [5]:
# Target vector: the `ProdTaken` column.
y = df['ProdTaken']
# Feature matrix: every column except the target and `CustomerID`.
X = df.drop(columns=['ProdTaken','CustomerID'])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions of the target the same in the train and test partitions, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [7]:
# Column labels of the training features grouped by dtype: int/float columns
# go to the numeric group, object columns to the categorical group.
num_feats = X_train.select_dtypes(include=['int','float']).columns
cat_feats = X_train.select_dtypes(include=['object']).columns

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numeric branch: fill gaps with the column median; no scaling is applied.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

# Route each column group through its own branch and concatenate the results
# (numeric block first, then the one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_feats),
        ('cat', cat_transformer, cat_feats),
    ]
)

# End-to-end estimator: preprocessing followed by the XGBoost classifier.
# The step name 'XGB-Class' is the prefix used to address the classifier's
# hyper-parameters from outside the pipeline.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('XGB-Class', XGBClassifier(random_state=42)),
    ]
)

model_pipeline.fit(X_train, y_train)

In [9]:
# Baseline model outputs on the held-out rows: the probability column at
# index 1 of predict_proba, then the hard class labels.
y_proba = model_pipeline.predict_proba(X_test)[:, 1]
y_pred = model_pipeline.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve

# Label-based metrics for the baseline pipeline on the held-out set.
report = classification_report(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# ROC AUC is ranking-based, so it is computed from the probabilities rather
# than from the hard labels.
auc_score = roc_auc_score(y_true=y_test, y_score=y_proba)

# One print call, newline-separated, emits the same three lines of output.
print(
    f'accuracy_score = {accuracy}',
    f'confusion_matrix = {cm}',
    f'classification report =\n {report}',
    sep='\n',
)

accuracy_score = 0.929107021131561
confusion_matrix = [[1172   19]
 [  85  191]]
classification report =
               precision    recall  f1-score   support

           0       0.93      0.98      0.96      1191
           1       0.91      0.69      0.79       276

    accuracy                           0.93      1467
   macro avg       0.92      0.84      0.87      1467
weighted avg       0.93      0.93      0.93      1467



In [None]:
import warnings as w;w.filterwarnings('ignore')

from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search space. Keys use the '<step>__<param>' form so each
# value is routed to the 'XGB-Class' step of the pipeline.
params = {
    'XGB-Class__learning_rate':[0.1,0.01],
    'XGB-Class__colsample_bytree':[0.5,0.8,1,0.4],
    'XGB-Class__n_estimators':[100,200,300],
    'XGB-Class__max_depth':[5,8,12,15,None,10]
}

# Randomized search over the space above: 100 of the 2*4*3*6 = 144 possible
# combinations are drawn and each is scored with 3-fold cross-validation on
# the training split. random_state fixes which combinations are drawn, so a
# Restart & Run All evaluates the same candidates instead of a fresh random
# subset every time.
randomcv = RandomizedSearchCV(
    model_pipeline,
    param_distributions=params,
    n_iter=100,
    cv=3,
    random_state=42,
)

randomcv.fit(X_train, y_train)

# Predictions from the tuned search object on the held-out rows: hard labels
# and the probability column at index 1.
y_pred_ = randomcv.predict(X_test)
y_proba_ = randomcv.predict_proba(X_test)[:,1]

# Score the TUNED model's predictions (y_pred_). Scoring y_pred here would
# silently re-report the baseline pipeline's numbers and hide the effect of
# the search.
accuracy = accuracy_score(y_test,y_pred_)
cm = confusion_matrix(y_test,y_pred_)
report = classification_report(y_test,y_pred_)

print(f'accuracy_score = {accuracy}')
print(f'confusion_matrix = {cm}')
print(f'classification report =\n {report}')

In [None]:
import matplotlib.pyplot as plt

# ROC curve for the tuned model, built from the probabilities produced by the
# randomized search (y_proba_).
auc_score = roc_auc_score(y_test, y_proba_)
fpr, tpr, threshold = roc_curve(y_test, y_proba_)

# Explicit figure/axes so labels and title are attached to this plot only.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'auc = {auc_score:.3f}')
# Diagonal reference line: TPR equal to FPR at every threshold.
ax.plot([0, 1], [0, 1], 'y--', label='chance')
ax.set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve - tuned XGBoost pipeline',
)
ax.legend()
plt.show()