In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier

In [2]:
# Load the raw churn dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [3]:
# Positional selection: columns 3..12 (ten columns) become the feature matrix,
# and column 13 becomes the target vector used later for fitting.
feature_frame = df.iloc[:, 3:13]
target_series = df.iloc[:, 13]

x = feature_frame.values
y = target_series.values

In [4]:
from sklearn.compose import ColumnTransformer

# This cell overwrites `x` with a wider, encoded matrix, so it is not safe to
# run twice: a second pass would one-hot encode whatever now sits at index 1
# and shift every later positional index. Fail loudly instead of corrupting x.
# The expected width of 10 comes from the `3:13` feature slice.
assert x.shape[1] == 10, (
    "x is not the raw 10-column feature matrix; re-run the feature-selection "
    "cell before one-hot encoding."
)

# One-hot encode the categorical column at index 1 (labelled "Country").
# The dummy columns are placed first; all remaining columns pass through
# unchanged after them.
ct = ColumnTransformer(
    [("Country", OneHotEncoder(), [1])],
    remainder='passthrough',
)
x = ct.fit_transform(x)

In [5]:
# Preview the first seven rows of the feature matrix after one-hot encoding.
x[:7, :]

array([[1.0, 0.0, 0.0, 619, 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
       [0.0, 0.0, 1.0, 608, 'Female', 41, 1, 83807.86, 1, 0, 1,
        112542.58],
       [1.0, 0.0, 0.0, 502, 'Female', 42, 8, 159660.8, 3, 1, 0,
        113931.57],
       [1.0, 0.0, 0.0, 699, 'Female', 39, 1, 0.0, 2, 0, 0, 93826.63],
       [0.0, 0.0, 1.0, 850, 'Female', 43, 2, 125510.82, 1, 1, 1, 79084.1],
       [0.0, 0.0, 1.0, 645, 'Male', 44, 8, 113755.78, 2, 1, 0, 149756.71],
       [1.0, 0.0, 0.0, 822, 'Male', 50, 7, 0.0, 2, 1, 1, 10062.8]],
      dtype=object)

In [6]:
# Replace the string column at index 4 (values such as 'Female' / 'Male')
# with integer codes, writing the result back into x in place.
labelencoder_x_2 = LabelEncoder()
encoded_col = labelencoder_x_2.fit_transform(x[:, 4])
x[:, 4] = encoded_col

In [7]:
# Preview the first seven rows again to check that column 4 now holds integer codes.
x[:7, :]

array([[1.0, 0.0, 0.0, 619, 0, 42, 2, 0.0, 1, 1, 1, 101348.88],
       [0.0, 0.0, 1.0, 608, 0, 41, 1, 83807.86, 1, 0, 1, 112542.58],
       [1.0, 0.0, 0.0, 502, 0, 42, 8, 159660.8, 3, 1, 0, 113931.57],
       [1.0, 0.0, 0.0, 699, 0, 39, 1, 0.0, 2, 0, 0, 93826.63],
       [0.0, 0.0, 1.0, 850, 0, 43, 2, 125510.82, 1, 1, 1, 79084.1],
       [0.0, 0.0, 1.0, 645, 1, 44, 8, 113755.78, 2, 1, 0, 149756.71],
       [1.0, 0.0, 0.0, 822, 1, 50, 7, 0.0, 2, 1, 1, 10062.8]],
      dtype=object)

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Applying XGBoost to the training set

In [9]:
classifier = XGBClassifier() # default hyperparameters for now; pass keyword arguments here to tune the model

In [10]:
# Train on the training split only; the test split is not touched here.
# As the cell's last expression, the value returned by fit() is displayed.
classifier.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [11]:
# Predict on the held-out test rows; the result is compared with y_test below.
y_prediction = classifier.predict(x_test)

In [12]:
# Confusion matrix on the test split: rows are the true classes and
# columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_prediction)
cm

array([[1497,   98],
       [ 196,  209]], dtype=int64)

Applying k-fold cross-validation

In [13]:
# 10-fold cross-validation on the training split only; one score is
# returned per fold, using the estimator's default scorer.
accuracies = cross_val_score(
    estimator=classifier,
    X=x_train,
    y=y_train,
    cv=10,
)
accuracies

array([0.87   , 0.83125, 0.8725 , 0.8425 , 0.845  , 0.8475 , 0.86875,
       0.83875, 0.84625, 0.8625 ])

In [14]:
# Average of the per-fold cross-validation scores.
np.mean(accuracies)

0.8525

In [15]:
# Spread of the per-fold scores (population standard deviation, ddof=0).
np.std(accuracies)

0.013896942109687303