# **Customer Churn Prediction Using `XGBoost`**




## **Importing Dependencies**

In [1]:
# Import the dependencies
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')

## **Creating Dataframe**




In [3]:
# Make data1.csv available in the working directory.
# The Google Colab upload widget is only invoked when the file is missing, so the notebook also
# runs outside Colab (where `google.colab` cannot be imported) as long as data1.csv sits next to
# it, and a re-run does not prompt for the same file again.
import os

if os.path.exists("data1.csv"):
    # Nothing to upload; keep the name defined with the same mapping type the widget returns.
    uploaded = {}
else:
    from google.colab import files
    uploaded = files.upload()

Saving data1.csv to data1.csv


In [5]:
# Create dataframe from the uploaded CSV; the file uses ';' as its field separator
df = pd.read_csv("data1.csv", sep = ";")

In [6]:
# Preview the first rows to check that the file was parsed as expected
df.head()

Unnamed: 0,account length,location code,user id,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn
0,128,415,3824657,no,yes,25,265,45,17,110,197,87,2447,91,1101,10,3,27,1,0
1,107,415,3717191,no,yes,26,162,27,17,123,196,103,2544,103,1145,137,3,37,1,0
2,137,415,3581921,no,no,0,243,41,10,114,121,110,1626,104,732,122,5,329,0,0
3,84,408,3759999,yes,no,0,299,51,5,71,62,88,1969,89,886,66,7,178,2,0
4,75,415,3306626,yes,no,0,167,28,13,113,148,122,1869,121,841,101,3,273,3,0


## **Preparing the Dataset**

In [7]:
# Check the shape of the dataframe as (rows, columns)
df.shape

(3333, 20)

In [8]:
# List every column together with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   account length                       3333 non-null   int64 
 1   location code                        3333 non-null   int64 
 2   user id                              3333 non-null   int64 
 3   credit card info save                3333 non-null   object
 4   push status                          3333 non-null   object
 5   add to wishlist                      3333 non-null   int64 
 6   desktop sessions                     3333 non-null   int64 
 7   app sessions                         3333 non-null   int64 
 8   desktop transactions                 3333 non-null   int64 
 9   total product detail views           3333 non-null   int64 
 10  session duration                     3333 non-null   int64 
 11  promotion clicks                     3333 n

In [9]:
# Checking the unique values of the location code column
df["location code"].unique()

array([415, 408, 510])

In [10]:
# Changing the data type of the location code column from integer to string,
# so that the codes are handled as labels rather than as numeric quantities
df["location code"] = df["location code"].astype(str)

We can also see that two columns, `credit card info save` and `push status`, hold `yes` and `no` values. We could treat these as categorical values, but let's change them to `1` and `0` instead.

In [11]:
# Inspect the distinct values stored in the credit card info save column
df["credit card info save"].unique()

array(['no', 'yes'], dtype=object)

In [12]:
# Inspect the distinct values of the second yes/no column, `push status`
# (this cell previously repeated the `credit card info save` check verbatim)
df["push status"].unique()

array(['yes', 'no'], dtype=object)

In [13]:
# Changing the values from yes and no to 1 and 0.
# The explicit int64 cast matters: `replace` on an object column only yields an integer column
# through pandas' implicit downcasting, which is deprecated (FutureWarning in pandas 2.2), and
# XGBoost rejects object-dtype columns. The cast also fails loudly if a column holds any value
# other than yes/no, and leaves already-encoded columns untouched when the cell is re-run.
yes_no_to_int = {"yes": 1, "no": 0}
for flag_col in ["credit card info save", "push status"]:
    df[flag_col] = df[flag_col].replace(yes_no_to_int).astype("int64")

In [14]:
# Four columns hold numbers written as strings with a comma as the decimal mark;
# swap the comma for a dot and cast each column to float.
decimal_comma_cols = [
    "avg order value",
    "discount rate per visited products",
    "product detail view per app session",
    "add to cart per session",
]
for col_name in decimal_comma_cols:
    df[col_name] = df[col_name].str.replace(',', '.').astype(float)

Next, we handle the categorical values in the column `location code` using the `get_dummies` function from `pandas`.

In [15]:
# Handling the categorical values using get_dummies: `location code` is replaced by
# one indicator column per distinct code
df = pd.get_dummies(df, columns = ["location code"])

The column `user id` is irrelevant, so we drop it.

In [16]:
# Remove the user id column from the dataframe
df = df.drop(columns="user id")

In [17]:
# Preview the dataframe after encoding the flags, parsing the decimals and dropping user id
df.head()

Unnamed: 0,account length,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,...,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn,location code_408,location code_415,location code_510
0,128,0,1,25,265,45,17,110,197,87,...,91,11.01,10.0,3,2.7,1,0,False,True,False
1,107,0,1,26,162,27,17,123,196,103,...,103,11.45,13.7,3,3.7,1,0,False,True,False
2,137,0,0,0,243,41,10,114,121,110,...,104,7.32,12.2,5,3.29,0,0,False,True,False
3,84,1,0,0,299,51,5,71,62,88,...,89,8.86,6.6,7,1.78,2,0,True,False,False
4,75,1,0,0,167,28,13,113,148,122,...,121,8.41,10.1,3,2.73,3,0,False,True,False


In [18]:
# Check the column names (used to pick the columns to scale in the next cell)
df.columns

Index(['account length', 'credit card info save', 'push status',
       'add to wishlist', 'desktop sessions', 'app sessions',
       'desktop transactions', 'total product detail views',
       'session duration', 'promotion clicks', 'avg order value',
       'sale product views', 'discount rate per visited products',
       'product detail view per app session', 'app transactions',
       'add to cart per session', 'customer service calls', 'churn',
       'location code_408', 'location code_415', 'location code_510'],
      dtype='object')

In [19]:
# Normalizing some column values
cols_to_scale = ['account length',
       'add to wishlist', 'desktop sessions', 'app sessions',
       'desktop transactions', 'total product detail views',
       'session duration', 'promotion clicks', 'avg order value',
       'sale product views', 'discount rate per visited products',
       'product detail view per app session', 'app transactions',
       'add to cart per session', 'customer service calls']
scaler = Normalizer()
scaled_data =scaler.fit_transform(df[cols_to_scale])
scaled_df = pd.DataFrame(scaled_data, index = df.index, columns = cols_to_scale)

In [20]:
# Preview the rescaled columns
scaled_df.head()

Unnamed: 0,account length,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls
0,0.275142,0.053739,0.569631,0.09673,0.036542,0.236451,0.423461,0.187011,0.525995,0.195609,0.023667,0.021496,0.006449,0.005804,0.00215
1,0.252755,0.061417,0.382676,0.063779,0.040157,0.290551,0.462991,0.243307,0.600944,0.243307,0.027047,0.032362,0.007087,0.00874,0.002362
2,0.345945,0.0,0.613611,0.103531,0.025251,0.287867,0.305543,0.277766,0.410589,0.262615,0.018484,0.030807,0.012626,0.008308,0.0
3,0.208327,0.0,0.741543,0.126484,0.0124,0.176086,0.153765,0.218247,0.488327,0.220727,0.021973,0.016369,0.017361,0.004415,0.00496
4,0.205041,0.0,0.456559,0.076549,0.035541,0.308929,0.404615,0.333534,0.510963,0.3308,0.022992,0.027612,0.008202,0.007464,0.008202


In [21]:
# Swap the unscaled columns for their normalized versions, aligning the rows on the index
df = df.drop(columns=cols_to_scale).merge(scaled_df, left_index=True, right_index=True)

In [22]:
# Separate the target (churn) from the features, then hold out 33% of the rows for testing;
# the fixed random_state keeps the split reproducible
y = df["churn"]
X = df.drop(columns="churn")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [23]:
# Printing the shapes of X_train, X_test, y_train and y_test to confirm the split sizes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2233, 20) (1100, 20) (2233,) (1100,)


Now that our data is prepared for training, let's build our model.

## **Building Our Model**



In [24]:
# Build the baseline model: an XGBoost classifier created without any explicit
# hyperparameters, fitted on the training split
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)

Next, we use the trained model to predict the labels for `X_test`; these predictions are then compared with `y_test`.

In [25]:
# Predicting on the test data (X_test) with the baseline model
preds = xgb_cl.predict(X_test)

In [26]:
# Print the model accuracy: the share of test rows whose predicted label matches y_test
# NOTE(review): if churners are a small minority of the rows, accuracy alone can look good for a
# weak model — confirm the class balance or report an additional metric.
acc = accuracy_score(y_test, preds)
print("Model Accuracy for Test Dataset:", acc)

Model Accuracy for Test Dataset: 0.92


## **Hyper-parameter Tuning**


In [27]:
# Hyperparameter tuning: exhaustive grid search over the values below, using 3-fold
# cross-validation on the training split and ROC AUC as the selection metric.
param_grid = {
    "max_depth": [5],
    # A learning rate of 0 was dropped from this grid: with a zero step size no boosting round
    # can change the predictions, so those candidates only cost fits and cannot be selected
    # over a model that actually learns.
    "learning_rate": [0.01, 0.05, 0.1],
    "gamma": [1, 5, 10],
    "scale_pos_weight": [2, 5, 10, 20],
    "subsample": [1],
    "colsample_bytree": [1],
}
xgb_cl2 = xgb.XGBClassifier(objective="binary:logistic")
grid_cv = GridSearchCV(xgb_cl2, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")
# Assign the return value to `_` so the fitted search object is not echoed as cell output
_ = grid_cv.fit(X_train, y_train)
print("The Best Score:", grid_cv.best_score_)
print("The Best Params:", grid_cv.best_params_)

The Best Score: 0.8758742590553279
The Best Params: {'colsample_bytree': 1, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 5, 'scale_pos_weight': 2, 'subsample': 1}


In [28]:
# Train a fresh classifier with the best parameters found by the grid search,
# then report its accuracy on the held-out test split
final_cl = xgb.XGBClassifier(objective="binary:logistic", **grid_cv.best_params_)
grid_final = final_cl.fit(X_train, y_train)
preds = grid_final.predict(X_test)
acc = accuracy_score(y_test, preds)
print("Accuracy of the Final Model:", acc)

Accuracy of the Final Model: 0.9254545454545454
