In [None]:
# Kaggle starter cell (Hidden Input/Output)
# The kernel runs inside the kaggle/python Docker image (https://github.com/kaggle/docker-python),
# which already ships many analytics libraries; a couple of the useful ones are loaded below.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Attached datasets live in the read-only "../input/" directory.
# Walk that tree and print the full path of every file found in it.

import os
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output by "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not saved outside of the current session.

<div class="alert alert-block alert-info" >
    <h1 style="text-align:center;font-weight: bold; color:black;">
        Will we be able to predict whether the Customer will Churn?<br><br> Let's find out... </h1>
</div>

<!-- # <p style="background-color:Peachpuff;font-family:Helvetica;color:Black;font-size:110%;text-align:center;border-radius:20px 20px;font-weight:bold;">Will we be able to predict whether the Customer will Churn?😎😁🔮 Let's find out...</p> -->



<img src=https://4.bp.blogspot.com/-wzReF0exOt8/WddYCM4OdYI/AAAAAAAAELg/vFEwRbLUCn8bBwVpl-vUbp6NZw_OaraTQCEwYBhgL/s640/churn.png alt = "customer churn" style="display: block; margin-left: auto; margin-right: auto; width: 80%;"> </img>

 <p style="text-align:center; font-size:150%"><strong><i> “Satisfaction is a rating. Loyalty is a brand.” </i>– Shep Hyken </strong></p>

<div class="alert alert-block alert-info">
    <h1 style="text-align:center;font-weight: bold; color:black;">
        Table of Contents </h1>
</div>

<li style="font-size:150%"><a href="#1"> Initial Data Cleaning </a></li>

<li style="font-size:150%"><a href="#2"> Exploratory Data Analysis(EDA) </a></li>

<li style="font-size:150%"><a href="#3"> Visualizing Categorical Variables </a></li>

<li style="font-size:150%"><a href="#4"> Visualizing Numerical Variables </a></li>

<li style="font-size:150%"><a href="#5"> Preparing the data before Modelling </a></li>

<li style="font-size:150%"><a href="#6"> Building the Model using XGBoost </a></li>

<li style="font-size:150%"><a href="#7"> Evaluating the Model </a></li>
    
<li style="font-size:150%"><a href="#8"> Finding the important Features </a></li>

<li style="font-size:150%"><a href="#9"> Conclusion </a></li>

<li style="font-size:150%"><a href="#12"> My Other Notebooks </a></li>

<li style="font-size:150%"><a href="#13"> Credits </a></li>


<div class="alert alert-block alert-info">
<a id="1">
<h1 style="text-align:center;font-weight: bold; color:black;" > Initial Data Cleaning  </h1>
</a>
</div>

In [None]:
# Importing all the necessary libraries (Hidden Input)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, plot_confusion_matrix, roc_curve
import scikitplot as skplt
import warnings
warnings.filterwarnings("ignore")
plt.style.use('seaborn-muted')
%matplotlib inline

In [None]:
# Load the BankChurners CSV into `df` and preview its first ten rows (Hidden Input)

df = pd.read_csv(filepath_or_buffer='../input/credit-card-customers/BankChurners.csv')
df.head(n=10)

In [None]:
# Checking the number of rows and columns; `shape` is a (rows, columns) tuple

df.shape

In [None]:
# Keep every column except the first one (the client number) and the last two

kept_columns = slice(1, -2)
df = df.iloc[:, kept_columns]

In [None]:
# Percentage of null values in each feature, rounded to two decimals

null_counts = df.isnull().sum()
(null_counts * 100 / len(df)).round(2)

- There are no null values

In [None]:
# Descriptive statistics of the numerical columns, one row per feature (Hidden Input)
# The 'mean' column is drawn as in-cell bars; 'std', '75%' and 'max' each get their own colour gradient.

summary_style = df.describe().T.style.bar(subset=['mean'], color='lightsalmon')
for stat in ('std', '75%', 'max'):
    summary_style = summary_style.background_gradient(subset=[stat], cmap='plasma')
summary_style

In [None]:
# Summary of `df`: column names, non-null counts and dtypes
df.info()

In [None]:
# Encode the target as integers: existing customers -> 0, attrited (churned) customers -> 1 (Hidden Input/Output)

attrition_codes = {"Existing Customer": 0, "Attrited Customer": 1}
df["Attrition_Flag"] = df["Attrition_Flag"].map(attrition_codes)

<div class="alert alert-block alert-info">
<a id="2">
<h1 style="text-align:center;font-weight: bold; color:black;" > Exploratory Data Analysis </h1>
</a>
 </div>

<div class="alert alert-block alert-info">
<a id="3">
<h2 style="text-align:center; color:black;" >Visualizing Categorical Features</h2>
</a>
 </div>

In [None]:
# Performing EDA on the Categorical columns (Hidden Input)
# One panel per categorical feature; each bar is the mean of the 0/1 Attrition_Flag
# for that category level, i.e. the share of churned customers in it.

# Column groups by dtype; both lists are reused by later cells.
num_cols = list(df.select_dtypes(["int64","float64"]))
cat_cols = list(df.select_dtypes("object"))

# Size the grid from the data instead of hard-coding five rows (28 in tall for five panels).
# squeeze=False keeps `axes` two-dimensional even with a single categorical column.
fig, axes = plt.subplots(ncols=1, nrows=len(cat_cols), figsize=(16, 28 * len(cat_cols) / 5),
                         squeeze=False)

for panel, col in zip(axes[:, 0], cat_cols):

    sns.barplot(x=df[col], y=df['Attrition_Flag'], fill=True, alpha=1, ci=None, ax=panel,
                palette=('#05386b', '#379683', '#5cdb59', '#8ee4af', '#edf5e1'))

    panel.set_xlabel(' ')
    panel.xaxis.set_tick_params(labelsize=14)
    # Hide the y ticks; the exact values are written on the bars instead.
    panel.tick_params(left=False, labelleft=False)
    # The y label carries the feature name so each panel identifies itself.
    panel.set_ylabel(col, fontsize=16)
    panel.bar_label(panel.containers[0], size="12")

plt.show()

<div class="alert alert-block alert-info">
<a id="4">
<h2 style="text-align:center; color:black;" > Visualizing Numerical Features </h2>
</a>
 </div>

In [None]:
# Visualizing the Numerical Columns (Hidden Input) and treating outliers
# For every numerical feature (the target excluded) the rows above the 99th percentile and then
# the rows below the 1st percentile are removed from `df`; the remaining values are plotted per class.
# The trimming is cumulative: each feature is filtered on the rows that survived the previous
# features, and each pair of plots shows `df` as trimmed up to that feature.

# Exclude the target by name rather than relying on it being the first numerical column.
numeric_features = [col for col in num_cols if col != 'Attrition_Flag']

# One row of plots per feature (40 in tall for the usual fourteen); squeeze=False keeps `ax` 2-D.
fig, ax = plt.subplots(ncols=2, nrows=len(numeric_features),
                       figsize=(16, 40 * len(numeric_features) / 14), squeeze=False)

for row, col in enumerate(numeric_features):

    # Removing outliers: cut the upper tail first, then the lower tail of what is left
    upper_bound = df[col].quantile(0.99)
    df = df[df[col] <= upper_bound]
    lower_bound = df[col].quantile(0.01)
    df = df[df[col] >= lower_bound]

    # Left panel: stacked density per class
    sns.kdeplot(df[col], fill=True, alpha=1, hue=df['Attrition_Flag'],
                palette=('#1f2833','#66fcf1'), multiple='stack', ax=ax[row, 0])

    # Right panel: box plot per class
    sns.boxplot(data=df, y=col, x='Attrition_Flag', ax=ax[row, 1],
                palette=('#1f2833','#45a29e'), color='white')
    ax[row, 0].set_xlabel(' ')
    ax[row, 1].set_xlabel(' ')
    ax[row, 1].set_ylabel(' ')
    ax[row, 1].xaxis.set_tick_params(labelsize=14)
    ax[row, 0].tick_params(left=False, labelleft=False)
    # The feature name is shown once per row, on the left panel.
    ax[row, 0].set_ylabel(col, fontsize=12)

plt.show()

In [None]:
# Creating a heatmap of the numerical column (Hidden Input)

# `df` still contains the object-typed categorical columns at this point. Recent pandas
# versions no longer drop them silently inside `corr()`, so select the numeric columns explicitly.
heat = df.select_dtypes(include="number").corr()
plt.figure(figsize=[16,8])
plt.title("Correlation between all the Numerical Features", size=25, pad=20, color='#1f2833')
sns.heatmap(heat, cmap=['#0b0c10', '#1f2833','#c5c6c7','#45a29e','#66fcf1'], annot=True)
plt.show()

In [None]:
# Checking the imbalance in the target variable: Attrition_Flag (Hidden Input)

# Take the slice sizes and the slice labels from the same value_counts() result.
# value_counts() is ordered by frequency while unique() is ordered by first appearance,
# so mixing the two can attach a label to the wrong slice.
target_counts = df["Attrition_Flag"].value_counts()
class_labels = target_counts.index.tolist()

fig = px.pie(values=target_counts.tolist(), names=class_labels, color=class_labels,
             color_discrete_map={0:"#1f2833", 1:"#66fcf1"},
             title='Imbalance in Target Feature')
fig.show()

- The target feature is imbalanced, so we need to correct for this before training the model.

<div class="alert alert-block alert-info">
 <a id="5">
<h1 style="text-align:center;font-weight: bold; color:black;"> Preparing the Data before Model Building</h1>
 </a>
</div>

In [None]:
# Creating the dummy variables for all the categorical features
# A single get_dummies call replaces each column in `cat_cols` with its indicator columns
# (prefixed with the original column name, first level dropped) and appends them after the
# untouched columns — the same result as encoding, concatenating and dropping one column at a time.

df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [None]:
# Splitting the data into train and test

# Select the target and the features without mutating `df`, so this cell can be re-run
# (popping the column removed it from `df` and made a second run fail).
y = df["Attrition_Flag"]
X = df.drop(columns="Attrition_Flag")

# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified although the target is imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=69)

In [None]:
# Min-max scaling of the numerical features.
# The scaler is fitted on the training split only and then applied to both splits.
# NOTE(review): indexing with `req_cols` keeps only these 14 columns, so the dummy columns
# created from the categorical features are not passed on to the model — confirm this is intended.
req_cols = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]

scaler = MinMaxScaler()
scaler.fit(X_train[req_cols])

X_train = scaler.transform(X_train[req_cols])
X_test = scaler.transform(X_test[req_cols])

In [None]:
# Oversample the training split with SMOTE to handle the imbalance in the target variable.
# Only the training data is resampled; the test split is left as it is.

sm = SMOTE(
    sampling_strategy=1.0,
    random_state=69,
)

X_train, y_train = sm.fit_resample(X_train, y_train)

<div class="alert alert-block alert-info">
 <a id="6">
<h1 style="text-align:center;font-weight: bold; color:black;"> Building the Model </h1>
 </a>
</div>

<div class="alert alert-block alert-success">
<p style="color:black; font-size:18px">🎯 For identifying the Customer churn, we are going to use the <strong>Extreme Gradient Boosting technique (XGBoost)</strong><br><br>
🎯 <strong>XGBoost</strong> is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way. The same code runs on major distributed environments (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.
<br><br>
🎯 Read more about the same: <a href="https://xgboost.readthedocs.io/en/latest/">https://xgboost.readthedocs.io/en/latest/</a>
</p>
</div>

<img src="https://cdn.educba.com/academy/wp-content/uploads/2019/06/XGBoost-Algorithm1.jpg" alt="XGBoost algorithm" style="display: block; margin-left: auto; margin-right: auto; width: 60%;">


<div class="alert alert-block alert-success">
<p style="color:black; font-size:18px">🎯 In the near future, I will develop this notebook further to add other good classification Models.
</p>
</div>

In [None]:
# XGBoost classifier trained on the resampled training data.
# Apart from the seed and the use of all CPU cores, the library defaults are kept.

xgb_model = xgb.XGBClassifier(
    n_jobs=-1,
    use_label_encoder=False,
    random_state=69,
)

xgb_model.fit(X_train, y_train)

<div class="alert alert-block alert-info">
 <a id="7">
<h1 style="text-align:center;font-weight: bold; color:black;"> Evaluating the model </h1>
 </a>
</div>

<div class="alert alert-block alert-success">
<p style="color:black; font-size:18px">🎯 For Evaluating our Model, we are going to make use of the ROC Curve and Confusion Matrix
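    <br><br>
⭐ Read More about ROC Curve here: <a href="https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/">https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/</a>
    <br><br>
⭐ Read More about Confusion Matrix here: <a href="https://machinelearningmastery.com/confusion-matrix-machine-learning/">https://machinelearningmastery.com/confusion-matrix-machine-learning/</a>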
</p>
</div>

In [None]:
# XGBoost Score on the test data (for a classifier, `score` reports the mean accuracy)

xgb_model.score(X_test, y_test)

In [None]:
# Classification report for the test split: per-class precision, recall and F1 (Hidden Input)

test_predictions = xgb_model.predict(X_test)
report = classification_report(y_test, test_predictions)
print(report)

<div class="alert alert-block alert-info">
 <a id="10">
<h2 style="text-align:center; color:black;"> Confusion Matrix </h2>
 </a>
</div>

<div class="alert alert-block alert-success">
<p style="color:black; font-size:18px">🎯 <strong>Confusion Matrix</strong> is a tool to determine the performance of a classifier. It contains information about the actual and predicted classifications. The table below shows the confusion matrix of a two-class classifier that separates churned customers from non-churned customers.
    <br><br>
🎯 <strong>True Positive (TP)</strong> is the number of correct predictions that an example is positive, which means the positive class is correctly identified as positive.
Example: The actual class is churned and the classifier has correctly predicted it as churned.
    <br><br>
🎯 <strong>False Negative (FN)</strong> is the number of incorrect predictions that an example is negative, which means the positive class is incorrectly identified as negative.
Example: The actual class is churned, but the classifier has incorrectly predicted it as non-churned.
    <br><br>
🎯 <strong>False Positive (FP)</strong> is the number of incorrect predictions that an example is positive, which means the negative class is incorrectly identified as positive.
Example: The actual class is non-churned, but the classifier has incorrectly predicted it as churned.
    <br><br>
🎯 <strong>True Negative (TN)</strong> is the number of correct predictions that an example is negative, which means the negative class is correctly identified as negative.
Example: The actual class is non-churned and the classifier has correctly predicted it as non-churned.
</p>
</div>

<img src="https://2.bp.blogspot.com/-EvSXDotTOwc/XMfeOGZ-CVI/AAAAAAAAEiE/oePFfvhfOQM11dgRn9FkPxlegCXbgOF4QCLcBGAs/s1600/confusionMatrxiUpdated.jpg" alt="Confusion matrix explained in one picture" style="display: block; margin-left: auto; margin-right: auto; width: 80%;">

In [None]:
# Creating the Confusion Matrix (Hidden Input)

cfm = confusion_matrix(y_true=y_test, y_pred=xgb_model.predict(X_test))

# scikit-learn lays the matrix out with rows = actual class and columns = predicted class,
# in sorted label order (0 = existing customer, 1 = churned customer). Churn (1) is the
# positive class, so the flattened matrix reads TN, FP, FN, TP — reading it as
# TP, FN, FP, TN would report precision and recall for the non-churned class instead.
TN, FP, FN, TP = cfm.ravel()


fig, ax = plt.subplots(figsize=(16, 8))
plt.title("Confusion Matrix and Corresponding Accuracy, Precision and Recall", size=20, pad=20)
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# is its replacement if the environment is upgraded.
plot_confusion_matrix(xgb_model, X_test, y_test, cmap='plasma', ax=ax)
plt.show()

# Metrics for the churn class, derived from the four cells above
print("*" * 30)
print("Accuracy :", (TP+TN)/(TP+TN+FP+FN))
print("Precision :", (TP)/(TP+FP))
print("Recall :", (TP)/(TP+FN))
print("*" * 30)

<div class="alert alert-block alert-info">
 <a id="11">
<h2 style="text-align:center; color:black;"> ROC Curve </h2>
 </a>
</div>

<div class="alert alert-block alert-success">
<p style="color:black; font-size:18px">🎯 With a <strong>ROC curve</strong>, you're trying to find a good model that optimizes the trade off between the False Positive Rate (FPR) and True Positive Rate (TPR).  What counts here is how much area is under the curve (Area under the Curve = AuC). The ideal curve in the left image fills in 100%, which means that you're going to be able to distinguish between negative results and positive results 100% of the time (which is almost impossible in real life). The further you go to the right, the worse the detection. The ROC curve to the far right does a worse job than chance, mixing up the negatives and positives (which means you likely have an error in your setup). 
</p>
</div>

<img src=https://storage.ning.com/topology/rest/1.0/file/get/1341805045? alt = "ROC Curve Explained in one picture" style="display: block; margin-left: auto; margin-right: auto; width: 100%;"> </img>

In [None]:
# ROC curves built from the predicted class probabilities on the test split (Hidden Input)

y_true = y_test  # ground truth labels
y_probas = xgb_model.predict_proba(X_test)  # predicted probabilities from the fitted classifier

skplt.metrics.plot_roc(
    y_true,
    y_probas,
    figsize=(16,8),
    title_fontsize=25,
    text_fontsize=16,
    cmap='plasma',
)
plt.show()

<div class="alert alert-block alert-info">
 <a id="8">
<h1 style="text-align:center;font-weight: bold; color:black;"> Finding the Important Features </h1>
 </a>
</div>

In [None]:
# Plotting the important features (Hidden Input)

features_to_plot = 14

importances = xgb_model.feature_importances_
indices = np.argsort(importances)  # ascending order, so the most important features come last

# Names and scores of the top `features_to_plot` features; `req_cols` holds the
# column names in the order the model was trained on.
best_vars = np.array(req_cols)[indices][-features_to_plot:]
values = importances[indices][-features_to_plot:]

# One bar position per selected feature. Sizing this from the selection (not the constant)
# keeps positions and values the same length when fewer features are available.
y_ticks = np.arange(len(best_vars))
fig, ax = plt.subplots(figsize=(16,8))
ax.barh(y_ticks, values, color=['#379683','#5cdb59','#8ee4af','#05386b','#edf5e1'])
# Fix the tick positions first and label them afterwards, so every label is tied to its own bar.
ax.set_yticks(y_ticks)
ax.set_yticklabels(best_vars, size=12)
ax.set_title("XGBClassifer Feature Importances", size=25, pad=20, color='black')
ax.bar_label(ax.containers[0], size="10")
fig.tight_layout()
plt.show()

<div class="alert alert-block alert-info">
 <a id="9">
<h1 style="text-align:center;font-weight: bold; color:black;"> Conclusion </h1>
 </a>
</div>

<img src="https://martech.org/wp-content/uploads/2018/08/customer-retention-ss-1920.jpg" alt="customer retention" style="display: block; margin-left: auto; margin-right: auto; width: 60%;">



<div class="alert alert-success">
<h4>
<li> The Bank can take enough insights from the Exploratory Data Analysis performed to understand which kind of Customers are more likely to leave.</li>
<br></br>
<li> They can use this XGBoost Classifier model to safely predict whether a customer will leave or not.</li>
<br></br>
<li> By looking at the feature importance, the Bank can mitigate further churn by applying the necessary remedial actions.</li>
</h4>
</div>

<div class="alert alert-block alert-info">
 <a id="12">
<h1 style="text-align:center;font-weight: bold; color:black;"> My Other Notebooks </h1>
 </a>
</div>

<div class="alert alert-success">
<p style="font-size:120%;color:black">⭐ Predicting Rider Count PCA + 11 Models Compared (Regression) <a href="https://www.kaggle.com/vivek468/will-the-customer-churn">here</a>.</p>
<p style="font-size:120%;color:black">⭐ Heart Attack Prediction (Classification) <a href="https://www.kaggle.com/vivek468/heartattackprediction-decisiontree">here</a>.</p>
<p style="font-size:120%;color:black">⭐ Boom Bikes (Regression) <a href="https://www.kaggle.com/vivek468/boombikes-lr-r2score83">older notebook</a>.</p>
</div>

<div class="alert alert-block alert-info">
 <a id="13">
<h1 style="text-align:center;font-weight: bold; color:black;"> Credits </h1>
 </a>
</div>

<div class="alert alert-success">
<p style="font-size:120%;color:black">⭐ <a href="https://www.kaggle.com/sakshigoyal7">Sakshi Goyal</a> for this awesome dataset.</p>
<p style="font-size:120%;color:black">⭐ <a href="https://www.kaggle.com/mpwolke">Marilia Prata</a> for constant motivation.</p>
<p style="font-size:120%;color:black">⭐ <a href="https://www.kaggle.com/najeebahmadbhuiyan">Najeeb Ahmed Bhuiyan</a> for his notebooks, which I learned EDA from.</p>
</div>

<h3>If you enjoyed reading my Notebook, please leave your valuable comments behind. And do suggest improvements, if any. 😁👍</h3>