# Churn use case of a banking company

In [42]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [30]:
# Load the data and plot the top rows of the data set
Data = pd.read_csv("bankingchurn_data_modif.csv", sep = ";", decimal=',')
Data.head()

Unnamed: 0,contract_start,contract_end,date_of_birth,gender,profession,size_household,ZIP,segment,credit_rating,no_credit_rating_flag,...,online_number_of_logins_per_month,online_tranactions_per_month,advisor_contacts_last12months,customer_limit,cash_withdraws_per_month,cash_withdrawals_value,consumer_credit_value,consumer_credit_maturity,account_fee,number_of_refusals
0,2010-10-24,,1991-02-11,M,Professional Services,1,76829,S2,89.568518,0,...,0,0,3,1000.0,4,459.84094,-24011.393552,205,15,6
1,2010-04-30,2017-10-20,1986-04-06,W,,1,35104,S2,94.270507,0,...,16,8,2,1000.0,7,476.334459,-5368.743068,137,15,6
2,2013-05-02,,1993-04-18,M,Transportation,1,63450,S2,93.030947,0,...,26,12,2,1000.0,3,332.640202,-9678.713419,624,15,4
3,2016-12-24,,1994-07-29,W,Professional Services,2,9573,S3,,1,...,0,0,4,1050.0,4,378.324744,-12110.305916,1263,10,7
4,2011-10-01,,1990-10-28,W,Research,1,66916,S2,90.457664,0,...,23,18,3,1000.0,4,471.509299,0.0,521,10,3


In [18]:
# Display the data types of each column
Data.dtypes

contract_start                        object
contract_end                          object
date_of_birth                         object
gender                                object
profession                            object
size_household                         int64
ZIP                                   object
segment                               object
credit_rating                        float64
no_credit_rating_flag                  int64
main_account_flag                      int64
online_banking_flag                    int64
tele_banking_flag                      int64
creditcard_flag                        int64
insurance_life_flag                    int64
insurance_house_flag                   int64
insurance_car_flag                     int64
insurance_other_flag                   int64
mortgage_flag                          int64
portfolio_flag                         int64
last_balance                         float64
last_balance_minus_6_months          float64
last_balan

In [31]:
# Add a new column "Target"
# This column will be used later on to feed the model
Data['Target'] = 0

In [32]:
# Check the new column added to the data set
Data.head().T

Unnamed: 0,0,1,2,3,4
contract_start,2010-10-24,2010-04-30,2013-05-02,2016-12-24,2011-10-01
contract_end,,2017-10-20,,,
date_of_birth,1991-02-11,1986-04-06,1993-04-18,1994-07-29,1990-10-28
gender,M,W,M,W,W
profession,Professional Services,,Transportation,Professional Services,Research
size_household,1,1,1,2,1
ZIP,76829,35104,63450,9573,66916
segment,S2,S2,S2,S3,S2
credit_rating,89.5685,94.2705,93.0309,,90.4577
no_credit_rating_flag,0,0,0,1,0


In [33]:
# Create 0 where 'contract_end' column values is Nan and 1 for the rest of the cases
Data['Target'] = np.where(Data['contract_end'].isna(), 0, 1) 
# Set 'Target' column as the target for the machine learning model
y = Data.Target
# Analyse the dataset and chose the relevant columns to generate the X variable
var_select = ["size_household", "income_deposits_per_year", "main_account_flag", "insurance_house_premium_per_year", "income_securities_per_year", "cash_withdraws_per_month"]
X = Data[var_select]

In [34]:
# Check number of churns present already in the dataset
Data.groupby('Target').size()

Target
0    95129
1     4871
dtype: int64

In [38]:
Data.head().T

Unnamed: 0,0,1,2,3,4
contract_start,2010-10-24,2010-04-30,2013-05-02,2016-12-24,2011-10-01
contract_end,,2017-10-20,,,
date_of_birth,1991-02-11,1986-04-06,1993-04-18,1994-07-29,1990-10-28
gender,M,W,M,W,W
profession,Professional Services,,Transportation,Professional Services,Research
size_household,1,1,1,2,1
ZIP,76829,35104,63450,9573,66916
segment,S2,S2,S2,S3,S2
credit_rating,895685176247469,942705069107966,930309465957283,,9045766439571
no_credit_rating_flag,0,0,0,1,0


In [38]:
# Split the dataset among training and testing groups
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [43]:
# Import the model. Logistic Regression is widely used
# when the target variable is binary
model = LogisticRegression()
result = model.fit(X_train, y_train)
prediction = model.predict(X_test)

# Measure the acurracy of the prediction by comparing 
# with the actual values obtained from the 'Target' column
print(metrics.accuracy_score(y_test, prediction))



0.9513


In [45]:
# Get the weights of the variables used in the logistic 
# regression model, to segment clients and apply 
# specific measures to decrease the likelihood of
# clients churn

w = pd.Series(model.coef_[0], index=X.columns.values)
w.sort_values(ascending=True)

size_household                     -8.908995e-01
insurance_house_premium_per_year   -4.396566e-03
income_deposits_per_year            8.442588e-07
income_securities_per_year          1.357271e-05
cash_withdraws_per_month            3.386978e-01
main_account_flag                   1.874101e+00
dtype: float64