In [None]:
# Build a decision tree classifier for the bank customer churn prediction dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw churn dataset; the relative path means the CSV is looked up in
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Bank Customer Churn Prediction.csv')

In [11]:
# List each column's dtype to spot the non-numeric (object) columns.
df.dtypes

customer_id           int64
credit_score          int64
country              object
gender               object
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
dtype: object

In [13]:
# Input columns fed to the model. The order matters: it defines the column
# order of the feature matrix and therefore of any per-feature output.
features = [
    'credit_score',
    'country',
    'gender',
    'age',
    'balance',
    'estimated_salary',
    'tenure',
]
# Name of the label column to predict.
target = 'churn'

In [17]:
# Encode the categorical feature columns as integer codes (in place on df).
# Each column gets its own LabelEncoder, kept in `encoders`, so the fitted
# category -> code mapping of every column stays available afterwards (e.g.
# for inverse_transform). A single shared encoder would only remember the
# last column it was fitted on.
encoders = {}
for col in features:
    # Treat every non-numeric column as categorical; this covers object
    # columns as well as pandas' dedicated string dtypes. Already-encoded
    # (integer) columns are skipped, so re-running the cell is harmless.
    if not pd.api.types.is_numeric_dtype(df[col]):
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
df

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,0,0,42,2,0.00,1,1,1,101348.88,1
1,15647311,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,0,0,42,8,159660.80,3,1,0,113931.57,1
3,15701354,699,0,0,39,1,0.00,2,0,0,93826.63,0
4,15737888,850,2,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,0,1,39,5,0.00,2,1,0,96270.64,0
9996,15569892,516,0,1,35,10,57369.61,1,1,1,101699.77,0
9997,15584532,709,0,0,36,7,0.00,1,0,1,42085.58,1
9998,15682355,772,1,1,42,3,75075.31,2,1,0,92888.52,1


In [23]:
# Scaling the data: standardise the continuous columns in place.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed further down — confirm this is intended.
numeric_cols = ['age','estimated_salary','balance','credit_score']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [25]:
# Display the frame to inspect its current column values.
df

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,-0.326221,0,0,0.293517,2,-1.225848,1,1,1,0.021886,1
1,15647311,-0.440036,2,0,0.198164,1,0.117350,1,0,1,0.216534,0
2,15619304,-1.536794,0,0,0.293517,8,1.333053,3,1,0,0.240687,1
3,15701354,0.501521,0,0,0.007457,1,-1.225848,2,0,0,-0.108918,0
4,15737888,2.063884,2,0,0.388871,2,0.785728,1,1,1,-0.365276,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,1.246488,0,1,0.007457,5,-1.225848,2,1,0,-0.066419,0
9996,15569892,-1.391939,0,1,-0.373958,10,-0.306379,1,1,1,0.027988,0
9997,15584532,0.604988,0,0,-0.278604,7,-1.225848,1,0,1,-1.008643,1
9998,15682355,1.256835,1,1,0.293517,3,-0.022608,2,1,0,-0.125231,1


In [27]:
# Separate the frame into the feature matrix (X) and the label vector (y).
X, y = df[features], df[target]

In [29]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [31]:
# Fit a decision tree on the training split.
# random_state is fixed so the fitted tree (and every metric derived from it)
# is identical on each Restart & Run All; without a seed the estimator's
# internal randomness can produce a different tree on every run.
# NOTE(review): the tree is grown with default (unconstrained) settings; the
# recorded accuracy (0.7285) is below the majority-class share of the test
# set (1585 / 2000), so limiting max_depth / min_samples_leaf may be worth
# trying.
model = DecisionTreeClassifier(random_state=1).fit(X_train, Y_train)

In [33]:
# Predict the churn label for every row of the held-out test set.
y_pred = model.predict(X=X_test)

In [35]:
# Show the predicted labels.
y_pred

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [37]:
from sklearn.metrics  import confusion_matrix
from sklearn import metrics

In [39]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=y_pred)
conf_matrix

array([[1294,  291],
       [ 252,  163]], dtype=int64)

In [41]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=y_pred)

In [43]:
# Display the accuracy value.
accuracy

0.7285

In [47]:
# Recall for the positive class (churn == 1) on the test set.
recall = metrics.recall_score(y_true=Y_test, y_pred=y_pred)
recall

0.3927710843373494

In [49]:
# Print various classification metrics for the test-set predictions.
# Multi-line values (the confusion matrix and the classification report) are
# printed on their own line via sep='\n'; printing them right after the label
# shifts their first row and breaks the column alignment.
print('Confusion Matrix :', confusion_matrix(Y_test, y_pred), sep='\n')
print('Accuracy: ', metrics.accuracy_score(Y_test, y_pred))
print('Precision Score: ', metrics.precision_score(Y_test, y_pred))
print('Recall  Score: ', metrics.recall_score(Y_test, y_pred))
print('F1 Score: ', metrics.f1_score(Y_test, y_pred))
print('Classification Report:', metrics.classification_report(Y_test, y_pred), sep='\n')

Confusion Matrix :  [[1294  291]
 [ 252  163]]
Accuracy:  0.7285
Precision Score:  0.3590308370044053
Recall  Score:  0.3927710843373494
F1 Score:  0.3751438434982739
Classification Report:                precision    recall  f1-score   support

           0       0.84      0.82      0.83      1585
           1       0.36      0.39      0.38       415

    accuracy                           0.73      2000
   macro avg       0.60      0.60      0.60      2000
weighted avg       0.74      0.73      0.73      2000



In [51]:
# Report how much each input column contributed to the fitted tree.
print('Feature Importance')
# Pair every feature name with its importance score; the dict keeps the
# order of `features`.
importance_by_feature = dict(zip(features, model.feature_importances_))
for name, score in importance_by_feature.items():
    print(f'{name} :{score:.2f} ')

Feature Importance
credit_score :0.21 
country :0.03 
gender :0.02 
age :0.23 
balance :0.18 
estimated_salary :0.23 
tenure :0.10 


In [55]:
# Save the fitted tree as an image.
# Three options exist; this cell implements Option 1:
# Option 1 - create a DOT file, then convert it manually to PNG format.
# Write the tree as a Graphviz DOT file; feature_names labels the nodes with the column names in `features`.
with open('customer_churn.dot' , 'w') as f:
    export_graphviz(model , out_file = f, feature_names=features, filled = True)
# To convert, run on the command line: dot -Tpng customer_churn.dot -o customer_churn.png
