## Classification Tree on Unbalanced Dataset

• **MyBank** executed a campaign to cross sell Personal Loans to their existing customers.

• **Campaign offer** : Loan at attractive interest rate of only 11%. Almost 1% lower than the prevailing
market rate. No processing fee charges.

• A **Pilot campaign** targeting 20,000 customers was rolled out

• **1,733 out of 20,000** (i.e. **8.67%**) customers responded to the campaign offer

• Data is given in 2 files: **Dev_Sample.csv**(14,000 records) & **Holdout_Sample.csv** (6,000 records)

        • Dev_Sample Target Rate = 1235 / 14000 = 8.8%
        
        • Hold out sample Target Rate = 498 / 14000 = 8.3%
        
• **Modeling Objective**: Build Classification Model to identify profile of Campaign Responders vs.
Non Responders

In [57]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd


#Load the Dataset
CTDF_dev = pd.read_csv("DEV_SAMPLE.csv")
CTDF_holdout = pd.read_csv("HOLDOUT_SAMPLE.csv")

print( len(CTDF_dev),  len(CTDF_holdout))
CTDF_dev.head()


14000 6000


Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31


In [58]:
#Data Preprocessing
#Splitting into features and response variables
X =  CTDF_dev[['Age', 'Gender', 'Balance', 'Occupation',
               'No_OF_CR_TXNS', 'AGE_BKT', 'SCR', 'Holding_Period']]


In [59]:
#Categorical Variable to Numerical Variables
X_train = pd.get_dummies(X)
X_train.columns

Index(['Age', 'Balance', 'No_OF_CR_TXNS', 'SCR', 'Holding_Period', 'Gender_F',
       'Gender_M', 'Gender_O', 'Occupation_PROF', 'Occupation_SAL',
       'Occupation_SELF-EMP', 'Occupation_SENP', 'AGE_BKT_26-30',
       'AGE_BKT_31-35', 'AGE_BKT_36-40', 'AGE_BKT_41-45', 'AGE_BKT_46-50',
       'AGE_BKT_<25', 'AGE_BKT_>50'],
      dtype='object')

In [60]:
y_train = CTDF_dev["Target"]

In [61]:
print (type(X_train) , type(y_train))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [62]:
#Decision Tree
#Loading the library
from sklearn.tree import DecisionTreeClassifier

#Setting the parameter
clf = DecisionTreeClassifier(criterion = "gini" , 
                             min_samples_split = 100,
                             min_samples_leaf = 10,
                             max_depth = 50)

#Calling the fit function to built the tree
clf.fit(X_train,y_train)

In [93]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from io import StringIO
import pydot

dot_data = StringIO()
feature_list = list(X_train.columns.values)
export_graphviz(clf, 
                out_file = dot_data, 
                feature_names = feature_list,
               filled=True)
graph=pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("classification_tree_output.pdf")

In [80]:
Nodes = pd.DataFrame(clf.tree_.__getstate__()["nodes"])
Nodes


Unnamed: 0,left_child,right_child,feature,threshold,impurity,n_node_samples,weighted_n_node_samples
0,1,194,4,10.5,0.160865,14000,14000.0
1,2,159,10,0.5,0.274723,4818,4818.0
2,3,112,2,20.5,0.237183,3971,3971.0
3,4,13,1,10853.5,0.198811,2832,2832.0
4,5,6,3,368.0,0.327605,281,281.0
...,...,...,...,...,...,...,...
456,-1,-1,-2,-2.0,0.423440,23,23.0
457,-1,-1,-2,-2.0,0.293850,95,95.0
458,459,460,3,304.0,0.435425,128,128.0
459,-1,-1,-2,-2.0,0.218750,32,32.0


In [89]:
# Create the DataFrame with feature names and corresponding importances
feature_importance = pd.DataFrame([X_train.columns,
                               clf.tree_.compute_feature_importances()]).T

# Set column names for better clarity
feature_importance.columns = ['Feature', 'Importance']

# Sort the DataFrame by importance in descending order
feature_importance_sorted = feature_importance.sort_values(by='Importance', ascending=False)

# Display the sorted DataFrame
print(feature_importance_sorted)

                Feature Importance
4        Holding_Period   0.228603
3                   SCR    0.16924
1               Balance   0.139184
2         No_OF_CR_TXNS   0.127342
11      Occupation_SENP   0.094942
10  Occupation_SELF-EMP   0.080528
0                   Age   0.055788
9        Occupation_SAL   0.027169
5              Gender_F   0.026587
15        AGE_BKT_41-45    0.01459
14        AGE_BKT_36-40   0.012152
6              Gender_M   0.010949
13        AGE_BKT_31-35   0.006404
16        AGE_BKT_46-50   0.004963
7              Gender_O   0.000933
12        AGE_BKT_26-30   0.000627
8       Occupation_PROF        0.0
17          AGE_BKT_<25        0.0
18          AGE_BKT_>50        0.0


In [66]:
## Let us see how good is the model
pred_y_train = clf.predict(X_train )
pred_y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [67]:
## Let us see the classification accuracy of our model
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
score = accuracy_score(y_train, pred_y_train)
score

0.9170714285714285

In [68]:
y_train_prob = clf.predict_proba(X_train)

## AUC
fpr, tpr, thresholds = roc_curve(y_train, y_train_prob[:,1])
auc(fpr, tpr)

0.8786528827718758

In [69]:
## Let us see how good is the model
X_holdout =  CTDF_holdout[['Age', 'Gender', 'Balance', 'Occupation',
               'No_OF_CR_TXNS', 'AGE_BKT', 'SCR', 'Holding_Period']]
X_test = pd.get_dummies(X_holdout)
y_test = CTDF_holdout["Target"]

In [70]:
pred_y_test = clf.predict(X_test)
score_h = accuracy_score(y_test, pred_y_test)
score_h

0.9188333333333333

In [71]:
y_test_prob = clf.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
auc(fpr, tpr)

0.7292934004283218

In [72]:
#Cross validation function
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train , y_train, cv = 10, scoring='roc_auc')
print(scores.mean())
print(scores.std())

0.7344294604101136
0.02991924881596745


In [73]:
y_train_prob = clf.predict_proba(X_train)
fpr, tpr, thresholds = roc_curve(y_train, y_train_prob[:,1])
auc(fpr, tpr)

0.8786528827718758

In [74]:
y_test_prob = clf.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
auc(fpr, tpr)


0.7292934004283218

In [76]:
## Tuning the Classifier using GridSearchCV
from sklearn.model_selection import GridSearchCV
#help(GridSearchCV)

param_dist = {"criterion": ["gini","entropy"],
              "max_depth": np.arange(3,10),
              }

tree = DecisionTreeClassifier(min_samples_split = 100,
                             min_samples_leaf = 10)

tree_cv  = GridSearchCV(tree, param_dist, cv = 10, 
                        scoring = 'roc_auc', verbose = 100)

tree_cv.fit(X_train,y_train)

Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV 1/10; 1/14] START criterion=gini, max_depth=3...............................
[CV 1/10; 1/14] END criterion=gini, max_depth=3;, score=0.737 total time=   0.0s
[CV 2/10; 1/14] START criterion=gini, max_depth=3...............................
[CV 2/10; 1/14] END criterion=gini, max_depth=3;, score=0.726 total time=   0.0s
[CV 3/10; 1/14] START criterion=gini, max_depth=3...............................
[CV 3/10; 1/14] END criterion=gini, max_depth=3;, score=0.712 total time=   0.0s
[CV 4/10; 1/14] START criterion=gini, max_depth=3...............................
[CV 4/10; 1/14] END criterion=gini, max_depth=3;, score=0.749 total time=   0.0s
[CV 5/10; 1/14] START criterion=gini, max_depth=3...............................
[CV 5/10; 1/14] END criterion=gini, max_depth=3;, score=0.776 total time=   0.0s
[CV 6/10; 1/14] START criterion=gini, max_depth=3...............................
[CV 6/10; 1/14] END criterion=gini, max_depth=

[CV 3/10; 6/14] END criterion=gini, max_depth=8;, score=0.729 total time=   0.0s
[CV 4/10; 6/14] START criterion=gini, max_depth=8...............................
[CV 4/10; 6/14] END criterion=gini, max_depth=8;, score=0.775 total time=   0.0s
[CV 5/10; 6/14] START criterion=gini, max_depth=8...............................
[CV 5/10; 6/14] END criterion=gini, max_depth=8;, score=0.800 total time=   0.0s
[CV 6/10; 6/14] START criterion=gini, max_depth=8...............................
[CV 6/10; 6/14] END criterion=gini, max_depth=8;, score=0.754 total time=   0.0s
[CV 7/10; 6/14] START criterion=gini, max_depth=8...............................
[CV 7/10; 6/14] END criterion=gini, max_depth=8;, score=0.726 total time=   0.0s
[CV 8/10; 6/14] START criterion=gini, max_depth=8...............................
[CV 8/10; 6/14] END criterion=gini, max_depth=8;, score=0.765 total time=   0.0s
[CV 9/10; 6/14] START criterion=gini, max_depth=8...............................
[CV 9/10; 6/14] END criterio

[CV 5/10; 11/14] END criterion=entropy, max_depth=6;, score=0.791 total time=   0.0s
[CV 6/10; 11/14] START criterion=entropy, max_depth=6...........................
[CV 6/10; 11/14] END criterion=entropy, max_depth=6;, score=0.771 total time=   0.0s
[CV 7/10; 11/14] START criterion=entropy, max_depth=6...........................
[CV 7/10; 11/14] END criterion=entropy, max_depth=6;, score=0.746 total time=   0.0s
[CV 8/10; 11/14] START criterion=entropy, max_depth=6...........................
[CV 8/10; 11/14] END criterion=entropy, max_depth=6;, score=0.771 total time=   0.0s
[CV 9/10; 11/14] START criterion=entropy, max_depth=6...........................
[CV 9/10; 11/14] END criterion=entropy, max_depth=6;, score=0.785 total time=   0.0s
[CV 10/10; 11/14] START criterion=entropy, max_depth=6..........................
[CV 10/10; 11/14] END criterion=entropy, max_depth=6;, score=0.783 total time=   0.0s
[CV 1/10; 12/14] START criterion=entropy, max_depth=7...........................
[CV

In [77]:
## Building the model using best combination of parameters
print("Tuned Decision Tree parameter : {}".format(tree_cv.best_params_))

classifier = tree_cv.best_estimator_

classifier.fit(X_train,y_train)


Tuned Decision Tree parameter : {'criterion': 'gini', 'max_depth': 6}


In [78]:
#predicting probabilities
y_train_prob = classifier.predict_proba(X_train)
fpr, tpr, thresholds = roc_curve(y_train, y_train_prob[:,1])
auc_d = auc(fpr, tpr)
auc_d

0.8009914825933133

In [79]:
y_test_prob = classifier.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
auc_h = auc(fpr, tpr)
auc_d

0.8009914825933133