In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('abcd.csv')

In [3]:
df.columns

Index(['Steps_Before', 'Hour', 'Date', 'DOW', 'Is_Weekend', 'Abandon_Ratio',
       'Avg_Sess_Per_Month', 'Is_AB'],
      dtype='object')

In [4]:
df.head(5)

Unnamed: 0,Steps_Before,Hour,Date,DOW,Is_Weekend,Abandon_Ratio,Avg_Sess_Per_Month,Is_AB
0,7,13,23,4,0.0,0.5,0.8,1.0
1,14,7,21,2,0.0,1.0,0.2,1.0
2,24,6,24,5,0.0,0.35,4.6,1.0
3,8,6,24,5,0.0,0.35,4.6,1.0
4,8,6,24,5,0.0,0.35,4.6,1.0


In [5]:
df.shape

(2296, 8)

In [6]:
x = df[['Steps_Before','Hour','Date','DOW','Is_Weekend','Abandon_Ratio','Avg_Sess_Per_Month']]
y = df['Is_AB']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=5)

# Gini vs Entropy

In [64]:
clf_gini = DecisionTreeClassifier(criterion='gini')

In [65]:
clf_entropy = DecisionTreeClassifier(criterion='entropy')

In [66]:
clf_gini.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [67]:
Actual = y_test
Predicted = clf_gini.predict(X_test)

print(metrics.accuracy_score(Actual, Predicted)*100)

71.602787456446


In [68]:
clf_entropy.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [69]:
Predicted = clf_entropy.predict(X_test)

print(metrics.accuracy_score(Actual, Predicted)*100)

75.60975609756098


# Variation with max_depth

In [70]:
Actual = y_test
#clf = DecisionTreeClassifier(criterion='entropy',max_depth=j)

for j in range(1,15):
    clf = DecisionTreeClassifier(criterion='entropy',max_depth=j,random_state=5)
    clf.fit(X_train,y_train)
    Predicted = clf.predict(X_test)
    print(metrics.accuracy_score(Actual, Predicted)*100,j)

82.40418118466899 1
82.40418118466899 2
81.18466898954703 3
83.44947735191639 4
80.8362369337979 5
80.66202090592334 6
79.61672473867596 7
80.48780487804879 8
78.397212543554 9
78.57142857142857 10
78.57142857142857 11
79.44250871080139 12
77.00348432055749 13
77.00348432055749 14


# Variation with Max_leaf_node

In [71]:
for j in range(5,25):
    clf = DecisionTreeClassifier(criterion='entropy',max_depth=4,max_leaf_nodes=j,random_state=5)
    clf.fit(X_train,y_train)
    Predicted = clf.predict(X_test)
    print(metrics.accuracy_score(Actual, Predicted)*100,j)

82.40418118466899 5
81.3588850174216 6
81.3588850174216 7
83.62369337979094 8
83.62369337979094 9
82.40418118466899 10
82.40418118466899 11
82.40418118466899 12
81.53310104529616 13
81.3588850174216 14
81.3588850174216 15
80.8362369337979 16
80.8362369337979 17
80.8362369337979 18
80.8362369337979 19
80.8362369337979 20
80.8362369337979 21
80.8362369337979 22
80.8362369337979 23
80.8362369337979 24


# Optimized Decision Tree (criterion='entropy',max_depth=4,max_leaf_nodes=8)

In [72]:
clf = DecisionTreeClassifier(criterion='entropy',max_depth=4,max_leaf_nodes=8)

clf.fit(X_train,y_train)
Predicted = clf.predict(X_test)
print(metrics.accuracy_score(Actual, Predicted)*100)

83.62369337979094


# Bagging

In [73]:
from sklearn.ensemble import BaggingClassifier

In [74]:
for i in range(5,20):
        bagg = BaggingClassifier(base_estimator=clf,n_estimators=i,random_state=5)
        bagg.fit(X_train,y_train)
        Predicted = bagg.predict(X_test)
        print(metrics.accuracy_score(Actual, Predicted)*100,i)

81.53310104529616 5
81.53310104529616 6
81.01045296167247 7
81.53310104529616 8
81.01045296167247 9
81.53310104529616 10
81.53310104529616 11
81.53310104529616 12
81.01045296167247 13
81.70731707317073 14
81.18466898954703 15
81.53310104529616 16
80.8362369337979 17
81.18466898954703 18
81.3588850174216 19


# Random Forest

In [75]:
from sklearn.ensemble import RandomForestClassifier

In [76]:
Actual = y_test
#clf = RandomForestClassifier(n_jobs=2, random_state=0)
#for i in range(5,15):
clf = RandomForestClassifier(criterion = 'entropy',max_depth=4,max_leaf_nodes=8,random_state=5)
clf.fit(X_train,y_train)
Predicted = clf.predict(X_test)
print(metrics.accuracy_score(Actual, Predicted)*100)

82.22996515679442
