In [14]:
# Let's import the dependencies first
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.model_selection import GridSearchCV,cross_val_score

pd.set_option("display.precision",2)
pd.set_option("display.max_columns",None)

from sklearn.metrics import roc_auc_score
import pickle

In [15]:
# Load the training set from a pickled DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle('./pickled/Train Set All Numeric')

In [16]:
# Preview the first ten rows of the loaded frame.
df.head(10)

Unnamed: 0,index,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Together,Marital Status_Widow
0,1998,5370,1973,4.0,32644.0,1,0,3496,38,239,3,141,0,7,23,4,7,1,6,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
1,814,6303,1986,4.0,91820.0,0,0,3185,72,410,73,747,76,161,30,0,5,5,12,1,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
2,1513,9264,1986,1.0,79529.0,0,0,3030,1,423,42,706,73,197,197,1,4,8,9,2,0,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
3,1381,7514,1956,3.0,54342.0,1,1,3446,74,84,10,34,11,10,28,4,3,1,4,6,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
4,919,4637,1954,4.0,74637.0,0,0,3374,73,960,64,464,146,0,16,1,6,9,9,3,0,0,0,1,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0
5,902,6445,1967,1.0,66825.0,0,0,3317,73,243,101,405,29,40,40,1,4,5,6,2,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
6,1205,1118,1956,2.0,50965.0,0,1,3461,87,544,13,85,8,6,29,3,10,4,5,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
7,173,1880,1959,4.0,53537.0,1,1,3117,17,81,0,6,0,0,6,2,2,1,3,5,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
8,1557,4037,1976,1.0,31859.0,1,0,3347,77,3,1,3,8,0,5,1,1,0,2,7,0,0,0,0,0,0,3,11,0,1.0,0.0,0.0,0.0,0.0
9,1216,232,1965,1.0,61559.0,0,1,3314,8,279,83,88,32,14,34,1,4,2,10,3,0,0,0,0,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0


Let's build the target column, then separate the train and test (validation) sets

In [17]:
# Count how many of the six campaign/response flags are set for each row.
# The built-in sum() adds the columns left to right, in the listed order.
df['Target'] = sum(
    df[flag_col]
    for flag_col in [
        'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
        'AcceptedCmp4', 'AcceptedCmp5', 'Response',
    ]
)

In [18]:
# Collapse the flag count into a binary label: 1 if at least one flag is set,
# 0 otherwise.
# Assigning the result back (instead of calling an inplace method on the
# selected column) keeps this working under pandas Copy-on-Write, where
# `df['Target'].replace(..., inplace=True)` would not update `df`.
# The cell is also idempotent: re-running it leaves the 0/1 column unchanged.
df['Target'] = (df['Target'] > 0).astype('int64')

In [19]:
# Class balance of the binary target.
df.Target.value_counts()

0    1302
1     468
Name: Target, dtype: int64

Let's define our X and y

In [20]:
# List every column so the feature/label selection below can be checked.
df.columns

Index(['index', 'ID', 'Year_Birth', 'Education', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Marital Status_Divorced', 'Marital Status_Married',
       'Marital Status_Single', 'Marital Status_Together',
       'Marital Status_Widow', 'Target'],
      dtype='object')

In [21]:
# Label vector: the binary Target column.
y = df['Target']

# Feature matrix: everything except
#   - 'index' and 'ID' (presumably row identifiers - verify), and
#   - the campaign/response flags that Target is computed from, plus Target
#     itself, so the label cannot leak into the features.
X = df.drop(
    [
        'index', 'ID',
        'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
        'AcceptedCmp4', 'AcceptedCmp5', 'Response',
        'Target',
    ],
    axis=1,
)

In [22]:
# Sanity check: X and y must have the same number of rows.
X.shape,y.shape

((1770, 26), (1770,))

Let's make the train/test (validation) split

In [23]:
# Hold out 25% of the rows; random_state pins the shuffle so the split is
# repeatable.
# Naming note: X_test and y_valid are the features and labels of the SAME
# held-out split.
# NOTE(review): the split is not stratified (no `stratify=y`) even though the
# classes are imbalanced - confirm this is intended.
X_train, X_test, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [24]:
# Shape of the training features and of the held-out labels
# (y_valid pairs with X_test, not with X_train).
X_train.shape , y_valid.shape

((1327, 26), (443,))

I'm going to use StratifiedKFold (SKF) to preserve the class distribution during cross-validation

In [25]:
# 5-fold stratified splitter; rows are shuffled before splitting and the seed
# is fixed so the folds are the same on every run.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

Initialize the Decision Tree

In [26]:
# Baseline decision tree: no hyperparameters set apart from a fixed seed.
Dtree = DecisionTreeClassifier(random_state=42)

In [27]:
# Fit the baseline tree on the training split only.
Dtree.fit(X_train,y_train)

In [28]:
# Mean ROC AUC across the stratified folds. cross_val_score fits fresh clones
# of Dtree, so the model fitted on X_train is neither reused nor modified here.
# NOTE(review): this cross-validates on the full X, y, which includes the rows
# held out as X_test / y_valid - consider X_train, y_train if the hold-out is
# meant to stay unseen.
cross_val_score(Dtree,X,y,cv=skf,scoring='roc_auc').mean()

0.6618210809286591

In [29]:
# ROC AUC on the held-out (validation) split.
# Hard class predictions are kept under their original name in case a later
# cell uses them.
Dtree_pred = Dtree.predict(X_test)
# roc_auc_score ranks samples by a continuous score, so feed it the predicted
# probability of the positive class rather than 0/1 labels; this matches what
# scoring='roc_auc' measures in the cross-validation cell.
# classes_ is sorted, so column 1 is the probability of label 1.
Dtree_proba = Dtree.predict_proba(X_test)[:, 1]
roc_auc_score(y_valid, Dtree_proba)

0.6454302343872896