In [413]:
# importing the dependencies
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.model_selection import GridSearchCV,cross_val_score

from sklearn.metrics import roc_auc_score
import pickle

In [414]:
# Loading the dataset
df = pd.read_pickle('../DataFrames/all_data_non_encoded.pkl')

In [415]:
# seperating only the train set
train_set,test_set =  df.iloc[:28322],df.iloc[28322:]

In [416]:
# Making X and y
X = train_set.drop(columns=['label','id'],axis=1)
y = train_set['label']

In [417]:
X.shape, y.shape

((28322, 28), (28322,))

In [418]:
X.head()

Unnamed: 0,gender,s11,s12,s13,s16,s17,s18,s48,s52,s58,...,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15
0,M,Y,N,1,D,D,B,0,1,B,...,0.017176,-9.126056,1.732291,3.698504,4.804517,1.544484,0,0,0.63122,5
1,M,Y,Y,1,D,D,B,1,1,B,...,0.013857,-9.098287,1.505885,6.791357,6.110416,1.712354,0,0,0.392746,3
2,M,Y,Y,1,D,D,B,0,1,B,...,0.013943,-9.234894,1.503828,4.109685,3.953226,1.80426,0,0,0.222537,2
3,F,Y,Y,1,D,D,B,0,1,B,...,0.010387,-9.378025,1.485863,7.265876,4.559419,1.537645,0,0,0.154409,4
4,M,N,Y,1,B,D,D,1,l,B,...,0.016289,-9.261962,1.61921,3.737647,4.052003,1.637831,0,1,0.73756,1


In [419]:
y.head()

0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: label, dtype: float64

Making Train,Test,split

In [420]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.3,random_state=42,shuffle=True
)

### Preprocessing: 
Label Encoding the Categorical Variables

In [421]:
categorical = ['gender', 's11', 's12', 's16', 's17', 's18', 's52', 's58', 's69','s70', 's71']
le = LabelEncoder()
# X_train['gender'] = le.fit_transform(X_train['gender'])
# X_test['gender'] = le.transform(X_test['gender'])
# X_train['s11'] = le.fit_transform(X_train['s11'])
# X_test['s11'] = le.transform(X_test['s11'])
# X_train['s12'] = le.fit_transform(X_train['s12'])
# X_test['s12'] = le.transform(X_test['s12'])
# X_train['s16'] = le.fit_transform(X_train['s16'])
# X_test['s16'] = le.transform(X_test['s16'])
# X_train['s17'] = le.fit_transform(X_train['s17'])
# X_test['s17'] = le.transform(X_test['s17'])
# X_train['s18'] = le.fit_transform(X_train['s18'])
# X_test['s18'] = le.transform(X_test['s18'])
# X_train['s52'] = le.fit_transform(X_train['s52'])
# X_test['s52'] = le.transform(X_test['s52'])
# X_train['s58'] = le.fit_transform(X_train['s58'])
# X_test['s58'] = le.transform(X_test['s58'])
# X_train['s69'] = le.fit_transform(X_train['s69'])
# X_test['s69'] = le.transform(X_test['s69'])
# X_train['s70'] = le.fit_transform(X_train['s70'])
# X_test['s70'] = le.transform(X_test['s70'])
# X_train['s71'] = le.fit_transform(X_train['s71'])
# X_test['s71'] = le.transform(X_test['s71'])
for cols in categorical:
    X_train[cols] = le.fit_transform(X_train[cols])
    X_test[cols] = le.transform(X_test[cols])

In [422]:
X_test.head()

Unnamed: 0,gender,s11,s12,s13,s16,s17,s18,s48,s52,s58,...,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15
17757,1,1,1,1,3,3,1,1,1,1,...,0.006795,-9.071559,1.520636,5.47317,9.376441,1.678646,0,1,0.713513,1
23215,0,1,1,1,3,3,1,1,2,1,...,0.015456,-9.213176,1.516007,4.272529,4.239801,1.53087,0,0,0.295888,2
1122,1,1,1,1,3,3,1,0,1,1,...,0.021261,-9.351339,1.557345,3.95236,4.045705,1.803043,0,0,0.050592,2
9406,1,1,1,1,1,3,1,1,2,1,...,0.006206,-8.971474,1.933557,10.14082,11.226175,1.739224,0,0,0.952721,4
10333,0,1,1,1,3,3,1,1,1,0,...,0.004268,-9.059337,1.484203,8.23503,7.906341,1.542611,0,0,0.485491,5


Gonna use SKF to preserve the class Distribution

In [423]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
Dtree = DecisionTreeClassifier(random_state=42,max_depth=5)

In [424]:
Dtree.fit(X_train,y_train)

In [425]:
cross_val_score(Dtree,X_train,y_train,cv=skf,scoring='roc_auc').mean()

0.8440882660750171

In [426]:
# accuracy on the test set
Dtree_pred = Dtree.predict(X_test)
roc_auc_score(y_test,Dtree_pred)

0.6867969979450651