In [1]:
# import the packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [3]:
# Upload the dataset
dataset = pd.read_csv('prediction_de_fraud.csv')

In [4]:
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [6]:
dataset.isnull

<bound method DataFrame.isnull of          step      type      amount     nameOrig  oldbalanceOrg  \
0           1   PAYMENT     9839.64  C1231006815      170136.00   
1           1   PAYMENT     1864.28  C1666544295       21249.00   
2           1  TRANSFER      181.00  C1305486145         181.00   
3           1  CASH_OUT      181.00   C840083671         181.00   
4           1   PAYMENT    11668.14  C2048537720       41554.00   
...       ...       ...         ...          ...            ...   
1421218   743  CASH_OUT   339682.13   C786484425      339682.13   
1421219   743  TRANSFER  6311409.28  C1529008245     6311409.28   
1421220   743  CASH_OUT  6311409.28  C1162922333     6311409.28   
1421221   743  TRANSFER   850002.52  C1685995037      850002.52   
1421222   743  CASH_OUT   850002.52  C1280323807      850002.52   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
0             160296.36  M1979787155            0.00            0.00        0   

In [7]:
# Retrieve prediction data
X = dataset.drop('isFraud', axis=1).values
# Prediction data
target = dataset['isFraud'].values

In [8]:
# Change categorical variables to numeric variable
labEnc_x = LabelEncoder()

In [9]:
X[:,1] = labEnc_x.fit_transform(X[:,1])
X[:,3] = labEnc_x.fit_transform(X[:,3])
X[:,6] = labEnc_x.fit_transform(X[:,6])

In [10]:
# Separate the data sets into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3,random_state=42, stratify = target)

In [11]:
# Initialize our classifier object
dec_tree = DecisionTreeClassifier(criterion = 'gini', random_state=50)

In [12]:
# adjust the model to the training data.
dec_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=50, splitter='best')

In [13]:
# get score
dec_tree.score(X_test, y_test)

0.9992869992283643

In [14]:
# Assessed accuracy on test data
y_pred = dec_tree.predict(X_test)

In [15]:
pd.DataFrame(y_pred, y_test).head(10)

Unnamed: 0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0


In [16]:
# Evaluation with the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[425153    130]
 [   174    910]]


In [17]:
# Control the hyperparameter
'''
The parameter max_depth is a threshold on the maximum depth of the tree parameter allows to control the over-learning. 
The min_samples_leaf parameter gives the minimum number of samples in a leaf node. 
They make it possible to put constraints on the construction of the tree and therefore 
to indirectly control the phenomenon of over-learning.
'''
grid_params = {
    'max_depth': [1, 2, 3, 4, 5, 6],
    'min_samples_leaf': [0.02, 0.04, 0.06, 0.08]
}

In [18]:
# Grid for finding objects
grid_object = GridSearchCV(estimator = dec_tree, param_grid = grid_params, scoring='accuracy', cv=10)

In [19]:
# Fit grid objects to data
grid_object.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=50,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6],
                     

In [20]:
grid_object.best_params_

{'max_depth': 1, 'min_samples_leaf': 0.02}