In [0]:
# Import of libraries.
# Pandas offers in particular data structures and operations for manipulating digital tables and time series
import pandas as pd
import numpy as np

In [0]:
# Import our dataset
data = pd.read_csv('prediction_de_fraud_2.csv')

In [0]:
# Let's see what our data looks like
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,0
4,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0


In [0]:
print(data.columns)

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')


In [0]:
data.shape

(295, 11)

In [0]:
data.isnull

<bound method DataFrame.isnull of      step      type      amount  ... newbalanceDest  isFraud  isFlaggedFraud
0       1   PAYMENT     9839.64  ...           0.00        0               0
1       1   PAYMENT     1864.28  ...           0.00        0               0
2       1  TRANSFER     2806.00  ...           0.00        1               0
3       1  CASH_OUT     2806.00  ...           0.00        1               0
4       1  TRANSFER      181.00  ...           0.00        1               0
..    ...       ...         ...  ...            ...      ...             ...
290   743  CASH_OUT   339682.13  ...      339682.13        1               0
291   743  TRANSFER  6311409.28  ...           0.00        1               0
292   743  CASH_OUT  6311409.28  ...     6379898.11        1               0
293   743  TRANSFER   850002.52  ...           0.00        1               0
294   743  CASH_OUT   850002.52  ...     7360101.63        1               0

[295 rows x 11 columns]>

In [0]:
# Creation of predictive data and data to predict. 
# characteristics => predictive data
# to_predict => data to predict

# axis=1 means that we want to drop the column with the name 'isFraud'
# The values are simply a familiar NumPy array
characteristics = data.drop('isFraud', axis=1).values
to_predict = data['isFraud'].values

In [0]:
characteristics

array([[1, 'PAYMENT', 9839.64, ..., 0.0, 0.0, 0],
       [1, 'PAYMENT', 1864.28, ..., 0.0, 0.0, 0],
       [1, 'TRANSFER', 2806.0, ..., 0.0, 0.0, 0],
       ...,
       [743, 'CASH_OUT', 6311409.28, ..., 68488.84, 6379898.11, 0],
       [743, 'TRANSFER', 850002.52, ..., 0.0, 0.0, 0],
       [743, 'CASH_OUT', 850002.52, ..., 6510099.11, 7360101.63, 0]],
      dtype=object)

In [0]:
to_predict

array([0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1])

In [0]:
# We will change change the categorical variable to numeric variable.
# To do that we use the LabelEncoder 
from sklearn.preprocessing import LabelEncoder

In [0]:
# Let's display the non-numeric variables
resultat = data.select_dtypes(include=['object'])
print(resultat)

         type     nameOrig     nameDest
0     PAYMENT  C1231006815  M1979787155
1     PAYMENT  C1666544295  M2044282225
2    TRANSFER  C1420196421   C972765878
3    CASH_OUT  C2101527076  C1007251739
4    TRANSFER  C1305486145   C553264065
..        ...          ...          ...
290  CASH_OUT   C786484425   C776919290
291  TRANSFER  C1529008245  C1881841831
292  CASH_OUT  C1162922333  C1365125890
293  TRANSFER  C1685995037  C2080388513
294  CASH_OUT  C1280323807   C873221189

[295 rows x 3 columns]


In [0]:
# if you want to see index for each column
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]

In [0]:
# Apply the labelencoder to the different fields
labEnc_x = LabelEncoder()
characteristics[:]

array([[1, 'PAYMENT', 9839.64, ..., 0.0, 0.0, 0],
       [1, 'PAYMENT', 1864.28, ..., 0.0, 0.0, 0],
       [1, 'TRANSFER', 2806.0, ..., 0.0, 0.0, 0],
       ...,
       [743, 'CASH_OUT', 6311409.28, ..., 68488.84, 6379898.11, 0],
       [743, 'TRANSFER', 850002.52, ..., 0.0, 0.0, 0],
       [743, 'CASH_OUT', 850002.52, ..., 6510099.11, 7360101.63, 0]],
      dtype=object)

In [0]:
column_index(data, ['type', 'nameOrig', 'nameDest'])

array([1, 3, 6])

In [0]:
characteristics[:,1] = labEnc_x.fit_transform(characteristics[:,1])
characteristics[:,3] = labEnc_x.fit_transform(characteristics[:,3])
characteristics[:,6] = labEnc_x.fit_transform(characteristics[:,6]) 

In [0]:
# Split the dataset into train data in test data
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(characteristics, to_predict, test_size=0.3, random_state=42, stratify=to_predict)

In [0]:
# import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Initialization of the random drill with default parameters
random_forest_classifier = RandomForestClassifier(random_state=50)

In [0]:
# fit the data to our model
random_forest_classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=50, verbose=0,
                       warm_start=False)

In [0]:
# let's assess the accuracy of our model from the test data.
result_score = random_forest_classifier.score(X_test, Y_test)
print(result_score)

0.9550561797752809


In [0]:
y_predict = random_forest_classifier.predict(X_test)

In [0]:
# Let's get a look
print(pd.DataFrame(Y_test, y_predict))

    0
0   0
1   1
0   0
0   0
0   0
.. ..
0   0
0   0
0   0
0   0
0   0

[89 rows x 1 columns]


In [0]:
# Let's evaluate our model with the confusion matrix
# import the confusion_matrix libraries
from sklearn.metrics import confusion_matrix

In [0]:
cm = confusion_matrix(Y_test, y_predict)

In [0]:
# Let's see the cm
cm

array([[76,  2],
       [ 2,  9]])

In [0]:
# We gave 76 case of non fraud that have been correctly detected
# we have 8 case of fraud that have been correctly detected