In [1]:
# Import of libraries.
# Pandas offers in particular data structures and operations for manipulating digital tables and time series
import pandas as pd
import numpy as np

In [3]:
# Import our dataset
data = pd.read_csv('prediction_de_fraud_2.csv')

In [4]:
# Creation of predictive data and data to predict. 
# characteristics => predictive data
# to_predict => data to predict

# axis=1 means that we want to drop the column with the name 'isFraud'
# The values are simply a familiar NumPy array
characteristics = data.drop('isFraud', axis=1).values
to_predict = data['isFraud'].values

In [5]:
# We will change change the categorical variable to numeric variable.
# To do that we use the LabelEncoder 
from sklearn.preprocessing import LabelEncoder

In [6]:
# Apply the labelencoder to the different fields
labEnc_x = LabelEncoder()

In [7]:
characteristics[:,1] = labEnc_x.fit_transform(characteristics[:,1])
characteristics[:,3] = labEnc_x.fit_transform(characteristics[:,3])
characteristics[:,6] = labEnc_x.fit_transform(characteristics[:,6]) 

In [8]:
# Split the dataset into train data in test data
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(characteristics, to_predict, test_size=0.3, random_state=42, stratify=to_predict)

In [10]:
# import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Initialization of the random drill with default parameters
random_forest_classifier = RandomForestClassifier(random_state=50)

In [12]:
# fit the data to our model
random_forest_classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=50, verbose=0,
                       warm_start=False)

In [13]:
# let's assess the accuracy of our model from the test data.
result_score = random_forest_classifier.score(X_test, Y_test)
print(result_score)

0.9550561797752809


In [14]:
# Search for optimal parameters with RandomizeSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [29]:
# We use a dictionary of hyper parameter values.
grid_params = {
    # n_estimators is the number of trees to be used in the forest
    'n_estimators':[100,200,300,400,500],
    # max number of levels in each decision tree
    'max_depth':[1,2,4,6,8],
    # specifies the minimum number of samples that should be present in the leaf node after splitting a node
    'min_samples_leaf':[0.05, 0.1, 0.2]
}

In [30]:
import parallelTest
import multiprocessing as mp

In [31]:
if __name__ == '__main__':
    extractor = parallelTest.ParallelExtractor()
    extractor.runInParallel(numProcesses=2, numThreads=4)
    rf_random = RandomizedSearchCV(estimator = random_forest_classifier, param_distributions = grid_params, n_iter = 1, cv=3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.9s finished


In [32]:
# Let's see the the best optimum hypermarameters
rf_random.best_params_

{'n_estimators': 500, 'min_samples_leaf': 0.05, 'max_depth': 1}