## Random Forest Classifier
# Credit Fraud Dataset

In [25]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('C:/Users/admin/Downloads/credit_dataset.csv')

In [3]:
# Converting into type int for simplicity
df['FAMILY SIZE'] = df['FAMILY SIZE'].astype(int)

In [4]:
# label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label_df = df.copy()
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
for row in object_cols:
    label_df[row] = le.fit_transform(df[row])

In [5]:
# Choosing features and Target for training and testing
X = label_df.copy()
y = X.pop('TARGET')

In [6]:
# Splitting the dataset into train and test
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [7]:
#Fitting Decision Tree classifier to the training set  
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
classifier.fit(xtrain, ytrain)

RandomForestClassifier(criterion='entropy', n_estimators=10)

In [9]:
#Predicting the test set result  
y_pred= classifier.predict(xtest) 

In [11]:
#Creating the Confusion matrix  
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(ytest, y_pred)
cm

array([[6168,    9],
       [  36,   71]], dtype=int64)

In [12]:
# Evaluating the model
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(ytest, y_pred))

Accuracy :  0.9928389560789306


## SMOTE (synthetic minority oversampling technique)

In [13]:
import imblearn

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
from imblearn.pipeline import Pipeline

In [16]:
# define pipeline
over = SMOTE(sampling_strategy=0.1)
steps = [('o', over)]
pipeline = Pipeline(steps=steps)

In [18]:
X1, y1 = pipeline.fit_resample(xtrain, ytrain)
X1_test, y1_test = pipeline.fit_resample(xtest, ytest)

In [19]:
classifier.fit(X1, y1)

RandomForestClassifier(criterion='entropy', n_estimators=10)

In [20]:
y_pred = classifier.predict(X1_test)

In [22]:
#Creating the Confusion matrix  
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(y1_test, y_pred)
cm

array([[6163,   14],
       [ 245,  372]], dtype=int64)

In [23]:
# Evaluating the model
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y1_test, y_pred))

Accuracy :  0.9618781277597881


## Hyper Parameter Tuning

## Tune Sklearn

In [32]:
# from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV
tune_search = TuneGridSearchCV(
    RandomForestClassifier(),
    random_grid,
    early_stopping=True,
    max_iters=10
)
import time # Just to compare fit times
start = time.time()
tune_search.fit(X1, y1)
end = time.time()
print("Tune Fit Time:", end - start)
pred = tune_search.predict(X1_test)
accuracy = np.count_nonzero(np.array(pred) == np.array(y1_test)) / len(pred)
print("Tune Accuracy:", accuracy)
print(tune_search.best_params_)

  "The `loggers` argument is deprecated. Please pass the respective "
Log sync requires rsync to be installed.


Tune Fit Time: 225.13130068778992
Tune Accuracy: 0.958934353841625
{'max_features': 'auto', 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False, 'n_estimators': 10}
