#### Random Forest Model 
- Uses ensemble of decision trees.
- Builds a whole bunch of decision trees and combines their predictions.
- Increases prediction accuracy  
PRO   
- Reduces overfitting - an overfit model follows the training sample too closely and may not be representative of the true population
- Reduces Bias - i.e. not evenly split in training.

Need parameters:
- Node Size
- Number of Trees 
- Number of Features 

The number of estimators - number of decision trees used to build the ensemble model. 

In [1]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, average_precision_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

%matplotlib inline


In [2]:
# Load the full pre-processed fraud dataset into `fraud_det_df`.
# A later cell draws the working sample `fraud_df` from this frame; to work on
# the full data instead of a sample, read this CSV straight into `fraud_df`.
csv_path = "../Resources/fraud_det_dig_df.csv"
fraud_det_df = pd.read_csv(csv_path)

# Preview the first rows
fraud_det_df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,hour,day,week,trans_weight,bal_change_per,large_transaction,type_CASH_OUT,type_TRANSFER,Destination
0,1,181.0,181.0,0.0,0.0,0.0,1,0,1,1,1,1.0,-1.0,0,0,1,0
1,1,181.0,181.0,0.0,21182.0,0.0,1,0,1,1,1,1.0,-1.0,0,1,0,1
2,1,229133.94,15325.0,0.0,5083.0,51513.44,0,0,1,1,1,14.95,-1.0,1,1,0,2
3,1,215310.3,705.0,0.0,22425.0,0.0,0,0,1,1,1,305.4,-1.0,1,0,1,3
4,1,311685.89,10835.0,0.0,6267.0,2719172.89,0,0,1,1,1,28.77,-1.0,1,0,1,4


In [3]:
## Create a class-proportional (stratified) sample of the data to work with
# Fixed seed so that Restart & Run All always draws the same rows.
SAMPLE_SEED = 42
sample_size = 100000

# Proportion of each 'isFraud' value in the full DataFrame
fraud_proportions = fraud_det_df['isFraud'].value_counts(normalize=True)

# Number of rows to draw for each 'isFraud' value (fractions are truncated)
sample_per_is_fraud = (fraud_proportions * sample_size).astype(int)

# Sample each class explicitly rather than through groupby().apply():
# - the 'isFraud' column is guaranteed to stay in the result (apply() operating
#   on the grouping column is deprecated in newer pandas releases),
# - the draw is seeded and therefore reproducible,
# - a class is never asked for more rows than it has, which would raise.
fraud_df = pd.concat(
    [
        class_rows.sample(
            n=min(int(sample_per_is_fraud.loc[class_label]), len(class_rows)),
            random_state=SAMPLE_SEED,
        )
        for class_label, class_rows in fraud_det_df.groupby('isFraud')
    ],
    ignore_index=True,  # fresh 0..n-1 index for the sampled frame
)
fraud_df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,hour,day,week,trans_weight,bal_change_per,large_transaction,type_CASH_OUT,type_TRANSFER,Destination
0,474,353251.41,353251.41,0.0,1324847.31,1678098.72,0,0,18,20,3,1.0,-1.0,1,1,0,85600
1,256,112975.44,300985.0,188009.56,766780.34,879755.78,0,0,16,11,2,0.38,-0.38,1,1,0,25403
2,188,1505375.2,1505375.2,0.0,2864483.55,4369858.75,0,0,20,8,2,1.0,-1.0,1,0,1,70907
3,325,475800.17,475800.17,0.0,1201430.73,1677230.9,0,0,13,14,2,1.0,-1.0,1,0,1,154497
4,208,32941.11,157182.0,124240.89,3049310.56,3350234.31,0,0,16,9,2,0.21,-0.21,0,1,0,133171


In [4]:
# Target vector: the 'isFraud' label as a 2-dimensional (n_rows, 1) column array.
# Selecting with a list of columns yields a one-column frame, so the resulting
# array is already column-shaped.
y = fraud_df[['isFraud']].to_numpy()
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [5]:
# Feature set: every column of the sampled frame except the 'isFraud' target.
# drop() returns a new DataFrame, so `fraud_df` itself is left untouched.
X = fraud_df.drop(columns='isFraud')
X.head()


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,hour,day,week,trans_weight,bal_change_per,large_transaction,type_CASH_OUT,type_TRANSFER,Destination
0,474,353251.41,353251.41,0.0,1324847.31,1678098.72,0,18,20,3,1.0,-1.0,1,1,0,85600
1,256,112975.44,300985.0,188009.56,766780.34,879755.78,0,16,11,2,0.38,-0.38,1,1,0,25403
2,188,1505375.2,1505375.2,0.0,2864483.55,4369858.75,0,20,8,2,1.0,-1.0,1,0,1,70907
3,325,475800.17,475800.17,0.0,1201430.73,1677230.9,0,13,14,2,1.0,-1.0,1,0,1,154497
4,208,32941.11,157182.0,124240.89,3049310.56,3350234.31,0,16,9,2,0.21,-0.21,0,1,0,133171


In [6]:
# Hold out 20% of the sampled rows for testing.
# stratify=y keeps the fraud / non-fraud ratio the same in both splits, so the
# minority class is represented in the test set the metrics are computed on.
# random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
########## Scale the feature data (done before any resampling is applied)
# The scaler is fitted on the training split only, then applied to both splits,
# so the scaling statistics are not computed from the test data.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

------------------------------------------
### Fitting the Random Forest Model
- Model 1
-----------------------------------------------


In [8]:
# Create a random forest classifier with 500 trees.
# n_jobs=-1 builds the trees on all available CPU cores; the serial fit of this
# model was interrupted in the saved run of the notebook.
# random_state fixes the randomness used when building the trees.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78, n_jobs=-1)

In [9]:
# Train the forest on the scaled training features.
# y_train is a column vector, so it is flattened to 1-D before fitting.
y_train_flat = y_train.ravel()
rf_model = rf_model.fit(X_train_scaled, y_train_flat)

KeyboardInterrupt: 

Making Predictions Using the Random Forest Model

In [None]:
# Predict a class label for every row of the scaled test features.
# The test data is passed through the same fitted scaler as the training data.
predictions = rf_model.predict(X_test_scaled)

Model 1 Evaluation

In [None]:
# Confusion matrix of actual (rows) vs. predicted (columns) classes.
# labels=[0, 1] pins the matrix to 2x2 in class order 0, 1 even if one class is
# missing from both y_test and predictions, so the fixed "Actual"/"Predicted"
# labels applied to it later always match its shape.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Fraction of test rows whose predicted class equals the actual class
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Wrap the confusion matrix in a labelled DataFrame: rows are the actual
# classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
# Show the labelled table built above instead of the raw, unlabelled array
cm_df

In [None]:
# Summarise Model 1: confusion matrix, accuracy and per-class metrics
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

#### Feature Importance

In [None]:
# Importance score of each feature, in the same order as X.columns
importances = rf_model.feature_importances_

# Pair every score with its column name and order the pairs from most to least
# important
importances_sorted = sorted(zip(importances, X.columns), reverse=True)

# Show the 10 highest-ranked features
importances_sorted[:10]

In [None]:
# Tabulate the importance score of every feature
importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})

# Ascending sort: the horizontal bars are drawn in row order, which ranks the
# features from top to bottom in the chart
importances_sorted = importances_df.sort_values(by='Importance')

# Horizontal bar chart of the importances in dark red
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importances_sorted['Feature'], importances_sorted['Importance'], color='darkred')
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()
