#### Random Forest Model Optimised
- Uses ensemble of decision trees.
- Builds a whole bunch of decision trees and combines their predictions.
- Increases prediction accuracy  
PRO   
- Reduces overfitting - a single tree can fit quirks of the training data that may not be representative of the true population.
- Reduces bias - e.g. from classes that are not evenly split in the training data.

The number of estimators (`n_estimators`) is the number of decision trees used to build the ensemble model. 
For the sampling techniques to work best, you should first perform any pre-processing steps you can. 

### Part 1.  Import Dependencies 

In [None]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import warnings
import pickle

warnings.filterwarnings("ignore", category=DeprecationWarning)

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, average_precision_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


%matplotlib inline

Read in CSV - use Option 1 for the full dataset, or Option 2 to test with a sample of the data

In [None]:
# ---- OPTION 1: use the full dataset (no sampling) ----
# Load the fraud-detection table from the Resources folder into fraud_df.
fraud_df = pd.read_csv(filepath_or_buffer="../Resources/fraud_det_dig_df.csv")

In [None]:
# ---- OPTION 2: use a SAMPLE of the data (disabled) ----
# Uncomment this whole cell to build fraud_df from a sample of the CSV that
# keeps the original proportions of the 'isFraud' classes.
# NOTE(review): x.sample(...) below is called without random_state, so the
# sampled rows differ on every run.
# ## READ IN CSV fraud_det_dig_df
# fraud_det_df = pd.read_csv("../Resources/fraud_det_dig_df.csv")
# fraud_det_df.head()

# ## Create Sample of data to work with
# # Determine the proportions of 'isFraud' values in the DataFrame
# fraud_proportions = fraud_det_df['isFraud'].value_counts(normalize=True)

# # Calculate the number of samples needed for each 'isFraud' value
# sample_size = 100000
# sample_per_is_fraud = (fraud_proportions * sample_size).astype(int)

# # Use the 'groupby' function to take a proportional sample
# fraud_df = fraud_det_df.groupby('isFraud').apply(lambda x: x.sample(sample_per_is_fraud[x.name]))

# # Reset the index of the sampled DataFrame
# fraud_df.reset_index(drop=True, inplace=True)
# fraud_df.head()

### Part 2. Set Up Model Parameters

In [None]:
# Target: the 'isFraud' label reshaped into a single-column 2-D array
y = fraud_df.loc[:, 'isFraud'].values.reshape((-1, 1))

# Features: everything except the target and the columns that earlier
# iterations of the model identified as candidates to drop
X = fraud_df.drop(
    columns=[
        'isFraud',
        'isFlaggedFraud',
        'large_transaction',
        'newbalanceOrig',
        'week',
        'bal_change_per',
    ]
)



In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Random undersampling of the scaled training data (labels flattened to 1-D first)
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train.reshape(-1))

In [None]:
def _annotate_bar_counts(axes):
    """Label every bar on ``axes`` with its height, formatted as a whole number.

    Parameters
    ----------
    axes : matplotlib.axes.Axes
        Axes whose ``patches`` are the bars to label, e.g. the object
        returned by ``sns.countplot``.
    """
    for bar in axes.patches:
        height = bar.get_height()
        axes.annotate(
            f"{height:.0f}",
            (bar.get_x() + bar.get_width() / 2., height),  # anchor: top-centre of the bar
            ha='center', va='center', fontsize=8, color='black',
            xytext=(0, 3), textcoords='offset points',  # shift the text 3 points upwards
        )


# Visualize the distribution of 'isFraud' before balancing
plt.figure(figsize=(5, 4))
ax = sns.countplot(data=fraud_df, x='isFraud', palette={0: 'lightblue', 1: 'darkred'})
_annotate_bar_counts(ax)

plt.title("Distribution of Fraudulent Transactions (Before Balancing)")
plt.xlabel("isFraud")
plt.ylabel("Count (Millions)")
plt.show()

# Visualize the distribution of 'isFraud' after Random Undersampling
plt.figure(figsize=(5, 4))
ax = sns.countplot(x=y_rus, palette={0: 'lightblue', 1: 'darkred'})
_annotate_bar_counts(ax)

plt.title("Distribution of Fraudulent Transactions (After Random Undersampling)")
plt.xlabel("isFraud")
plt.ylabel("Count")
plt.show()


------------------------------------------
### Part 3.  Fitting the Random Forest Model
-----------------------------------------------

In [None]:
# Random forest of 100 trees; the fixed seed keeps training repeatable
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=100,
)

In [None]:
# Train the forest on the undersampled training data
rf_model.fit(X=X_rus, y=y_rus)

Making Predictions Using the Random Forest Model

In [None]:
# Predict labels for the scaled hold-out (test) features
predictions = rf_model.predict(X=X_test_scaled)


Model Evaluation 

In [None]:
# Confusion matrix of true vs. predicted labels.
# The class order is pinned to [0, 1] so the matrix is always 2x2 and always
# lines up with the hard-coded "Actual"/"Predicted" labels below, even if one
# class happens to be absent from the test split or from the predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Same matrix as a labelled DataFrame (rows = actual class, columns = predicted class)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Overall accuracy on the test split
acc_score = accuracy_score(y_test, predictions)

# Classification report as a DataFrame: output_dict=True returns a nested dict,
# which is transposed so that each report entry becomes a row
report_dict = classification_report(y_test, predictions, output_dict=True)
classification_report_df = pd.DataFrame(report_dict).transpose()

In [None]:
# Show the evaluation results: confusion matrix, accuracy, then the report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("--------------------", "Classification Report", sep="\n")
# Last expression of the cell, so the notebook renders it as a table
classification_report_df

In [None]:
# Write the evaluation tables to CSV files for Tableau
# NOTE(review): index=False drops the "Actual 0"/"Actual 1" row labels from the
# confusion-matrix file — confirm that is what the Tableau workbook expects.
cm_df.to_csv(path_or_buf='../Resources/confusion_matrix.csv', index=False)
classification_report_df.to_csv(path_or_buf='../Resources/classification_report.csv')

#### Feature Importance

In [None]:
# Importance score of every feature, taken from the fitted forest
importances = rf_model.feature_importances_
# Pair each score with its column name and order the pairs from highest to lowest
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
# Show the ten highest-scoring features
importances_sorted[:10]

In [None]:
# Tabulate each feature name against its importance score
importances_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_,
    }
)

# Ascending sort: the horizontal bar chart then shows the most important
# feature at the top. Note that this rebinds `importances_sorted` to a DataFrame.
importances_sorted = importances_df.sort_values('Importance')

# Horizontal bar chart of the importances in dark red
plt.figure(figsize=(10, 6))
plt.barh(
    y=importances_sorted['Feature'],
    width=importances_sorted['Importance'],
    color='darkred',
)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


In [None]:
## Build the (weighting, feature) ranking and export it for Tableau
# The ranking is rebuilt from the fitted model rather than read from
# `importances_sorted`: the plotting cell above rebinds that name to a
# DataFrame with columns 'Feature'/'Importance', and
# pd.DataFrame(<that frame>, columns=['weighting', 'feature']) would select
# those two non-existent columns and write a CSV full of NaN.
ranked_importances = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_df = pd.DataFrame(ranked_importances, columns=['weighting', 'feature'])

# Save the DataFrame to a CSV file (most important feature first)
importances_df.to_csv('../Resources/importance_sorted.csv', index=False)

In [None]:
## Persist the fitted random forest to disk with pickle
# NOTE(review): rf_model was fitted on StandardScaler-transformed features and
# only the model is saved here, not `scaler` — confirm that whoever loads the
# pickle can reproduce the same scaling.
model = rf_model

# Name of the pickle file
filename = 'model.pkl'

# Serialise the model into the file (binary write mode)
with open(filename, mode='wb') as file:
    pickle.dump(model, file)


#### Optimisation of Model
Finding the optimal n_estimators

In [None]:
###################################
# Tune n_estimators with a cross-validated grid search
###################################
# NOTE: this cell rebinds X, y, the train/test splits, scaler, rus, X_rus,
# y_rus, rf_model, predictions and acc_score from the earlier cells.

# Define target vector and features
# NOTE(review): unlike Part 2, every column except 'isFraud' is kept as a
# feature here (Part 2 also drops 'isFlaggedFraud', 'large_transaction',
# 'newbalanceOrig', 'week' and 'bal_change_per') — confirm this is intended.
y = fraud_df['isFraud'].values.reshape(-1, 1)
X = fraud_df.drop('isFraud', axis=1)

# Perform train-test split on the original DataFrame
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling Data using StandardScaler (fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# UNDERSAMPLING using RandomUndersampler
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train.ravel())

# Define a range of values for n_estimators to search over
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500]
}

# Create a random forest classifier to use as the grid-search template
# (after this line rf_model is an unfitted estimator, not the Part 3 model)
rf_model = RandomForestClassifier(random_state=78)

# Perform GridSearchCV with 5-fold cross-validation to find the best n_estimators value
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_rus, y_rus)

# Get the best n_estimators value from the grid search results
best_n_estimators = grid_search.best_params_['n_estimators']
print("Best n_estimators:", best_n_estimators)

# GridSearchCV refits the best configuration on all of (X_rus, y_rus) by
# default (refit=True), using the same random_state=78 template, so that
# already-fitted estimator is reused instead of training an identical
# forest a second time.
best_rf_model = grid_search.best_estimator_

# Making predictions using the testing data
predictions = best_rf_model.predict(X_test_scaled)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print("Accuracy Score:", acc_score)