# **Random Forest - Fraud Check**

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [34]:
# Read input data from csv file
df_fraud = pd.read_csv('Fraud_check.csv')
df_fraud.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


## **Exploratory Data Analysis (EDA)**

In [35]:
# Rename the columns with dots
df_fraud.rename(columns={'Marital.Status' : 'Marital_Status', 'Taxable.Income': 'Taxable_Income', 
                         'City.Population':'City_Population', 'Work.Experience':'Work_Experience'}, inplace=True)

In [36]:
df_fraud.head()

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [37]:
df_fraud.shape

(600, 6)

In [38]:
df_fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital_Status   600 non-null    object
 2   Taxable_Income   600 non-null    int64 
 3   City_Population  600 non-null    int64 
 4   Work_Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [39]:
# Change object columns to categorical columns
columns = ['Undergrad', 'Marital_Status', 'Urban']

for x in columns:
  df_fraud[x] = df_fraud[x].astype('category')

df_fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Undergrad        600 non-null    category
 1   Marital_Status   600 non-null    category
 2   Taxable_Income   600 non-null    int64   
 3   City_Population  600 non-null    int64   
 4   Work_Experience  600 non-null    int64   
 5   Urban            600 non-null    category
dtypes: category(3), int64(3)
memory usage: 16.3 KB


In [40]:
# Change categorical strings to numerical values
columns = ['Undergrad', 'Marital_Status', 'Urban']

for x in columns:
  df_fraud[x] = df_fraud[x].cat.codes

df_fraud.head()

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


In [41]:
# Create a new column to categorize Taxable_Income as given in approach.
# Categorize with value zero if Taxable_Income <= 30,000 and as 1 if Taxable_Income > 30000
df_fraud['Taxable_Income_Type'] = df_fraud['Taxable_Income']
df_fraud.loc[df_fraud['Taxable_Income'] <= 30000, 'Taxable_Income_Type'] = 0
df_fraud.loc[df_fraud['Taxable_Income'] >  30000, 'Taxable_Income_Type'] = 1

df_fraud.head()

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban,Taxable_Income_Type
0,0,2,68833,50047,10,1,1
1,1,0,33700,134075,18,1,1
2,0,1,36925,160205,30,1,1
3,1,2,50190,193264,15,1,1
4,0,1,81002,27533,28,0,1


In [42]:
df_fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Undergrad            600 non-null    int8 
 1   Marital_Status       600 non-null    int8 
 2   Taxable_Income       600 non-null    int64
 3   City_Population      600 non-null    int64
 4   Work_Experience      600 non-null    int64
 5   Urban                600 non-null    int8 
 6   Taxable_Income_Type  600 non-null    int64
dtypes: int64(4), int8(3)
memory usage: 20.6 KB


In [43]:
# Change output feature to catogorical column
df_fraud['Taxable_Income_Type'] = df_fraud['Taxable_Income_Type'].astype('category')
print(df_fraud.info())
print(df_fraud.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Undergrad            600 non-null    int8    
 1   Marital_Status       600 non-null    int8    
 2   Taxable_Income       600 non-null    int64   
 3   City_Population      600 non-null    int64   
 4   Work_Experience      600 non-null    int64   
 5   Urban                600 non-null    int8    
 6   Taxable_Income_Type  600 non-null    category
dtypes: category(1), int64(3), int8(3)
memory usage: 16.7 KB
None
(600, 7)


In [44]:
# Create separate datasets to have input and output features
x = df_fraud.drop(['Taxable_Income', 'Taxable_Income_Type'], axis='columns')
y = df_fraud['Taxable_Income_Type']

print('x shape: ', x.shape)
print('y shape: ', y.shape)

x shape:  (600, 5)
y shape:  (600,)


In [45]:
# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)

print(x_train.shape)
print(x_test.shape)

print(y_train.shape)
print(y_test.shape)

(420, 5)
(180, 5)
(420,)
(180,)


Observation: Data cleaning completed and train and test data created successfully for model building acitivites. No need to worry about outliers and imbalanced output data since random forest model creates sub samples to be processed by separate Decision Tree classifier trees.

## Random Forest Model Creation

In [46]:
num_trees = 100
max_features = 3

kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = RandomForestClassifier(n_estimators=num_trees, max_features = max_features, criterion = 'gini')
model.fit(x_train, y_train)

RandomForestClassifier(max_features=3)

### Model Prediction and Evaluation

In [47]:
preds = model.predict(x_test)
preds[0:10]

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1])

In [48]:
from sklearn.metrics._plot.confusion_matrix import confusion_matrix
confusion_matrix(y_test, preds)

array([[  1,  34],
       [ 19, 126]])

In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.05      0.03      0.04        35
           1       0.79      0.87      0.83       145

    accuracy                           0.71       180
   macro avg       0.42      0.45      0.43       180
weighted avg       0.64      0.71      0.67       180



In [50]:
results = cross_val_score(model, x_test, y_test, cv=kfold)
print(results.mean())

0.75


**Observation: Random forest model got created with 72% accuracy**