### Problem Statement

Use Random Forest to prepare a model on fraud data treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [1]:
# Importing the rrequired libraries

import pandas as pd
import numpy  as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold, RepeatedStratifiedKFold, RepeatedKFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Fraud_check.csv')
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
# To get the descriptive statistics

df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [4]:
# Getting the information about the data set

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [5]:
df.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

No null values are there for any of the features.

In [6]:
# Separate out the independant variables from the dependant variable for One Hot Encoding

X = df.drop('Taxable.Income', axis = 1)
X

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,NO,Single,50047,10,YES
1,YES,Divorced,134075,18,YES
2,NO,Married,160205,30,YES
3,YES,Single,193264,15,YES
4,NO,Married,27533,28,NO
...,...,...,...,...,...
595,YES,Divorced,39492,7,YES
596,YES,Divorced,55369,2,YES
597,NO,Divorced,154058,0,YES
598,YES,Married,180083,17,NO


In [7]:
# Applying One Hot Encoding on the independant variables

X = pd.get_dummies(X)
X

Unnamed: 0,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,50047,10,1,0,0,0,1,0,1
1,134075,18,0,1,1,0,0,0,1
2,160205,30,1,0,0,1,0,0,1
3,193264,15,0,1,0,0,1,0,1
4,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
595,39492,7,0,1,1,0,0,0,1
596,55369,2,0,1,1,0,0,0,1
597,154058,0,1,0,1,0,0,0,1
598,180083,17,0,1,0,1,0,1,0


In [8]:
y = pd.cut(df['Taxable.Income'], bins = [0, 30000, 100000], labels = ['Risky','Good'])
y

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [9]:
y.value_counts()

Good     476
Risky    124
Name: Taxable.Income, dtype: int64

The given dataset is imbalanced one. We need to use sampling technique to handle it.

In [10]:
# Splitting the data into training and testing data sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Building the Random Forest Classifier Model

In [11]:
model_rf = RandomForestClassifier(oob_score = True, random_state = True)
model_rf.fit(X_train,y_train)
y_pred_test = model_rf.predict(X_test)

In [12]:
accuracy_score(y_test,y_pred_test)

0.7444444444444445

In [13]:
confusion_matrix(y_test,y_pred_test)

array([[134,   9],
       [ 37,   0]], dtype=int64)

In [14]:
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

        Good       0.78      0.94      0.85       143
       Risky       0.00      0.00      0.00        37

    accuracy                           0.74       180
   macro avg       0.39      0.47      0.43       180
weighted avg       0.62      0.74      0.68       180



In [15]:
# Using resampling technique for the imbalanced data set

from imblearn.over_sampling import SMOTE, RandomOverSampler
smt = SMOTE()
X_resampled, y_resampled = smt.fit_resample(X,y)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [17]:
model1 = RandomForestClassifier(n_estimators = 150, oob_score = True, random_state = True, criterion= 'entropy')
model1.fit(X_train,y_train)
y_pred_test = model1.predict(X_test)

In [18]:
accuracy_score(y_test,y_pred_test)

0.8111888111888111

In [19]:
confusion_matrix(y_test,y_pred_test)

array([[126,  20],
       [ 34, 106]], dtype=int64)

In [20]:
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

        Good       0.79      0.86      0.82       146
       Risky       0.84      0.76      0.80       140

    accuracy                           0.81       286
   macro avg       0.81      0.81      0.81       286
weighted avg       0.81      0.81      0.81       286



### Using Cross Validation

In [21]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [22]:
scores = cross_val_score(model1, X_resampled, y_resampled, cv= kfold, scoring="accuracy")
print(scores)

[0.78125    0.84375    0.78947368 0.85263158 0.78947368 0.76842105
 0.82105263 0.86315789 0.81052632 0.83157895]


In [23]:
np.max(scores)

0.8631578947368421

In [24]:
np.min(scores)

0.7684210526315789

In [25]:
np.mean(scores)

0.8151315789473683

In [26]:
# Using Randomover sampler Resampling

resample = RandomOverSampler()
x_resampled, y_resampled = resample.fit_resample(X,y)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [44]:
model1 = RandomForestClassifier(n_estimators = 150, oob_score = True, random_state = True, criterion= 'entropy')
model1.fit(X_train,y_train)
y_pred_test = model1.predict(X_test)

In [45]:
accuracy_score(y_test,y_pred_test)

0.8111888111888111

In [46]:
confusion_matrix(y_test,y_pred_test)

array([[126,  20],
       [ 34, 106]], dtype=int64)

In [47]:
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

        Good       0.79      0.86      0.82       146
       Risky       0.84      0.76      0.80       140

    accuracy                           0.81       286
   macro avg       0.81      0.81      0.81       286
weighted avg       0.81      0.81      0.81       286

