Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Fraud_check.csv')

In [3]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [4]:
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [5]:
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [6]:
df['Risk'] = df['Taxable.Income'].apply(lambda x: 'Risky' if x <= 30000 else 'Good')

In [7]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Risk
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
...,...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES,Good
596,YES,Divorced,69967,55369,2,YES,Good
597,NO,Divorced,47334,154058,0,YES,Good
598,YES,Married,98592,180083,17,NO,Good


In [8]:
df.drop('Taxable.Income', axis=1, inplace=True)

In [9]:
df = pd.get_dummies(df, drop_first=True)

In [10]:
df

Unnamed: 0,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,Risk_Risky
0,50047,10,0,0,1,1,0
1,134075,18,1,0,0,1,0
2,160205,30,0,1,0,1,0
3,193264,15,1,0,1,1,0
4,27533,28,0,1,0,0,0
...,...,...,...,...,...,...,...
595,39492,7,1,0,0,1,0
596,55369,2,1,0,0,1,0
597,154058,0,0,0,0,1,0
598,180083,17,1,1,0,0,0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Risk_Risky', axis=1), df['Risk_Risky'], test_size=0.3, random_state=1)

In [12]:
X_train

Unnamed: 0,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
241,31377,18,1,0,1,1
400,197421,16,1,1,0,0
286,113724,6,1,0,0,0
379,112774,13,0,0,1,0
314,85255,4,0,0,1,0
...,...,...,...,...,...,...
129,65469,26,1,0,1,0
144,156503,29,1,0,1,1
72,108300,27,1,0,0,1
235,87541,9,0,0,0,0


In [13]:
X_test

Unnamed: 0,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
446,65708,28,0,0,1,0
404,34334,21,1,0,0,1
509,65531,27,0,1,0,1
455,65533,23,0,1,0,1
201,58225,27,1,0,0,1
...,...,...,...,...,...,...
532,98662,16,1,0,0,0
566,33460,28,0,1,0,0
529,83388,14,0,0,0,0
261,68788,7,0,0,1,1


In [14]:
y_train

241    1
400    0
286    0
379    1
314    0
      ..
129    0
144    0
72     0
235    0
37     0
Name: Risk_Risky, Length: 420, dtype: uint8

In [15]:
y_test

446    1
404    0
509    1
455    0
201    0
      ..
532    0
566    0
529    0
261    0
386    0
Name: Risk_Risky, Length: 180, dtype: uint8

In [16]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

In [17]:
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, random_state=1)

In [18]:
y_pred = rfc.predict(X_test)

In [19]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0], dtype=uint8)

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89       144
           1       0.00      0.00      0.00        36

    accuracy                           0.80       180
   macro avg       0.40      0.50      0.44       180
weighted avg       0.64      0.80      0.71       180

