Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [3]:
data=pd.read_csv("/content/Fraud_check.csv")
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [4]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Taxable.Income,600.0,55208.375,26204.827597,10003.0,32871.5,55074.5,78611.75,99619.0
City.Population,600.0,108747.368333,49850.075134,25779.0,66966.75,106493.5,150114.25,199778.0
Work.Experience,600.0,15.558333,8.842147,0.0,8.0,15.0,24.0,30.0


In [5]:
data.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [6]:
data.duplicated().sum()

0

In [7]:
data.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

In [8]:
df=data.copy()

In [9]:
df=pd.get_dummies(df,columns=['Undergrad','Marital.Status','Urban'])
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1
2,36925,160205,30,1,0,0,1,0,0,1
3,50190,193264,15,0,1,0,0,1,0,1
4,81002,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,0,1,1,0,0,0,1
596,69967,55369,2,0,1,1,0,0,0,1
597,47334,154058,0,1,0,1,0,0,0,1
598,98592,180083,17,0,1,0,1,0,1,0


In [10]:
df.corr()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
Taxable.Income,1.0,-0.064387,-0.001818,0.049232,-0.049232,0.062099,-0.100611,0.03791,-0.070628,0.070628
City.Population,-0.064387,1.0,0.013135,-0.079579,0.079579,0.004925,-0.007035,0.002087,0.040098,-0.040098
Work.Experience,-0.001818,0.013135,1.0,0.005362,-0.005362,-0.014833,0.033347,-0.018123,0.094559,-0.094559
Undergrad_NO,0.049232,-0.079579,0.005362,1.0,-1.0,0.045101,-0.01512,-0.028883,-0.000267,0.000267
Undergrad_YES,-0.049232,0.079579,-0.005362,-1.0,1.0,-0.045101,0.01512,0.028883,0.000267,-0.000267
Marital.Status_Divorced,0.062099,0.004925,-0.014833,0.045101,-0.045101,1.0,-0.468757,-0.510435,0.065518,-0.065518
Marital.Status_Married,-0.100611,-0.007035,0.033347,-0.01512,0.01512,-0.468757,1.0,-0.520317,-0.016771,0.016771
Marital.Status_Single,0.03791,0.002087,-0.018123,-0.028883,0.028883,-0.510435,-0.520317,1.0,-0.047014,0.047014
Urban_NO,-0.070628,0.040098,0.094559,-0.000267,0.000267,0.065518,-0.016771,-0.047014,1.0,-1.0
Urban_YES,0.070628,-0.040098,-0.094559,0.000267,-0.000267,-0.065518,0.016771,0.047014,-1.0,1.0


In [11]:
df["Taxable.Income"]=df["Taxable.Income"].apply(lambda x: "Risky" if x<=30000 else "Good")

Random Forest Model

In [12]:
X=df.iloc[:,1:]
y=df.iloc[:,0]

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [14]:
y_train.value_counts()

Good     380
Risky    100
Name: Taxable.Income, dtype: int64

In [15]:
RFM=RandomForestClassifier(n_estimators = 150, oob_score =True,criterion ='entropy')

In [20]:
params={"n_jobs":np.array(range(1,8))}
params

{'n_jobs': array([1, 2, 3, 4, 5, 6, 7])}

In [17]:
model=GridSearchCV(RFM,param_grid=params)

In [18]:
model.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='entropy',
                                              max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=150, n_jobs=None,
                                              oob_score=True, 

In [22]:
model.best_score_

0.7666666666666667

In [23]:
model.best_params_

{'n_jobs': 1}

In [24]:
RFMF=RandomForestClassifier(n_jobs=1,n_estimators = 150, oob_score =True,criterion ='entropy')

In [26]:
model1=RFMF.fit(X_train,y_train)

In [27]:
pred_train=model1.predict(X_train)

In [28]:
accuracy_score(pred_train,y_train)

1.0

In [29]:
y_pred=model1.predict(X_test)

In [30]:
accuracy_score(y_test,y_pred)

0.775

In [31]:
confusion_matrix(y_test,y_pred)

array([[93,  3],
       [24,  0]])

Conclusion: As seen in the confusion matrix of Test data 93 instances are presdected correctly and 27 instances are not