# Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
data=pd.read_csv("Fraud_check.csv")
print(data.sample(6))
print(data.shape)

    Undergrad Marital.Status  Taxable.Income  City.Population  \
24         NO       Divorced           38239            28495   
253       YES         Single           79869            77256   
427        NO       Divorced           76700            99025   
211       YES        Married           70812            57194   
587        NO         Single           87685           154677   
1         YES       Divorced           33700           134075   

     Work.Experience Urban  
24                30    NO  
253               29    NO  
427               10    NO  
211                8   YES  
587               26   YES  
1                 18   YES  
(600, 6)


In [3]:
data=data.rename(columns={'Marital.Status':'Marital_Status','Taxable.Income':'Income','City.Population':'City_Population',
                         'Work.Experience':'Work_Experience'})
#=======================================================================
Status=[]
for i in data.Income:
    if i<=30000:
        Status.append("Risky")
    else:
        Status.append("Good")
data['Status']=Status
#========================================================================
encoder=LabelEncoder()
data['Undergrad']=encoder.fit_transform(data['Undergrad'])
data['Urban']=encoder.fit_transform(data['Urban'])
data['Marital_Status']=encoder.fit_transform(data['Marital_Status'])
data.drop(['Income'],axis=1,inplace=True)
x=data.iloc[:,:-1]
y=data['Status']

In [4]:
x

Unnamed: 0,Undergrad,Marital_Status,City_Population,Work_Experience,Urban
0,0,2,50047,10,1
1,1,0,134075,18,1
2,0,1,160205,30,1
3,1,2,193264,15,1
4,0,1,27533,28,0
...,...,...,...,...,...
595,1,0,39492,7,1
596,1,0,55369,2,1
597,0,0,154058,0,1
598,1,1,180083,17,0


In [6]:
y=encoder.fit_transform(y)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=12)

In [26]:
model=RandomForestClassifier()
Parameters={
    'n_estimators':[10,20,30,40,50,70,100],
    'max_features':[3,4,5]
    
}

In [27]:
gscv=GridSearchCV(model,Parameters,cv=5)
gscv.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': [3, 4, 5],
                         'n_estimators': [10, 20, 30, 40, 50, 70, 100]})

In [28]:
gscv.best_params_

{'max_features': 4, 'n_estimators': 20}

In [29]:
gscv.best_score_

0.7261904761904763

In [21]:
final_model=RandomForestClassifier(max_features=5, n_estimators= 50)
final_model.fit(x_train,y_train)
print(classification_report(y_test,final_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       151
           1       0.12      0.10      0.11        29

    accuracy                           0.74       180
   macro avg       0.48      0.48      0.48       180
weighted avg       0.72      0.74      0.73       180

