Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"


In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import KFold,cross_val_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
data_df = pd.read_csv('Fraud_check.csv')
data_df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [4]:
data_df = data_df.rename({'Marital.Status':'Marital_Status','Taxable.Income':'Taxable_Income','City.Population':'City_Population','Work.Experience':'Work_Experience'},axis=1)
data_df.head()
#treating those who have taxable_income <= 30000 as "Risky" and others are "Good"
data_df['TaxIncomeStatus']=''
data_df.loc[data_df.Taxable_Income <=30000,'TaxIncomeStatus']='Risky'
data_df.loc[data_df.Taxable_Income >30000,'TaxIncomeStatus']='Good'
data_df.head()

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban,TaxIncomeStatus
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


In [5]:
label_encoder = LabelEncoder()
data_df['Undergrad_Label']=label_encoder.fit_transform(data_df['Undergrad'])
data_df['Marital_Status_Label']=label_encoder.fit_transform(data_df['Marital_Status'])
data_df['Urban_Label']=label_encoder.fit_transform(data_df['Urban'])
data_df['TaxIncomeStatus_Label']=label_encoder.fit_transform(data_df['TaxIncomeStatus'])
data_df_new = data_df.iloc[:,[3,4,7,8,9,10]]
data_df_new.head()

Unnamed: 0,City_Population,Work_Experience,Undergrad_Label,Marital_Status_Label,Urban_Label,TaxIncomeStatus_Label
0,50047,10,0,2,1,0
1,134075,18,1,0,1,0
2,160205,30,0,1,1,0
3,193264,15,1,2,1,0
4,27533,28,0,1,0,0


In [6]:
#Feature Selection Model Validation

x = data_df_new.iloc[:,:-1]
y = data_df_new.iloc[:,-1]
model =DecisionTreeClassifier(criterion='gini')# as the dependent var is categorical so we can use either c5.0(entropy) or CART(gini impurities)
model.fit(x,y)
rfe = RFE(model)
rfe.fit(x,y)
print("Decession Tree Feature selection \n",model.feature_importances_)
print("Recursive Feature Selection Ranking ", rfe.ranking_,"\n Recursive Feature Selection Top Features ",rfe.support_)

Decession Tree Feature selection 
 [0.59448967 0.18789172 0.06190696 0.09258974 0.06312191]
Recursive Feature Selection Ranking  [1 1 3 2 4] 
 Recursive Feature Selection Top Features  [ True  True False False False]


In [9]:
featured_x = x.iloc[:,[0,1,3]]
X = StandardScaler().fit_transform(featured_x)
# Evaluate using Cross Validation when we have medium no. of dataset
num_folds=10
seed=30
num_trees = 300
kfold = KFold(n_splits=num_folds, random_state=seed)
model = RandomForestClassifier(n_estimators=num_trees)
model.fit(X,y)
results = cross_val_score(model, X, y, cv=kfold)
print("Mean---",results.mean(),"\n Std Deviation----",results.std())



Mean--- 0.7183333333333334 
 Std Deviation---- 0.0437480158280223


In [13]:

print("Mean---",results.mean(),"\n Std Deviation----",results.std())
y_pred = model.predict(featured_x)
print(classification_report(y,y_pred))
cnf_matrix = confusion_matrix(y,y_pred)
print(cnf_matrix)

Mean--- 0.7183333333333334 
 Std Deviation---- 0.0437480158280223
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       476
           1       0.00      0.00      0.00       124

    accuracy                           0.79       600
   macro avg       0.40      0.50      0.44       600
weighted avg       0.63      0.79      0.70       600

[[476   0]
 [124   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:

passedTest = data_df.iloc[np.where(y_pred==y)]
print("Total Test length is ",len(featured_x),"\n Passed Test count is ",len(passedTest))
failed = data_df.iloc[np.where(y_pred!=y)]
print("\n"," Failed Test count is",len(failed))

Total Test length is  600 
 Passed Test count is  476

  Failed Test count is 124
