In [98]:
## Use Random Forest to prepare a model on fraud data 
## treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [99]:
## Import Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [37]:
#import Dataset
df = pd.read_csv('/content/Fraud_check.csv')

In [38]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [39]:
#Creating dummy vairables for ['Undergrad','Marital.Status','Urban'] dropping first dummy variable
Df=pd.get_dummies(df,columns=['Undergrad','Marital.Status','Urban'],drop_first=True)

In [40]:
Df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,68833,50047,10,0,0,1,1
1,33700,134075,18,1,0,0,1
2,36925,160205,30,0,1,0,1
3,50190,193264,15,1,0,1,1
4,81002,27533,28,0,1,0,0
...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1
596,69967,55369,2,1,0,0,1
597,47334,154058,0,0,0,0,1
598,98592,180083,17,1,1,0,0


In [41]:
Df['Taxable.Income']

0      68833
1      33700
2      36925
3      50190
4      81002
       ...  
595    76340
596    69967
597    47334
598    98592
599    96519
Name: Taxable.Income, Length: 600, dtype: int64

In [42]:
#Creating new cols TaxInc and dividing 'Taxable.Income' cols on the basis of [10002,30000,99620] for Risky and Good
Df["TaxInc"] = pd.cut(df["Taxable.Income"], bins = [10002,30000,99620], labels = ["Risky", "Good"])

In [43]:
Df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,TaxInc
0,68833,50047,10,0,0,1,1,Good
1,33700,134075,18,1,0,0,1,Good
2,36925,160205,30,0,1,0,1,Good
3,50190,193264,15,1,0,1,1,Good
4,81002,27533,28,0,1,0,0,Good
...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,Good
596,69967,55369,2,1,0,0,1,Good
597,47334,154058,0,0,0,0,1,Good
598,98592,180083,17,1,1,0,0,Good


In [44]:
Df['TaxInc'].value_counts()

Good     476
Risky    124
Name: TaxInc, dtype: int64

In [45]:
labelencoder = LabelEncoder()
Df.iloc[:,-1] = labelencoder.fit_transform(Df.iloc[:,-1])

In [46]:
Df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,TaxInc
0,68833,50047,10,0,0,1,1,0
1,33700,134075,18,1,0,0,1,0
2,36925,160205,30,0,1,0,1,0
3,50190,193264,15,1,0,1,1,0
4,81002,27533,28,0,1,0,0,0
...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,0
596,69967,55369,2,1,0,0,1,0
597,47334,154058,0,0,0,0,1,0
598,98592,180083,17,1,1,0,0,0


In [47]:
Df['TaxInc'].value_counts()
## '0' = 'Good'
## '1' = 'Risky'
## after lable encoder

0    476
1    124
Name: TaxInc, dtype: int64

In [48]:
DF=Df.rename({'Marital.Status_Married': 'MaritalStatus_Married','Marital.Status_Single':'MaritalStatus_Single','Taxable.Income':'TaxableIncome','City.Population':'CityPopulation','Work.Experience':'WorkExperience'}, axis=1)

In [49]:
DF

Unnamed: 0,TaxableIncome,CityPopulation,WorkExperience,Undergrad_YES,MaritalStatus_Married,MaritalStatus_Single,Urban_YES,TaxInc
0,68833,50047,10,0,0,1,1,0
1,33700,134075,18,1,0,0,1,0
2,36925,160205,30,0,1,0,1,0
3,50190,193264,15,1,0,1,1,0
4,81002,27533,28,0,1,0,0,0
...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,0
596,69967,55369,2,1,0,0,1,0
597,47334,154058,0,0,0,0,1,0
598,98592,180083,17,1,1,0,0,0


In [69]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   TaxableIncome          600 non-null    int64  
 1   CityPopulation         600 non-null    int64  
 2   WorkExperience         600 non-null    int64  
 3   Undergrad_YES          600 non-null    uint8  
 4   MaritalStatus_Married  600 non-null    uint8  
 5   MaritalStatus_Single   600 non-null    uint8  
 6   Urban_YES              600 non-null    uint8  
 7   TaxInc                 600 non-null    int64  
 8   scores                 600 non-null    float64
 9   anomaly                600 non-null    int64  
dtypes: float64(1), int64(5), uint8(4)
memory usage: 30.6 KB


# Outlier detiction by isolation forest

In [50]:
from sklearn.ensemble import IsolationForest

In [51]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(DF)

IsolationForest(behaviour='deprecated', bootstrap=False, contamination=0.01,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=10, verbose=0, warm_start=False)

In [52]:
# predictions
y_pred_outliers = clf.predict(DF)

In [53]:
y_pred_outliers 

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [54]:
DF['scores']=clf.decision_function(DF)

In [55]:
DF

Unnamed: 0,TaxableIncome,CityPopulation,WorkExperience,Undergrad_YES,MaritalStatus_Married,MaritalStatus_Single,Urban_YES,TaxInc,scores
0,68833,50047,10,0,0,1,1,0,0.113209
1,33700,134075,18,1,0,0,1,0,0.103882
2,36925,160205,30,0,1,0,1,0,0.027727
3,50190,193264,15,1,0,1,1,0,0.089827
4,81002,27533,28,0,1,0,0,0,0.047417
...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,0,0.093915
596,69967,55369,2,1,0,0,1,0,0.083323
597,47334,154058,0,0,0,0,1,0,0.066569
598,98592,180083,17,1,1,0,0,0,0.060353


In [56]:
DF['anomaly']=clf.predict(DF.iloc[:,0:8])
## calculating anomaly socre

In [57]:
DF

Unnamed: 0,TaxableIncome,CityPopulation,WorkExperience,Undergrad_YES,MaritalStatus_Married,MaritalStatus_Single,Urban_YES,TaxInc,scores,anomaly
0,68833,50047,10,0,0,1,1,0,0.113209,1
1,33700,134075,18,1,0,0,1,0,0.103882,1
2,36925,160205,30,0,1,0,1,0,0.027727,1
3,50190,193264,15,1,0,1,1,0,0.089827,1
4,81002,27533,28,0,1,0,0,0,0.047417,1
...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,0,0.093915,1
596,69967,55369,2,1,0,0,1,0,0.083323,1
597,47334,154058,0,0,0,0,1,0,0.066569,1
598,98592,180083,17,1,1,0,0,0,0.060353,1


In [62]:
#Print the outlier data points
DF[DF['anomaly']==-1]

Unnamed: 0,TaxableIncome,CityPopulation,WorkExperience,Undergrad_YES,MaritalStatus_Married,MaritalStatus_Single,Urban_YES,TaxInc,scores,anomaly
34,12514,183767,1,1,0,0,1,1,-0.00058,-1
36,10163,193995,5,1,1,0,1,1,-0.007265,-1
196,10933,28410,21,1,1,0,0,1,-0.000543,-1
457,11804,36055,24,0,1,0,1,1,-0.002076,-1
517,19272,195078,26,0,0,1,0,1,-0.004114,-1
541,12453,191874,30,1,0,0,1,1,-0.004776,-1


In [77]:
DF_new=DF.drop([34,36,196,457,517,541], axis=0,inplace=False)
## Here i had dropped multiple rows outlier from the data set

In [78]:
DF_new

Unnamed: 0,TaxableIncome,CityPopulation,WorkExperience,Undergrad_YES,MaritalStatus_Married,MaritalStatus_Single,Urban_YES,TaxInc,scores,anomaly
0,68833,50047,10,0,0,1,1,0,0.113209,1
1,33700,134075,18,1,0,0,1,0,0.103882,1
2,36925,160205,30,0,1,0,1,0,0.027727,1
3,50190,193264,15,1,0,1,1,0,0.089827,1
4,81002,27533,28,0,1,0,0,0,0.047417,1
...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,0,0.093915,1
596,69967,55369,2,1,0,0,1,0,0.083323,1
597,47334,154058,0,0,0,0,1,0,0.066569,1
598,98592,180083,17,1,1,0,0,0,0.060353,1


# Randomforest Model

In [80]:
X = DF_new.iloc[:,0:7]
Y =DF_new.iloc[:,7]

In [93]:
kfold = KFold(n_splits=6, random_state=10)
model = RandomForestClassifier(n_estimators=10, max_features=3)
results = cross_val_score(model, X, Y, cv=kfold)



In [94]:
print(results.mean())

0.9983164983164983


In [97]:
results.mean()*100.0
## Accuracy

99.83164983164983

In [95]:
results.std()*100.0

0.3764424204545084