### Fraud Check Random Forest

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("Fraud_check.csv")
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
df.shape

(600, 6)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [5]:
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [8]:
#Checking CoRelation  Pearson
df.corr()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
Taxable.Income,1.0,-0.064387,-0.001818
City.Population,-0.064387,1.0,0.013135
Work.Experience,-0.001818,0.013135,1.0


In [9]:
df[df.corr()>80].sum() # No High Correlation Found between Features

Undergrad            0
Marital.Status       0
Taxable.Income     0.0
City.Population    0.0
Work.Experience    0.0
Urban                0
dtype: object

In [10]:
#Checking CoRelation using Spearman Rank 
df.corr(method ='spearman')

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
Taxable.Income,1.0,-0.065913,-0.004529
City.Population,-0.065913,1.0,0.00916
Work.Experience,-0.004529,0.00916,1.0


In [11]:
df[df.corr(method ='spearman')>80].sum() # No Severe correation Found

Undergrad            0
Marital.Status       0
Taxable.Income     0.0
City.Population    0.0
Work.Experience    0.0
Urban                0
dtype: object

In [12]:
df["Taxable.Income"].min()

10003

In [13]:
df["Taxable.Income"].max()

99619

In [14]:
# Converting Taget columns from Regression To Categorical as mentioned
bins=[10003,30000,99619]

taxable_conversion=["Risky","Good"]

df["Cat_Taxable_inc"]=pd.cut(df["Taxable.Income"],bins,labels=taxable_conversion)
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Cat_Taxable_inc
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
...,...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES,Good
596,YES,Divorced,69967,55369,2,YES,Good
597,NO,Divorced,47334,154058,0,YES,Good
598,YES,Married,98592,180083,17,NO,Good


In [15]:
df=df.drop("Taxable.Income",axis=1)

In [16]:
df

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Cat_Taxable_inc
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good
...,...,...,...,...,...,...
595,YES,Divorced,39492,7,YES,Good
596,YES,Divorced,55369,2,YES,Good
597,NO,Divorced,154058,0,YES,Good
598,YES,Married,180083,17,NO,Good


In [17]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,LabelEncoder
oe = OrdinalEncoder()
df.select_dtypes(object).columns
obj_enc=df.select_dtypes(object).columns
obj_enc


Index(['Undergrad', 'Marital.Status', 'Urban'], dtype='object')

In [18]:

oe=OrdinalEncoder()
df[obj_enc]=oe.fit_transform(df[obj_enc])
df


Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Cat_Taxable_inc
0,0.0,2.0,50047,10,1.0,Good
1,1.0,0.0,134075,18,1.0,Good
2,0.0,1.0,160205,30,1.0,Good
3,1.0,2.0,193264,15,1.0,Good
4,0.0,1.0,27533,28,0.0,Good
...,...,...,...,...,...,...
595,1.0,0.0,39492,7,1.0,Good
596,1.0,0.0,55369,2,1.0,Good
597,0.0,0.0,154058,0,1.0,Good
598,1.0,1.0,180083,17,0.0,Good


In [19]:
le=LabelEncoder()
df["Cat_Taxable_inc"]=le.fit_transform(df["Cat_Taxable_inc"])

In [20]:
df

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Cat_Taxable_inc
0,0.0,2.0,50047,10,1.0,0
1,1.0,0.0,134075,18,1.0,0
2,0.0,1.0,160205,30,1.0,0
3,1.0,2.0,193264,15,1.0,0
4,0.0,1.0,27533,28,0.0,0
...,...,...,...,...,...,...
595,1.0,0.0,39492,7,1.0,0
596,1.0,0.0,55369,2,1.0,0
597,0.0,0.0,154058,0,1.0,0
598,1.0,1.0,180083,17,0.0,0


In [21]:
from sklearn.ensemble import IsolationForest
iso=IsolationForest(random_state=10,contamination=0.03)
out=iso.fit_predict(df)



In [22]:
df["outliers"]=out
df["outliers"].value_counts()

 1    582
-1     18
Name: outliers, dtype: int64

In [23]:
#Dropping The ouliers and selecting the normal data
df=df[df["outliers"]== 1]
df

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Cat_Taxable_inc,outliers
0,0.0,2.0,50047,10,1.0,0,1
1,1.0,0.0,134075,18,1.0,0,1
2,0.0,1.0,160205,30,1.0,0,1
3,1.0,2.0,193264,15,1.0,0,1
4,0.0,1.0,27533,28,0.0,0,1
...,...,...,...,...,...,...,...
595,1.0,0.0,39492,7,1.0,0,1
596,1.0,0.0,55369,2,1.0,0,1
597,0.0,0.0,154058,0,1.0,0,1
598,1.0,1.0,180083,17,0.0,0,1


In [24]:
df=df.drop(["outliers"],axis=1)
df

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Cat_Taxable_inc
0,0.0,2.0,50047,10,1.0,0
1,1.0,0.0,134075,18,1.0,0
2,0.0,1.0,160205,30,1.0,0
3,1.0,2.0,193264,15,1.0,0
4,0.0,1.0,27533,28,0.0,0
...,...,...,...,...,...,...
595,1.0,0.0,39492,7,1.0,0
596,1.0,0.0,55369,2,1.0,0
597,0.0,0.0,154058,0,1.0,0
598,1.0,1.0,180083,17,0.0,0


In [25]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [26]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30, random_state=42)
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape


((407, 5), (175, 5), (407,), (175,))

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

rfc=RandomForestClassifier()
rfc.fit(xtrain,ytrain)
ypred = rfc.predict(xtest)
  
    
print(classification_report(ytest,ypred))
print(rfc.score(xtrain,ytrain))
print(rfc.score(xtest,ytest))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88       142
           1       0.00      0.00      0.00        33

    accuracy                           0.78       175
   macro avg       0.40      0.48      0.44       175
weighted avg       0.65      0.78      0.71       175

1.0
0.7828571428571428


In [28]:
print(confusion_matrix(ytest,ypred))

[[137   5]
 [ 33   0]]


In [29]:
param_grid = {
 'n_estimators': [25, 50, 100, 150,200,250,300,350],
 'max_features': ['sqrt', 'log2', None],
 'max_depth': [3, 6, 9],
 'max_leaf_nodes': [3, 6, 9],
}


In [30]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(RandomForestClassifier(),
param_grid=param_grid,cv=10)

grid_search.fit(xtrain,ytrain)
print(grid_search.best_estimator_)



RandomForestClassifier(max_depth=3, max_features='sqrt', max_leaf_nodes=3,
                       n_estimators=25)


In [31]:
rfc=RandomForestClassifier(max_depth=9, max_features=None, max_leaf_nodes=6,
                       n_estimators=25)
rfc.fit(xtrain,ytrain)
ypred = rfc.predict(xtest)
  
    
print(classification_report(ytest,ypred))
print(rfc.score(xtrain,ytrain))
print(rfc.score(xtest,ytest))

              precision    recall  f1-score   support

           0       0.81      1.00      0.90       142
           1       0.00      0.00      0.00        33

    accuracy                           0.81       175
   macro avg       0.41      0.50      0.45       175
weighted avg       0.66      0.81      0.73       175

0.8206388206388207
0.8114285714285714


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
