In [1]:
import pandas as pd 
import numpy as np
import matplotlib as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

### Reading data set

In [2]:
data=pd.read_csv('Fraud_check.csv')
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


### Getting dummies of columns

In [4]:
data2=pd.get_dummies(data,columns=['Undergrad','Urban'])
data2.head()

Unnamed: 0,Marital.Status,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Urban_NO,Urban_YES
0,Single,68833,50047,10,1,0,0,1
1,Divorced,33700,134075,18,0,1,0,1
2,Married,36925,160205,30,1,0,0,1
3,Single,50190,193264,15,0,1,0,1
4,Married,81002,27533,28,1,0,1,0


In [5]:
data3=pd.get_dummies(data2,columns=['Marital.Status'])
data3.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Urban_NO,Urban_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single
0,68833,50047,10,1,0,0,1,0,0,1
1,33700,134075,18,0,1,0,1,1,0,0
2,36925,160205,30,1,0,0,1,0,1,0
3,50190,193264,15,0,1,0,1,0,0,1
4,81002,27533,28,1,0,1,0,0,1,0


### Setting if income <= 3000 : Risky,if income>3000 : Good

In [6]:
def f(income):
    if income<=3000:
        return 'Risky'
    elif income>3000:
        return 'Good'
    else:
        pass

In [7]:
d=data3['Taxable.Income'].apply(f)

In [8]:
data3['Taxable.Income']=d

In [9]:
income_df=data3
income_df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Urban_NO,Urban_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single
0,Good,50047,10,1,0,0,1,0,0,1
1,Good,134075,18,0,1,0,1,1,0,0
2,Good,160205,30,1,0,0,1,0,1,0
3,Good,193264,15,0,1,0,1,0,0,1
4,Good,27533,28,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,Good,39492,7,0,1,0,1,1,0,0
596,Good,55369,2,0,1,0,1,1,0,0
597,Good,154058,0,1,0,0,1,1,0,0
598,Good,180083,17,0,1,1,0,0,1,0


In [10]:
data4=pd.get_dummies(data3,columns=['Taxable.Income'])
data4.head()

Unnamed: 0,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Urban_NO,Urban_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Taxable.Income_Good
0,50047,10,1,0,0,1,0,0,1,1
1,134075,18,0,1,0,1,1,0,0,1
2,160205,30,1,0,0,1,0,1,0,1
3,193264,15,0,1,0,1,0,0,1,1
4,27533,28,1,0,1,0,0,1,0,1


In [11]:
x=data4.iloc[:,0:7]
y=data4.iloc[:,7]

### Splitting into training and testing data

In [12]:
x_train, x_test,y_train,y_test = train_test_split(x,y, test_size=0.3,random_state=42)

In [13]:
print(x_train.shape,y_train.shape, x_test.shape,y_test.shape)

(420, 7) (420,) (180, 7) (180,)


# Random forest

In [47]:
num_trees = 100
max_features = 3
kfold = KFold(n_splits=11)
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
model.fit(x_train , y_train)

RandomForestClassifier(max_features=3)

In [48]:

scores=cross_val_score(model,x_train,y_train,cv=kfold)
print("Mean cross-validation score: {:.2f}".format(np.mean(scores)))
print("Standard deviation: {:.2f}".format(np.std(scores)))

Mean cross-validation score: 0.67
Standard deviation: 0.06


In [49]:
model.fit(x_train,y_train)

RandomForestClassifier(max_features=3)

In [50]:
preds = model.predict(x_test) 
pd.Series(preds).value_counts() 

0    134
1     46
dtype: int64

In [51]:
pd.crosstab(y_test,preds)

col_0,0,1
Marital.Status_Married,Unnamed: 1_level_1,Unnamed: 2_level_1
0,95,25
1,39,21


In [52]:
np.mean(preds==y_test)

0.6444444444444445