In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [2]:
# Load the raw fraud-check records from the CSV file (path is relative to the working directory)
fraud_df = pd.read_csv(filepath_or_buffer='Fraud_check.csv')
fraud_df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
# One-hot encode the categorical columns ['Undergrad', 'Marital.Status', 'Urban'].
# drop_first=True omits the first level of each column, so a column with k
# levels yields k-1 indicator columns.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 emits booleans
# by default, which would no longer match the 0/1 values shown in the output.
df = pd.get_dummies(
    fraud_df,
    columns=['Undergrad', 'Marital.Status', 'Urban'],
    drop_first=True,
    dtype=int,
)

In [4]:
# Inspect the encoded frame (the recorded output shows only the first and last rows)
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,68833,50047,10,0,0,1,1
1,33700,134075,18,1,0,0,1
2,36925,160205,30,0,1,0,1
3,50190,193264,15,1,0,1,1
4,81002,27533,28,0,1,0,0
...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1
596,69967,55369,2,1,0,0,1
597,47334,154058,0,0,0,0,1
598,98592,180083,17,1,1,0,0


In [5]:
# Derive the target label from taxable income: income <= 30000 is "Risky",
# anything above is "Good" (pd.cut intervals are right-closed by default).
# Open-ended outer edges replace the hard-coded 10002 / 99620 limits so that
# no income can fall outside the bins and turn into NaN; every value the old
# edges covered receives exactly the same label as before.
df["Tax"] = pd.cut(
    df["Taxable.Income"],
    bins=[-np.inf, 30000, np.inf],
    labels=["Risky", "Good"],
)

In [6]:

# Spot-check the first 20 rows, including the newly added "Tax" label
df.head(n=20)

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,Tax
0,68833,50047,10,0,0,1,1,Good
1,33700,134075,18,1,0,0,1,Good
2,36925,160205,30,0,1,0,1,Good
3,50190,193264,15,1,0,1,1,Good
4,81002,27533,28,0,1,0,0,Good
5,33329,116382,0,0,0,0,0,Good
6,83357,80890,8,0,0,0,1,Good
7,62774,131253,3,1,0,1,1,Good
8,83519,102481,12,0,0,1,1,Good
9,98152,155482,4,1,0,0,1,Good


In [7]:
# Remove every row that contains a missing value, then report the remaining
# (rows, columns) so the effect of the clean-up is visible.
df = df.dropna()
df.shape

(600, 8)

In [8]:
# Separate the predictors (x) from the target (y).
# Columns are selected by name instead of by position: "Taxable.Income" must
# stay out of the features because "Tax" is derived directly from it, and a
# positional slice would silently pick the wrong columns if the layout changed.
x = df.drop(columns=["Taxable.Income", "Tax"])
y = df["Tax"]

In [9]:
# Preview the feature matrix passed to the models
x

Unnamed: 0,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,50047,10,0,0,1,1
1,134075,18,1,0,0,1
2,160205,30,0,1,0,1
3,193264,15,1,0,1,1
4,27533,28,0,1,0,0
...,...,...,...,...,...,...
595,39492,7,1,0,0,1
596,55369,2,1,0,0,1
597,154058,0,0,0,0,1
598,180083,17,1,1,0,0


In [10]:
# Preview the target labels ("Risky" / "Good")
y

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Tax, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=35,
)


### Building Decision Tree Classifier using Entropy Criterion

In [12]:
# Fit an entropy-based decision tree (maximum depth 20) on the training split.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run and
# the metrics computed further down would not be reproducible.
model = DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=35)
model.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=20)

In [13]:
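# Optional: uncomment the next line to draw the fitted tree (left disabled in the original notebook).
# tree.plot_tree(model);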

In [14]:
# Label the held-out rows with the entropy tree, then tally how many rows
# were assigned to each predicted class.
preds = model.predict(x_test)
pred_counts = pd.Series(preds).value_counts()
pred_counts

Good     90
Risky    30
dtype: int64

In [15]:
# Show the raw array of predicted labels for the test rows
preds

array(['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Risky', 'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Risky', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good',
       'Risky', 'Risky', 'Risky', 'Risky', 'Good', 'Good', 'Risky',
       'Risky', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Good', 'Ris

In [16]:
# Confusion table for the entropy tree: actual classes as rows, predictions as columns
pd.crosstab(index=y_test, columns=preds)

col_0,Good,Risky
Tax,Unnamed: 1_level_1,Unnamed: 2_level_1
Risky,19,5
Good,71,25


In [17]:
# Accuracy of the entropy tree: share of test rows whose prediction matches the actual label
is_correct = preds == y_test
np.mean(is_correct)

0.6333333333333333

### Building Decision Tree Classifier (CART) using Gini Criterion

In [18]:
# Decision tree using Gini impurity (maximum depth 20).
# random_state is pinned so repeated runs build the same tree; scikit-learn
# randomly permutes the candidate features at each split otherwise.
model_g = DecisionTreeClassifier(criterion='gini', max_depth=20, random_state=35)

In [19]:
# Train the Gini tree on the same training split used for the entropy tree
model_g.fit(X=x_train, y=y_train)

DecisionTreeClassifier(max_depth=20)

In [20]:
# Label the held-out rows with the Gini tree and cross-tabulate them against
# the actual classes (rows = actual, columns = predicted).
pred_g = model_g.predict(x_test)
pd.crosstab(index=y_test, columns=pred_g)

col_0,Good,Risky
Tax,Unnamed: 1_level_1,Unnamed: 2_level_1
Risky,19,5
Good,71,25


In [21]:
# Accuracy of the Gini tree: share of test rows whose prediction matches the actual label
is_correct_g = pred_g == y_test
np.mean(is_correct_g)

0.6333333333333333