In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import metrics

In [2]:
# Load the fraud-check CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/Fraud_check.csv")

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [4]:
# Create an empty 'Tax_Condition' column that will hold the Good/Risky label.
# Plain column assignment appends the column at the end (the same position
# insert(6, ...) gives on the six-column frame) and, unlike insert(), does not
# raise ValueError when this cell is run a second time.
data["Tax_Condition"] = ''

In [5]:
# Label every record from its taxable income: at or below the 30000 cutoff is
# 'Risky', anything above is 'Good'.
# One vectorised column assignment replaces the per-row chained indexing
# (data['Tax_Condition'][i] = ...). That form writes through a temporary
# Series, triggers SettingWithCopyWarning, does nothing when pandas
# Copy-on-Write is enabled, and only works when the index is exactly 0..n-1.
data['Tax_Condition'] = (data['Taxable.Income'] <= 30000).map(
    {True: 'Risky', False: 'Good'}
)

In [6]:
# Check the first five rows now that the Tax_Condition label has been filled in.
data.head(n=5)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Tax_Condition
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


In [7]:
# Class balance of the derived target: how many rows are Good vs. Risky.
data.loc[:, 'Tax_Condition'].value_counts()

Good     476
Risky    124
Name: Tax_Condition, dtype: int64

In [8]:
# Label encoder used below to turn the string-valued columns into integer codes.
LE = preprocessing.LabelEncoder()

In [9]:
# String-valued columns that are replaced by integer codes.
objlist = [
    'Undergrad',
    'Marital.Status',
    'Urban',
    'Tax_Condition',
]
# apply() invokes the encoder once per column, so the single shared LE object
# is refit each time and ends up fitted on the last column only.
data[objlist] = data[objlist].apply(lambda column: LE.fit_transform(column))

In [10]:
# List the column names after encoding, before choosing the modelling columns.
data.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban', 'Tax_Condition'],
      dtype='object')

In [11]:
# Keep the predictors plus the target. 'Taxable.Income' is left out because
# Tax_Condition was derived directly from it.
data = data.loc[
    :,
    [
        'Undergrad',
        'Marital.Status',
        'City.Population',
        'Work.Experience',
        'Urban',
        'Tax_Condition',
    ],
]

In [12]:
# Preview the encoded modelling frame (first five rows).
data.head(n=5)

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Tax_Condition
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0


In [13]:
# Features are every column except the last; the target is the last column
# (Tax_Condition).
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

In [14]:
# Build a decision tree using the Gini impurity criterion

In [15]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2,
)

In [16]:
# Gini-based decision tree, limited to depth 10 and requiring at least 5
# samples to split a node.
# random_state is fixed because the tree permutes features at every split, so
# an unseeded estimator can produce a different tree (and different scores) on
# each fit.
model = DecisionTreeClassifier(
    criterion='gini',
    min_samples_split=5,
    max_depth=10,
    random_state=2,
)

In [17]:
# Fit the Gini tree and measure accuracy on the training data.
model.fit(X_train, Y_train)
y_pred_train = model.predict(X_train)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
metrics.accuracy_score(Y_train, y_pred_train)

0.8761904761904762

In [18]:
# Predict on the held-out test data and show the predicted class counts.
# The model was already fitted in the training-accuracy cell. Refitting here
# would train a second tree, so the test metrics could describe a different
# model than the training metrics.
y_pred_test = model.predict(X_test)
pd.Series(y_pred_test).value_counts()

0    168
1     12
dtype: int64

In [19]:
# Test-set accuracy of the Gini tree. accuracy_score expects (y_true, y_pred).
metrics.accuracy_score(Y_test, y_pred_test)

0.7333333333333333

In [20]:
# Per-class precision, recall and F1 for the Gini tree on the test set.
print(classification_report(y_true=Y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.78      0.92      0.85       142
           1       0.08      0.03      0.04        38

    accuracy                           0.73       180
   macro avg       0.43      0.47      0.44       180
weighted avg       0.63      0.73      0.68       180



In [21]:
# Build a decision tree using the entropy (information gain) criterion

In [22]:
# Entropy-based decision tree with no depth limit, requiring at least 5
# samples to split a node.
# random_state is fixed because the tree permutes features at every split, so
# an unseeded estimator can produce a different tree (and different scores) on
# each fit.
model1 = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=5,
    random_state=2,
)

In [23]:
# Fit the entropy tree and measure accuracy on the training data.
model1.fit(X_train, Y_train)
y_pred_train1 = model1.predict(X_train)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
metrics.accuracy_score(Y_train, y_pred_train1)

0.9380952380952381

In [24]:
# Measure accuracy of the entropy tree on the held-out test data.
# model1 was already fitted in the training-accuracy cell. Refitting here
# would train a second tree, so the test metrics could describe a different
# model than the training metrics.
y_pred_test1 = model1.predict(X_test)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
metrics.accuracy_score(Y_test, y_pred_test1)

0.6388888888888888

In [32]:
# How many test rows the entropy tree assigned to each class.
pd.Series(data=y_pred_test1).value_counts()

0    137
1     43
dtype: int64

In [25]:
# Per-class precision, recall and F1 for the entropy tree on the test set.
print(classification_report(y_true=Y_test, y_pred=y_pred_test1))

              precision    recall  f1-score   support

           0       0.78      0.75      0.77       142
           1       0.19      0.21      0.20        38

    accuracy                           0.64       180
   macro avg       0.48      0.48      0.48       180
weighted avg       0.66      0.64      0.65       180



In [26]:
# Bagging: an ensemble of estimators, each trained on a random subsample of the training data

In [27]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Recreate the 70/30 split with the same seed used for the single trees.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2,
)

# Bagging ensemble: 100 estimators, each fitted on 80% of the training rows;
# the seed makes the ensemble reproducible.
num_trees = 100
model2 = BaggingClassifier(
    n_estimators=num_trees,
    max_samples=0.8,
    random_state=8,
)



In [28]:
# Fit the bagging ensemble and measure accuracy on the training data.
model2.fit(X_train, Y_train)
y_pred_train2 = model2.predict(X_train)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
metrics.accuracy_score(Y_train, y_pred_train2)

0.9952380952380953

In [30]:
# Measure accuracy of the bagging ensemble on the held-out test data.
# model2 was already fitted in the training-accuracy cell, so it is reused
# here rather than training all 100 estimators a second time.
y_pred_test2 = model2.predict(X_test)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
metrics.accuracy_score(Y_test, y_pred_test2)


0.7555555555555555

In [33]:
# How many test rows the bagging ensemble assigned to each class.
pd.Series(data=y_pred_test2).value_counts()

0    170
1     10
dtype: int64

In [31]:
# Per-class precision, recall and F1 for the bagging ensemble on the test set.
print(classification_report(y_true=Y_test, y_pred=y_pred_test2))

              precision    recall  f1-score   support

           0       0.79      0.94      0.86       142
           1       0.20      0.05      0.08        38

    accuracy                           0.76       180
   macro avg       0.49      0.50      0.47       180
weighted avg       0.66      0.76      0.70       180

