In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("C:\DataScience\Assignment\Random Forest\Fraud_check.csv")
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


## Data Insights

In [3]:
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [5]:
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

## Label Encoder

In [6]:
le = LabelEncoder()

df['Undergrad'] = le.fit_transform(df['Undergrad'])
df['Marital.Status'] = le.fit_transform(df['Marital.Status'])
df['Urban'] = le.fit_transform(df['Urban'])

df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0
...,...,...,...,...,...,...
595,1,0,76340,39492,7,1
596,1,0,69967,55369,2,1
597,0,0,47334,154058,0,1
598,1,1,98592,180083,17,0


## Target Column

In [7]:
Fraud = []
for value in df['Taxable.Income']:
    if value < 30000:
        Fraud.append('Risky')
    else:
        Fraud.append('Good')

df['Status'] = Fraud

## Encoding the target column

In [8]:
df['Status'] = le.fit_transform(df['Status'])

In [9]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Status
0,0,2,68833,50047,10,1,0
1,1,0,33700,134075,18,1,0
2,0,1,36925,160205,30,1,0
3,1,2,50190,193264,15,1,0
4,0,1,81002,27533,28,0,0
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,0
596,1,0,69967,55369,2,1,0
597,0,0,47334,154058,0,1,0
598,1,1,98592,180083,17,0,0


In [10]:
x = df.drop(columns=['Status', 'Taxable.Income'])
y = df.Status

## Bagged Decision Trees for Classification

In [11]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

In [12]:
kfold1 = KFold(n_splits=10, shuffle = True, random_state = 8)
model1 = BaggingClassifier(base_estimator= DecisionTreeClassifier(), n_estimators = 300, random_state = 8)

In [13]:
results = cross_val_score(model1, x,y, cv = kfold1)
print(results.mean()) 

0.7416666666666667


In [14]:
model1.fit(x, y)
feature_importances = np.mean([
    tree.feature_importances_ for tree in model1.estimators_
], axis=0)
feature_importances

array([0.04852903, 0.08534138, 0.56780812, 0.24377001, 0.05455147])

In [15]:
data1 = { 'Feature' : ['Undergrad', 'Marital.Status', 'City.Population', 'Work.Experience',
       'Urban'],
        'Importance' : [0.04852903, 0.08534138, 0.56780812, 0.24377001, 0.05455147]
}
pd.DataFrame(data1)

Unnamed: 0,Feature,Importance
0,Undergrad,0.048529
1,Marital.Status,0.085341
2,City.Population,0.567808
3,Work.Experience,0.24377
4,Urban,0.054551


## Random Forest Classification

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
kfold2 = KFold(n_splits=10, shuffle = True, random_state = 8)
model2 = RandomForestClassifier(max_features=5, n_estimators = 300)

In [18]:
results = cross_val_score(model2, x,y, cv = kfold2)
print(results.mean()) 

0.74


In [19]:
model2.fit(x, y)
model2.feature_importances_

array([0.05048852, 0.08466661, 0.5693907 , 0.24454463, 0.05090955])

In [20]:
data2 = { 'Feature' : ['Undergrad', 'Marital.Status', 'City.Population', 'Work.Experience',
       'Urban'],
        'Importance' : [0.05158477, 0.08371616, 0.57232822, 0.24211369, 0.05025715]
}
pd.DataFrame(data2)

Unnamed: 0,Feature,Importance
0,Undergrad,0.051585
1,Marital.Status,0.083716
2,City.Population,0.572328
3,Work.Experience,0.242114
4,Urban,0.050257


## Boost Classification

In [21]:
from sklearn.ensemble import AdaBoostClassifier

In [22]:
kfold3 = KFold(n_splits=10, shuffle = True, random_state = 8)
model3 = AdaBoostClassifier(n_estimators = 300)

In [23]:
results = cross_val_score(model3, x,y, cv = kfold3)
print(results.mean()) 

0.75


In [24]:
model3.fit(x, y)
model3.feature_importances_

array([0.00333333, 0.01      , 0.88333333, 0.10333333, 0.        ])

In [25]:
data3 = { 'Feature' : ['Undergrad', 'Marital.Status', 'City.Population', 'Work.Experience',
       'Urban'],
        'Importance' :[0.00333333, 0.01      , 0.88333333, 0.10333333, 0.        ]
}
pd.DataFrame(data3)

Unnamed: 0,Feature,Importance
0,Undergrad,0.003333
1,Marital.Status,0.01
2,City.Population,0.883333
3,Work.Experience,0.103333
4,Urban,0.0


## Stacking Classification

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [27]:
kfold4 = KFold(n_splits = 10,shuffle = True, random_state = 8)
estimators = []

### creating the sub models

In [28]:
sub1 = LogisticRegression()
estimators.append(('logistic', sub1))

In [29]:
sub2 = SVC()
estimators.append(('SVC', sub2))

In [30]:
sub3 = DecisionTreeClassifier()
estimators.append(('DT', sub3))

### Creating the ensemble model

In [31]:
ensemble = VotingClassifier(estimators)

In [32]:
results = cross_val_score(ensemble, x, y, cv = kfold4)
print(results.mean())

0.7933333333333333


### Conclusion:

a) Model Accuracy for Decision Tree Classifier : 0.7416

b) Model Accuracy for Random Forest Classifier : 0.7366

c) Model Accuracy for Ada Boost Classifier : 0.75

d) Model Accuracy for Stacking Ensemble : 0.7933

City.Population column is the most important feature according to RF model

## End