## Use Random Forest to prepare a model on fraud data
## Records with Taxable.Income <= 30000 are treated as "Risky"; all others are treated as "Good"

In [1]:
# import the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
# Read the data.
# The path is a raw string so the Windows backslashes stay literal: in a normal
# string "\A" and "\F" are invalid escape sequences, which newer Python versions
# warn about. The resulting path value is unchanged.
# NOTE(review): this is an absolute local path; the notebook will only run on a
# machine that has the file at this location.
data = pd.read_csv(r"D:\Assignment\Assignments-15\Fraud_check (1).csv")
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [4]:
# Label-encode the text-valued columns so each category becomes an integer code
cols = ['Undergrad', 'Marital.Status', 'Urban']

# Fit and apply the encoder one column at a time, overwriting the original values
encoder = LabelEncoder()
for col in cols:
    data[col] = encoder.fit_transform(data[col])

data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
count,600.0,600.0,600.0,600.0,600.0,600.0
mean,0.52,1.046667,55208.375,108747.368333,15.558333,0.503333
std,0.500017,0.821958,26204.827597,49850.075134,8.842147,0.500406
min,0.0,0.0,10003.0,25779.0,0.0,0.0
25%,0.0,0.0,32871.5,66966.75,8.0,0.0
50%,1.0,1.0,55074.5,106493.5,15.0,1.0
75%,1.0,2.0,78611.75,150114.25,24.0,1.0
max,1.0,2.0,99619.0,199778.0,30.0,1.0


In [6]:
# Derive the target label from Taxable.Income: incomes at or below 30000 are
# labelled "Risky", everything else "Good".
# np.where evaluates the whole column at once instead of looping row by row;
# .tolist() keeps TI_cat a plain list of strings, as before.
TI_cat = np.where(data['Taxable.Income'] <= 30000, 'Risky', 'Good').tolist()
data['TI_cat'] = TI_cat
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,TI_cat
0,0,2,68833,50047,10,1,Good
1,1,0,33700,134075,18,1,Good
2,0,1,36925,160205,30,1,Good
3,1,2,50190,193264,15,1,Good
4,0,1,81002,27533,28,0,Good


In [7]:
# Build the feature matrix X: every column except the raw income and the label derived from it
X = data.drop(columns=['Taxable.Income', 'TI_cat'])
X.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,0,2,50047,10,1
1,1,0,134075,18,1
2,0,1,160205,30,1
3,1,2,193264,15,1
4,0,1,27533,28,0


In [8]:
# Target vector y: the Good/Risky label stored in the TI_cat column
y = data['TI_cat']
y.head()

0    Good
1    Good
2    Good
3    Good
4    Good
Name: TI_cat, dtype: object

In [9]:
# Distinct class labels present in the target
y.unique()

array(['Good', 'Risky'], dtype=object)

In [10]:
# Number of rows per class in the target
y.value_counts()

Good     476
Risky    124
Name: TI_cat, dtype: int64

In [11]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no stratify= is passed although the classes are imbalanced
# (476 Good vs 124 Risky) — consider stratify=y so both splits keep that ratio.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20,random_state = 1)

## Building a Random Forest Classifier model

In [12]:
# Random forest of 100 trees using the entropy split criterion, with up to 3
# features considered at each split; random_state=1 makes the fit repeatable.
# The model is trained on the training split only.
model = RandomForestClassifier(n_estimators=100, max_features=3, criterion='entropy', random_state=1)
model.fit(X_train,y_train)

RandomForestClassifier(criterion='entropy', max_features=3, random_state=1)

In [13]:
# Predict labels for the held-out test rows and count how many fall in each predicted class
pred = model.predict(X_test)
pd.Series(pred).value_counts()

Good     116
Risky      4
dtype: int64

In [14]:
# Confusion matrix: actual labels (rows) against predicted labels (columns).
# This evaluates a single hold-out split; it is not cross-validation.
pd.crosstab(y_test,pred)

col_0,Good,Risky
TI_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
Good,93,4
Risky,23,0


In [15]:
# Accuracy: fraction of test rows whose predicted label matches the actual label
np.mean(pred == y_test)

0.775

In [16]:
# Importance score the fitted forest assigns to each input feature, in the column order of the training data
model.feature_importances_

array([0.0467966 , 0.08258957, 0.51951769, 0.30409506, 0.04700108])

In [17]:
# Pair each importance score with its feature name and rank them, highest first.
# The names are read from the training frame the model was fitted on, so the
# labels stay aligned with feature_importances_ even if the columns change.
fn = list(X_train.columns)
F_RN = pd.Series(model.feature_importances_, index=fn).sort_values(ascending=False)
F_RN

City.Population    0.519518
Work.Experience    0.304095
Marital.Status     0.082590
Urban              0.047001
Undergrad          0.046797
dtype: float64

## Selecting the best features of the created model using the feature selection algorithm SelectFromModel

In [19]:
# Wrap the random forest in SelectFromModel and fit the selector on the training data.
# No threshold is given, so scikit-learn's default cut-off applies
# (the mean feature importance, per its documentation — confirm).
model_sel = SelectFromModel(model)
model_sel.fit(X_train,y_train)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 max_features=3,
                                                 random_state=1))

In [20]:
# Boolean mask over the input columns: True where the selector keeps the feature
model_sel.get_support()

array([False, False,  True,  True, False])

In [21]:
# Collect the names of the features the selector kept, then count them
support_mask = model_sel.get_support()
selected_feat = X_train.columns[support_mask]
len(selected_feat)

2

In [22]:
# Print the names of the features kept by the feature selection model
print(selected_feat)

Index(['City.Population', 'Work.Experience'], dtype='object')


## Conclusion:

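## As per the above Random Forest and feature selection algorithms, the following are the key features for determining the fraud:
## - City.Population
## - Work.Experience

Note: on the test set the model classified none of the 23 "Risky" records correctly, and its accuracy of 0.775 is below the 0.808 obtained by always predicting "Good" (97 of 120 rows), so these feature importances should be treated as tentative.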