### 1. Import libraries 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

### 2.Loading Dataset

In [3]:
fraud_data = pd.read_csv('Fraud_check.csv')
fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


### 3. Data understanding

In [4]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [5]:
##Converting the Taxable income variable to bucketing.
fraud_data["income"]="<=30000"
fraud_data.loc[fraud_data["Taxable.Income"]>=30000,"income"]="Good"
fraud_data.loc[fraud_data["Taxable.Income"]<=30000,"income"]="Risky"

In [6]:
##Droping the Taxable income variable
fraud_data.drop(["Taxable.Income"],axis=1,inplace=True)

In [7]:
fraud_data.head(50)

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,income
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good
5,NO,Divorced,116382,0,NO,Good
6,NO,Divorced,80890,8,YES,Good
7,YES,Single,131253,3,YES,Good
8,NO,Single,102481,12,YES,Good
9,YES,Divorced,155482,4,YES,Good


### 4. Data preprocessing

In [8]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for column_name in fraud_data.columns:
    if fraud_data[column_name].dtype == object:
        fraud_data[column_name] = le.fit_transform(fraud_data[column_name])
    else:
        pass

In [9]:
##Splitting the data into featuers and labels
x = fraud_data.iloc[:,0:5]
y = fraud_data.iloc[:,5]

In [10]:
## Collecting the column names
colnames = list(fraud_data.columns)
predictors = colnames[0:5]
target = colnames[5]

In [13]:
##Splitting the data into train and test

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,stratify = y)

In [14]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((480, 5), (120, 5), (480,), (120,))

###  5. Model Building

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_jobs=3,n_estimators=15,oob_score=True,criterion='entropy') 
rf_model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=15, n_jobs=3,
                       oob_score=True)

In [16]:
rf_model.estimators_

[DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=871257870),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=930301530),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=418730774),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=541016205),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1084625176),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1258122501),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1789001215),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1811106299),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
          

In [17]:
rf_model.classes_

array([0, 1])

In [18]:
rf_model.n_features_

5

In [19]:
rf_model.n_classes_

2

In [20]:
rf_model.n_outputs_

1

In [21]:
rf_model.oob_score_

0.7375

### 6.Model Training

In [22]:
y_pred_train = rf_model.predict(x_train)

###### Accuracy & Confusion Matrix

In [23]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [28]:
#accuracy
train_accuracy=accuracy_score(y_train,y_pred_train)
train_accuracy

0.9854166666666667

In [29]:
#confusion matrix
train_conf_matrix=confusion_matrix(y_train,y_pred_train)
train_conf_matrix

array([[381,   0],
       [  7,  92]], dtype=int64)

### 7. Model Testing

In [30]:
y_pred_test = rf_model.predict(x_test)

###### Accuracy & Confusion Matrix

In [31]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [32]:
#accuracy
test_accuracy=accuracy_score(y_test,y_pred_test)
test_accuracy

0.7

In [34]:
#confusion matrix
test_conf_matrix=confusion_matrix(y_test,y_pred_test)
test_conf_matrix

array([[84, 11],
       [25,  0]], dtype=int64)