## Random Forest Classification

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import KFold,train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, datasets
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('Fraud_check1.csv')

In [3]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [4]:
label_encoder = preprocessing.LabelEncoder()
df["Undergrad"] = label_encoder.fit_transform(df["Undergrad"])
df["Marital.Status"] = label_encoder.fit_transform(df["Marital.Status"])
df["Urban"] = label_encoder.fit_transform(df["Urban"])

In [5]:
# Risky=0  , Good=1
df['Taxable.Income'] = df['Taxable.Income'].apply(lambda x: 'Risky' if x>30000  else 'Good')
df['Taxable.Income'] = df['Taxable.Income'].apply(lambda x: 0 if x== 'Risky'  else 1)
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,0,50047,10,1
1,1,0,0,134075,18,1
2,0,1,0,160205,30,1
3,1,2,0,193264,15,1
4,0,1,0,27533,28,0
...,...,...,...,...,...,...
595,1,0,0,39492,7,1
596,1,0,0,55369,2,1
597,0,0,0,154058,0,1
598,1,1,0,180083,17,0


In [6]:
x = df.drop('Taxable.Income',
               axis = 1)
y = df['Taxable.Income']

In [7]:
x

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,0,2,50047,10,1
1,1,0,134075,18,1
2,0,1,160205,30,1
3,1,2,193264,15,1
4,0,1,27533,28,0
...,...,...,...,...,...
595,1,0,39492,7,1
596,1,0,55369,2,1
597,0,0,154058,0,1
598,1,1,180083,17,0


In [8]:
y

0      0
1      0
2      0
3      0
4      0
      ..
595    0
596    0
597    0
598    0
599    0
Name: Taxable.Income, Length: 600, dtype: int64

In [9]:
# Splitting data into training and testing data set
x_train, x_test,y_train,y_test = train_test_split(x,y, test_size=0.3,random_state=40)

## Standardize the Data

In [10]:
scaler = StandardScaler()

In [11]:
col = x_train.columns
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [12]:
x_train = pd.DataFrame(x_train, columns=col)
x_test = pd.DataFrame(x_test, columns=col)

In [13]:
x_train.shape

(420, 5)

In [14]:
x_test.shape

(180, 5)

## Random Forest Classifier

In [15]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=30, max_features=3)
rf_model.fit(x_train,y_train)

RandomForestClassifier(max_depth=30, max_features=3)

In [16]:
pred_test = rf_model.predict(x_test)

In [17]:
rep = classification_report(y_test,pred_test)
print(rep)

              precision    recall  f1-score   support

           0       0.76      0.93      0.84       140
           1       0.00      0.00      0.00        40

    accuracy                           0.72       180
   macro avg       0.38      0.46      0.42       180
weighted avg       0.59      0.72      0.65       180



In [18]:
rf_model_actual_predict = pd.DataFrame({'Actual': y_test, 'Predict': pred_test})

In [19]:
rf_model_actual_predict.head()

Unnamed: 0,Actual,Predict
159,0,0
307,0,0
253,0,0
528,0,0
137,1,0


In [20]:
rf_model.score(x_test, y_test)*100

72.22222222222221

## Bagging Classifier

In [21]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

cart = DecisionTreeClassifier(max_depth=10)
model = BaggingClassifier(base_estimator=cart, n_estimators=100, random_state=7)

In [22]:
model.fit(x_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=10),
                  n_estimators=100, random_state=7)

In [23]:
bagg_pred_test = model.predict(x_test)

In [24]:
rep = classification_report(y_test,bagg_pred_test)
print(rep)

              precision    recall  f1-score   support

           0       0.77      0.96      0.86       140
           1       0.00      0.00      0.00        40

    accuracy                           0.75       180
   macro avg       0.39      0.48      0.43       180
weighted avg       0.60      0.75      0.67       180



## ADABOOST Classifier

In [25]:
from sklearn.ensemble import AdaBoostClassifier
ada_model = AdaBoostClassifier(n_estimators=10, random_state=7)
ada_model.fit(x_train,y_train)
ada_pred_test = ada_model.predict(x_test)

In [26]:
rep = classification_report(y_test,ada_pred_test)
print(rep)

              precision    recall  f1-score   support

           0       0.79      0.99      0.88       140
           1       0.67      0.05      0.09        40

    accuracy                           0.78       180
   macro avg       0.73      0.52      0.48       180
weighted avg       0.76      0.78      0.70       180



## Stacking Classifier

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# create the sub models
estimators = []

model1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model1))

model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

model3 = SVC()
estimators.append(('svm', model3))

In [28]:
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('cart', DecisionTreeClassifier()),
 ('svm', SVC())]

In [29]:
# create the ensemble model
ensemble = VotingClassifier(estimators)

ensemble.fit(x_train,y_train)
stack_pred_test = ensemble.predict(x_test)

In [30]:
rep = classification_report(y_test,stack_pred_test)
print(rep)

              precision    recall  f1-score   support

           0       0.78      1.00      0.88       140
           1       0.00      0.00      0.00        40

    accuracy                           0.78       180
   macro avg       0.39      0.50      0.44       180
weighted avg       0.60      0.78      0.68       180



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
