# Ensemple models


In [1]:
pip install ucimlrepo




In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
# print(adult.metadata)

# variable information
print(adult.variables)



              name     role         type      demographic  \
0              age  Feature      Integer              Age   
1        workclass  Feature  Categorical           Income   
2           fnlwgt  Feature      Integer             None   
3        education  Feature  Categorical  Education Level   
4    education-num  Feature      Integer  Education Level   
5   marital-status  Feature  Categorical            Other   
6       occupation  Feature  Categorical            Other   
7     relationship  Feature  Categorical            Other   
8             race  Feature  Categorical             Race   
9              sex  Feature       Binary              Sex   
10    capital-gain  Feature      Integer             None   
11    capital-loss  Feature      Integer             None   
12  hours-per-week  Feature      Integer             None   
13  native-country  Feature  Categorical            Other   
14          income   Target       Binary           Income   

                       

In [3]:
X.isna().sum()

Unnamed: 0,0
age,0
workclass,963
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,966
relationship,0
race,0
sex,0


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


## Data cleaning

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

In [6]:
df = pd.concat([X, y], axis=1)

In [7]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [8]:
df.duplicated().sum()

29

In [9]:
df.drop_duplicates(inplace = True)

In [10]:
len(df['native-country'].unique())

43

In [11]:
df.drop(columns = ['fnlwgt', 'education', 'native-country', 'capital-loss', 'capital-gain'], inplace = True)

In [12]:
df

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,<=50K
...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,13,Divorced,Prof-specialty,Not-in-family,White,Female,36,<=50K.
48838,64,,9,Widowed,,Other-relative,Black,Male,40,<=50K.
48839,38,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,50,<=50K.
48840,44,Private,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,40,<=50K.


In [13]:
df.isna().sum().sum()

1929

In [14]:
df.ffill(inplace=True)

In [15]:
df.isna().sum().sum()

0

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48813 entries, 0 to 48841
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48813 non-null  int64 
 1   workclass       48813 non-null  object
 2   education-num   48813 non-null  int64 
 3   marital-status  48813 non-null  object
 4   occupation      48813 non-null  object
 5   relationship    48813 non-null  object
 6   race            48813 non-null  object
 7   sex             48813 non-null  object
 8   hours-per-week  48813 non-null  int64 
 9   income          48813 non-null  object
dtypes: int64(3), object(7)
memory usage: 4.1+ MB


## Data prep.

In [17]:
df['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24698
<=50K.,12430
>50K,7839
>50K.,3846


In [18]:
df['income'].replace({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['income'].replace({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}, inplace=True)
  df['income'].replace({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}, inplace=True)


Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,0
...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,13,Divorced,Prof-specialty,Not-in-family,White,Female,36,0
48838,64,Private,9,Widowed,Prof-specialty,Other-relative,Black,Male,40,0
48839,38,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,50,0
48840,44,Private,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,40,0


In [19]:
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int8)
scaler = StandardScaler()

ct = ColumnTransformer([
    ('o_enc', ohe, ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex']),
    ('standard', scaler, ['age', 'hours-per-week'])
], remainder='passthrough').set_output(transform='pandas')

first_opt = ct.fit_transform(df)

In [20]:
first_opt

Unnamed: 0,o_enc__workclass_Federal-gov,o_enc__workclass_Local-gov,o_enc__workclass_Never-worked,o_enc__workclass_Private,o_enc__workclass_Self-emp-inc,o_enc__workclass_Self-emp-not-inc,o_enc__workclass_State-gov,o_enc__workclass_Without-pay,o_enc__marital-status_Married-AF-spouse,o_enc__marital-status_Married-civ-spouse,...,o_enc__relationship_Wife,o_enc__race_Asian-Pac-Islander,o_enc__race_Black,o_enc__race_Other,o_enc__race_White,o_enc__sex_Male,standard__age,standard__hours-per-week,remainder__education-num,remainder__income
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,0.025724,-0.034304,13,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,1,0.828125,-2.213335,13,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,-0.047221,-0.034304,9,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,1,1.046961,-0.034304,7,0
4,0,0,0,1,0,0,0,0,0,1,...,1,0,1,0,0,0,-0.776676,-0.034304,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0.025724,-0.357123,13,0
48838,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,1.849362,-0.034304,9,0
48839,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,1,-0.047221,0.772745,13,0
48840,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0.390452,-0.034304,13,0


In [21]:
df

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,0
...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,13,Divorced,Prof-specialty,Not-in-family,White,Female,36,0
48838,64,Private,9,Widowed,Prof-specialty,Other-relative,Black,Male,40,0
48839,38,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,50,0
48840,44,Private,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,40,0


In [22]:
df['marital-status'].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [23]:
racism = [['Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
          ['Black', 'Other', 'Asian-Pac-Islander','Amer-Indian-Eskimo', 'White']]
orde = OrdinalEncoder(categories=racism, dtype=np.int8)

ct = ColumnTransformer([
    ('orde', orde, ['marital-status', 'race']),
    ('o_enc', ohe, ['workclass', 'occupation', 'relationship', 'sex']),
    ('standard', scaler, ['age', 'hours-per-week'])
], remainder='passthrough').set_output(transform='pandas')

second_opt = ct.fit_transform(df)

In [24]:
second_opt

Unnamed: 0,orde__marital-status,orde__race,o_enc__workclass_Federal-gov,o_enc__workclass_Local-gov,o_enc__workclass_Never-worked,o_enc__workclass_Private,o_enc__workclass_Self-emp-inc,o_enc__workclass_Self-emp-not-inc,o_enc__workclass_State-gov,o_enc__workclass_Without-pay,...,o_enc__relationship_Not-in-family,o_enc__relationship_Other-relative,o_enc__relationship_Own-child,o_enc__relationship_Unmarried,o_enc__relationship_Wife,o_enc__sex_Male,standard__age,standard__hours-per-week,remainder__education-num,remainder__income
0,0,4,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0.025724,-0.034304,13,0
1,4,4,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.828125,-2.213335,13,0
2,1,4,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,-0.047221,-0.034304,9,0
3,4,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1.046961,-0.034304,7,0
4,4,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,-0.776676,-0.034304,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,1,4,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0.025724,-0.357123,13,0
48838,3,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,1.849362,-0.034304,9,0
48839,4,4,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,-0.047221,0.772745,13,0
48840,1,2,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0.390452,-0.034304,13,0


## Modeling

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [113]:
X_1 = first_opt.drop(columns = ['remainder__income'])
y_1 = first_opt['remainder__income']

# Racist model
X_2 = second_opt.drop(columns = ['remainder__income'])
y_2 = second_opt['remainder__income']

In [114]:
# Splitting

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42)

In [28]:
# Models
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

g_nb = GaussianNB()
rf = RandomForestClassifier()
lr = LogisticRegression(max_iter=500)
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier()

In [29]:
# Option 1

dt.fit(X_train_1, y_train_1)
lr.fit(X_train_1, y_train_1)
knn.fit(X_train_1, y_train_1)
rf.fit(X_train_1, y_train_1)
g_nb.fit(X_train_1, y_train_1)

y_pred_1 = dt.predict(X_test_1)
y_pred_2 = lr.predict(X_test_1)
y_pred_3 = knn.predict(X_test_1)
y_pred_4 = rf.predict(X_test_1)
y_pred_5 = g_nb.predict(X_test_1)

In [30]:
# results for all models
print(f'Decision Tree: {accuracy_score(y_test_1, y_pred_1)}')
print(f'Logistic Regression: {accuracy_score(y_test_1, y_pred_2)}')
print(f'KNN: {accuracy_score(y_test_1, y_pred_3)}')
print(f'RF: {accuracy_score(y_test_1, y_pred_4)}')
print(f'G_NB: {accuracy_score(y_test_1, y_pred_5)}')

Decision Tree: 0.7869507323568575
Logistic Regression: 0.8347843900440438
KNN: 0.8190105500358497
RF: 0.8150158762675407
G_NB: 0.6591211717709721


In [31]:
print(classification_report(y_test_1, y_pred_2))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89      7375
           1       0.70      0.57      0.63      2388

    accuracy                           0.83      9763
   macro avg       0.78      0.74      0.76      9763
weighted avg       0.83      0.83      0.83      9763



In [32]:
# Option 2 (Racist)
dt.fit(X_train_2, y_train_2)
lr.fit(X_train_2, y_train_2)
knn.fit(X_train_2, y_train_2)

y_pred_1_2 = dt.predict(X_test_2)
y_pred_2_2 = lr.predict(X_test_2)
y_pred_3_2 = knn.predict(X_test_2)

In [33]:
# results for all models
print(f'Decision Tree: {accuracy_score(y_test_2, y_pred_1_2)}')
print(f'Logistic Regression: {accuracy_score(y_test_2, y_pred_2_2)}')
print(f'KNN: {accuracy_score(y_test_2, y_pred_3_2)}')

Decision Tree: 0.7868483048243368
Logistic Regression: 0.8350916726416061
KNN: 0.8169619993854348


# Ensembling

In [34]:
modellar=[('dt', dt),('lr', lr),('knn', knn), ('rf', rf), ('g_nb', g_nb)]

### Voting


In [35]:
# Hard Voting

hard_voting = VotingClassifier(estimators=modellar, voting='hard')
hard_voting.fit(X_train_1, y_train_1)

In [36]:
y_pred_meta = hard_voting.predict(X_test_1)

In [37]:
print(f'Hard: {accuracy_score(y_test_1, y_pred_meta)}')

Hard: 0.8258731947147393


In [38]:
# Soft Voting
soft_voting = VotingClassifier(estimators=modellar,
                               voting='soft',
                               weights=[0.2, 1.5, 1, 0.8, 0.01])
soft_voting.fit(X_train_1, y_train_1)

In [39]:
y_pred_meta = soft_voting.predict(X_test_1)

In [40]:
print(f'Soft: {accuracy_score(y_test_1, y_pred_meta)}')

Soft: 0.8294581583529653


In [41]:
print(classification_report(y_test_1, y_pred_meta))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      7375
           1       0.68      0.58      0.62      2388

    accuracy                           0.83      9763
   macro avg       0.77      0.75      0.76      9763
weighted avg       0.82      0.83      0.82      9763



### Staking

In [42]:
!pip install xgboost



In [43]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

In [44]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()

In [45]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier()


In [46]:
# Stacking

from sklearn.ensemble import StackingClassifier

stacking_cl = StackingClassifier(estimators=modellar, final_estimator=et, cv=3)

In [47]:
print(stacking_cl.named_estimators)

{'dt': DecisionTreeClassifier(), 'lr': LogisticRegression(max_iter=500), 'knn': KNeighborsClassifier(), 'rf': RandomForestClassifier(), 'g_nb': GaussianNB()}


In [48]:
stacking_cl.fit(X_train_1, y_train_1)

y_pred_meta_xgb = stacking_cl.predict(X_test_1)

In [49]:
print(f'Stacking: {accuracy_score(y_test_1, y_pred_meta_xgb)}')

Stacking: 0.8092799344463791


In [50]:
# Classification report

print(classification_report(y_test_1, y_pred_meta_xgb))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      7375
           1       0.63      0.54      0.58      2388

    accuracy                           0.81      9763
   macro avg       0.74      0.72      0.73      9763
weighted avg       0.80      0.81      0.80      9763



In [52]:
# # Level 2 stacking

# lv_2 = StackingClassifier(estimators=[('xgb', xgb),
#                                       ('mlp', mlp),
#                                       ('et', et)],
#                                       final_estimator=lr)

In [53]:
# lv_2.fit(X_train_1, y_train_1)

# y_pred_meta_lv2 = lv_2.predict(X_test_1)

In [54]:
# print(classification_report(y_test_1, y_pred_meta_lv2))

### Blending

In [81]:
# Splitting train set and validation set

X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_train_1, y_train_1, test_size=0.25, random_state=25)


In [56]:
models = [
    ('lr', LogisticRegression(max_iter=500, random_state=25)),
    ('dt', DecisionTreeClassifier(random_state=25)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]

In [82]:
base_predictions = {}
for name, model in models:
    model.fit(X_train_1, y_train_1)
    base_predictions[name] = model.predict_proba(X_val_1)[:, 1]

In [85]:
meta_features = pd.DataFrame(base_predictions)
print("Meta-features shape:", meta_features.shape)
print(meta_features.head())


Meta-features shape: (8298, 3)
         lr        dt  knn
0  0.002405  0.000000  0.0
1  0.282120  0.000000  0.4
2  0.142824  0.000000  0.0
3  0.201654  0.000000  0.2
4  0.168444  0.428571  0.4


In [91]:
from sklearn.svm import SVC

In [92]:
meta_model = SVC(random_state=25)
meta_model.fit(meta_features, y_val_1)

In [93]:
test_meta_features = {}
for name, model in models:
    test_meta_features[name] = model.predict_proba(X_test_1)[:, 1]

In [94]:
test_meta_df = pd.DataFrame(test_meta_features)

blend_preds = meta_model.predict(test_meta_df)

In [95]:
print(f'Blending: {accuracy_score(y_test_1, blend_preds):.4f}')
print("\nClassification Report for Blending:")
print(classification_report(y_test_1, blend_preds))

Blending: 0.8355

Classification Report for Blending:
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      7375
           1       0.71      0.55      0.62      2388

    accuracy                           0.84      9763
   macro avg       0.79      0.74      0.76      9763
weighted avg       0.83      0.84      0.83      9763



#### Stacking as Blending

In [96]:
stacking_for_blend = StackingClassifier(
    estimators=[m for m in models],
    final_estimator=LogisticRegression(max_iter=500, random_state=25),
    cv=None,
    passthrough=False
)


In [97]:
stacking_for_blend.fit(X_train_1, y_train_1)
y_pred_stack_blend = stacking_for_blend.predict(X_test_1)

In [98]:
print(f'StackingClassifier as Blending: {accuracy_score(y_test_1, y_pred_stack_blend):.4f}')
print("\nClassification Report for StackingClassifier as Blending:")
print(classification_report(y_test_1, y_pred_stack_blend))

StackingClassifier as Blending: 0.8357

Classification Report for StackingClassifier as Blending:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      7375
           1       0.71      0.56      0.63      2388

    accuracy                           0.84      9763
   macro avg       0.79      0.74      0.76      9763
weighted avg       0.83      0.84      0.83      9763



### Bagging

In [115]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(random_state=25)

In [119]:
from sklearn.ensemble import BaggingClassifier
bag_dt = BaggingClassifier(estimator=LogisticRegression(max_iter=500, random_state=25), bootstrap=True, random_state=25)

In [120]:
bag_dt.fit(X_train_1, y_train_1)
y_pred_bag_dt = bag_dt.predict(X_test_1)

In [121]:
print(f'Bagging with Decision Tree: {accuracy_score(y_test_1, y_pred_bag_dt):.4f}')
print("\nClassification Report for Bagging with Decision Tree:")
print(classification_report(y_test_1, y_pred_bag_dt))

Bagging with Decision Tree: 0.8354

Classification Report for Bagging with Decision Tree:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      7375
           1       0.70      0.57      0.63      2388

    accuracy                           0.84      9763
   macro avg       0.79      0.75      0.76      9763
weighted avg       0.83      0.84      0.83      9763



In [122]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=25)
rf.fit(X_train_1, y_train_1)
y_pred_rf = rf.predict(X_test_1)

In [None]:
# Parameter tuning for random forest
from sklearn.model_selection import GridSearchCV

rf_param = {'n_estimators': [100,150,200,500],
            'max_depth': [5, 10,15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]}

g_cv = GridSearchCV(estimator=rf, param_grid=rf_param, cv=4, verbose=2)
g_cv.fit(X_train_1, y_train_1)

Fitting 4 folds for each of 144 candidates, totalling 576 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time=   1.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time=   1.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time=   1.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time=   1.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.7s
[CV] END max_depth=5, min_samples_lea

In [None]:
print(grid.best_params_)
grid_predictions = grid.predict(X_test)

In [None]:
print("\nClassification Report for Gradient Search of Random Forest")
print(classification_report(y_test_1, grid_predictions))

In [123]:
print(f'Random Forest (Bagging of RF): {accuracy_score(y_test_1, y_pred_rf):.4f}')
print("\nClassification Report for Bagging as Random forest:")
print(classification_report(y_test_1, y_pred_rf))

Random Forest (Bagging of RF): 0.8157

Classification Report for Bagging as Random forest:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      7375
           1       0.64      0.56      0.60      2388

    accuracy                           0.82      9763
   macro avg       0.75      0.73      0.74      9763
weighted avg       0.81      0.82      0.81      9763



### Boosting
