### BaggingClassifier

1) A Bagging classifier is an ensemble meta-estimator that fits base classifiers, each on a random subset of the original dataset, and then aggregates their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.

2) Each base classifier can be trained in parallel on its own training set, which is generated by randomly drawing, with replacement, N examples from the original training dataset, where N is the size of the original training set. The training sets of the base classifiers are drawn independently of each other. Many of the original examples may be repeated in a resulting training set, while others may be left out.

3) Bagging reduces overfitting (variance) by averaging or voting. This can lead to a small increase in bias, but that increase is usually outweighed by the reduction in variance.

In [None]:
import pandas as pd
# pandas is aliased as pd
import numpy as np
# numpy is aliased as np
import matplotlib.pyplot as plt
# pyplot is aliased as plt

In [None]:
# Read the training CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('customer_statisfaction_train.csv')
print(type(df))  # sanity check: read_csv returned a pandas DataFrame
df.head()  # top 5 rows; as the last expression it is rendered as the cell output

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [None]:
# (n_rows, n_columns) of the training frame.
df.shape
# recorded run: rows = 76020, columns = 371 (the TARGET label is one of the columns)

(76020, 371)

#### Handling the Null Values

In [None]:
# nv = null value counts per column, restricted to the columns that
# actually contain at least one null (an empty Series means no nulls at all)
nv = df.isna().sum().loc[lambda null_counts: null_counts > 0]
nv

Series([], dtype: int64)

#### Checking the data types

In [None]:
# Tally how many columns of the training frame have each dtype.
df.dtypes.value_counts()

int64      260
float64    111
dtype: int64

#### Extracting categorical columns

In [None]:
# Columns whose dtype is 'object' are the ones this notebook treats as
# categorical; the result is a Series of dtypes indexed by column name.
cat_cols = df.dtypes.loc[lambda col_dtype: col_dtype == 'object']
print(cat_cols)

Series([], dtype: object)


In [None]:
# Column labels of the training frame (pandas abbreviates the middle of long indexes).
df.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

#### Checking the target variable

In [None]:
# Class distribution of the label column.
# In the recorded run class 0 has 73012 rows and class 1 has 3008, i.e. the
# classes are heavily imbalanced.
df['TARGET'].value_counts()

0    73012
1     3008
Name: TARGET, dtype: int64

#### Selecting the independent and the dependent features

In [None]:
# x: independent features = every column except the label.
# y: dependent feature   = the TARGET column.
# NOTE(review): 'ID' remains inside x; it looks like a row identifier rather
# than a predictive feature — confirm whether it should be excluded.
x = df.drop(columns='TARGET')
y = df['TARGET']
print(type(x), type(y), sep='\n')    # DataFrame, then Series
print(x.shape, y.shape, sep='\n')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(76020, 370)
(76020,)


#### Creating the functions to compute Confusion_Matrix, Classification Report and to generate Model Score

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
def eval_model(ytest, ypred):
    """Print a standard classification evaluation for one set of predictions.

    Prints, in this order: the confusion matrix, the accuracy score
    (prefixed with ``Accuracy Score``) and the classification report.

    Parameters
    ----------
    ytest : array-like
        Ground-truth labels.
    ypred : array-like
        Predicted labels, aligned element-wise with ``ytest``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(confusion_matrix(ytest, ypred))
    accuracy = accuracy_score(ytest, ypred)
    print('Accuracy Score', accuracy)
    report = classification_report(ytest, ypred)
    print(report)
    
    
def mscore(model, x_train, y_train):
    """Print the score ``model`` achieves on the given data.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``score(X, y)``.
    x_train : array-like
        Feature matrix passed to ``model.score``.
    y_train : array-like
        Labels passed to ``model.score``.

    Returns
    -------
    None
        The value is printed, prefixed with ``Training Score``.
    """
    training_score = model.score(x_train, y_train)
    print('Training Score', training_score)

#### Importing the libraries for Model Building

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
# Base learner for the ensemble: a Gini decision tree limited to depth 10
# that only splits nodes holding at least 20 samples.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=20,
)

#### Building the Model of BaggingClassifier

In [None]:
# Re-check the feature-matrix shape; both dimensions are used below as
# max_samples / max_features when the BaggingClassifier is built.
print(x.shape)

(76020, 370)


#### Hyperparameters of BaggingClassifier
1) base_estimator - the model that BaggingClassifier fits repeatedly on the resampled data (this parameter is named `estimator` from scikit-learn 1.2 onwards).<br>
2) n_estimators - number of base estimators (here, decision trees) in the ensemble<br>
3) max_samples - number (int) or fraction (float) of rows drawn to train each base estimator<br>
4) max_features - number (int) or fraction (float) of columns drawn to train each base estimator<br>
5) random_state - seed used to initialize the randomization, so that results are reproducible

In [None]:
# Bag 15 copies of the decision tree `dt`.
# The base estimator is passed positionally on purpose: its keyword was
# `base_estimator` in older scikit-learn releases and is `estimator` in newer
# ones (the old keyword was later removed), while it is the first positional
# parameter in both, so this call works across versions.
# max_samples / max_features are absolute counts equal to the full number of
# rows / columns of x: each tree is trained on len(x) drawn rows (drawn with
# replacement under scikit-learn's default bootstrap=True) and all features.
bc_dt = BaggingClassifier(dt, n_estimators=15,
                          max_samples=x.shape[0], max_features=x.shape[1],
                          random_state=2022)
bc_dt.fit(x, y)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=10,
                                                        min_samples_split=20),
                  max_features=370, max_samples=76020, n_estimators=15,
                  random_state=2022)

In [None]:
# Score on the very rows the ensemble was fitted on, so this is a training
# score and not an estimate of performance on unseen data.
mscore(bc_dt,x,y)

Training Score 0.9612601946856091


#### Reading the test dataset

In [None]:
# Read the test CSV (path is relative to the working directory); these rows
# are scored with the fitted model further down.
test_df = pd.read_csv('customer_statisfaction_test.csv')
test_df.head()  # preview the first 5 rows

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,5,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,6,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,7,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,9,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


In [None]:
# (n_rows, n_columns) of the test frame.
test_df.shape
# recorded run: rows = 75818, columns = 370 (no TARGET column in the test file)

(75818, 370)

#### Handling null values in the test dataset

In [None]:
# Per-column null counts of the test frame, keeping only the columns that
# contain at least one null (an empty Series means nothing to impute).
nv_test = test_df.isna().sum().loc[lambda null_counts: null_counts > 0]
nv_test

Series([], dtype: int64)

#### Checking data types of the columns in the test dataset

In [None]:
# Tally how many columns of the test frame have each dtype.
test_df.dtypes.value_counts()

int64      260
float64    110
dtype: int64

#### Generate the Predictions for the test dataset

In [None]:
# Select the test columns by the training feature names, in training order,
# so prediction never depends on the two CSVs sharing the same column order
# (a missing training column fails loudly with a KeyError here).
y_pred = bc_dt.predict(test_df[x.columns])
print(y_pred.shape)
print(type(y_pred))  # numpy array

(75818,)
<class 'numpy.ndarray'>


#### Creating a Resultant dataframe for the prediction results

In [None]:
# Column labels of the test frame; 'ID' is the one reused in the result table below.
test_df.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var29_ult3', 'saldo_medio_var33_hace2',
       'saldo_medio_var33_hace3', 'saldo_medio_var33_ult1',
       'saldo_medio_var33_ult3', 'saldo_medio_var44_hace2',
       'saldo_medio_var44_hace3', 'saldo_medio_var44_ult1',
       'saldo_medio_var44_ult3', 'var38'],
      dtype='object', length=370)

In [None]:
# Result table: the test-set IDs (keeping test_df's index) next to the
# predicted label for the same row.
res = test_df[['ID']].assign(TARGET=y_pred)

In [None]:
# Preview the first 10 (ID, predicted TARGET) rows.
res.head(10)

Unnamed: 0,ID,TARGET
0,2,0
1,5,0
2,6,0
3,7,0
4,9,0
5,11,0
6,12,0
7,15,0
8,16,0
9,17,0
