In [2]:
#To plot all the diagrams within the notebook
%matplotlib inline

In [3]:
import numpy as np


In [4]:
import pandas as pd

#calculate accuracy measures and confusion matrix
from sklearn import metrics

In [5]:
from sklearn.tree import DecisionTreeClassifier

In [6]:
credit_df=pd.read_csv("D:\Machine Learning\Machine Learning Projects\Ensemble-Bagging(Decision Tree)\credit.csv")

In [7]:
credit_df.info()#info-it gives the structure of the data frame

#many columns are of the type object,i.e. strings. These need to be converted to a numerics

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

In [8]:
# Converting the "object" or "string" datatypes to a numeric or categorical datatypes

# Decision tree in python can take only nemerical/categorical columns .It cannot take string/object
# The following code loops through each column and checks if the column type is object, then converts those...
#... into categorical with each distinct value becoming a category or code.

for feature in credit_df.columns: # loop through all columns in the data frame
    if credit_df[feature].dtype=='object': # only apply for columns with categorical strings
        credit_df[feature]= pd.Categorical(credit_df[feature]).codes # Replace strings with an integers

In [9]:
#again checking the structure of the data frame
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   checking_balance      1000 non-null   int8 
 1   months_loan_duration  1000 non-null   int64
 2   credit_history        1000 non-null   int8 
 3   purpose               1000 non-null   int8 
 4   amount                1000 non-null   int64
 5   savings_balance       1000 non-null   int8 
 6   employment_duration   1000 non-null   int8 
 7   percent_of_income     1000 non-null   int64
 8   years_at_residence    1000 non-null   int64
 9   age                   1000 non-null   int64
 10  other_credit          1000 non-null   int8 
 11  housing               1000 non-null   int8 
 12  existing_loans_count  1000 non-null   int64
 13  job                   1000 non-null   int8 
 14  dependents            1000 non-null   int64
 15  phone                 1000 non-null   int8 
 16  default

In [10]:
#Comparing the ensemble score or performance with a simple one instance algorithm

#splitting data into training and test set for independent attributes

#**always use random function to split the data into training and test set**

train_set=credit_df.head(700) # upto the last initial training set row
test_set=credit_df.tail(300) # past the last initial training set row

#capture the target column ("default") into separate vectors for training set and test set

train_labels=train_set.pop("default")
test_labels=test_set.pop("default")



In [11]:
#invoking the decision tree classifier function.Using 'entropy' method of finding the split columns
#can use gini index also. 
#dt_model =DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=100)

dt_model=DecisionTreeClassifier(criterion='entropy') #since we didn't use any regularization parameters like max_depth here,this decision tree will going to be 'overfit'.


In [12]:
#fit function
dt_model.fit(train_set,train_labels)

DecisionTreeClassifier(criterion='entropy')

In [13]:
dt_model.score(test_set, test_labels)

0.67

In [14]:
dt_model.score(train_set,train_labels) # overfit due to large complexe tree

1.0

In [15]:
# Lets use ENSEMBLE TECHNIQUE- BAGGING -To improve the model

credit_labels=credit_df.pop("default") # for ensemble you dont need training & test set
                                       # Bagging can use out of bag records for testing
    
    

In [16]:
# In the following lines,we call the bagging classifier with oob_score(out of bag_score),set to True ,which was False by default
# This makes the baggingclassifier use the 37% unused data for testing
#compare the performance of the Bagging classifier with the regularized decision tree above.
#Though not required, you can keep separate test data (outside the bootstrap sampling)on which we test the Bagging Classifier. 


from sklearn.ensemble import BaggingClassifier
bgcl=BaggingClassifier(n_estimators=50 ,max_samples=.8,oob_score=True) # n_estimators=50(no reason)
# max_samples=0.8--> of the 100% records in the original data frame,use only 80% to create your data sets for the ensemble
# oob_score = True--> for ruuning the ensemble against the out of bag data set and see the performance 

bgcl=bgcl.fit(credit_df, credit_labels)
print("Ensemble_score_for_OutOfBag_datasets =" ,bgcl.oob_score_) #.oob_score_--> function which returns the result of testing your ensemble against the out of bag data sets
                                                                       


Ensemble_score_for_OutOfBag_datasets = 0.745


In [17]:
#comparing the score of the ensemble for the collection of 50 instances & the score of one instance ,i.e (0.74 & 0.693)
#(0.74 > 0.693)-so ,there is a significant improvement in the ensemble compared to the one instance of a very large Deci tre.

In [18]:
# Lets REGULARAIZE the decision tree and check the performance

dt_regularaized=DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5)
#min_samples_leaf=5--> at leaf level node ,there should be 5 records or more(not less)

dt_regularaized.fit(train_set,train_labels)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5)

In [19]:
print("training_score = ", dt_regularaized.score(train_set,train_labels))
print("test_score = ", dt_regularaized.score(test_set,test_labels)) #Relatively less overfit as training and test error are similar.

training_score =  0.7828571428571428
test_score =  0.7233333333333334


In [20]:
# here the score for test data (0.723) is lesser than the score for ensemble out of bag data sets(0.74)
# so ensemble gave better performance

In [21]:
#ENSEMBLE-BOOSTING

In [22]:
#ENSEMBLE LEARNING --ADA BOOSTING

In [23]:
from sklearn.ensemble import AdaBoostClassifier
abcl= AdaBoostClassifier(base_estimator = dt_model , n_estimators = 50)#base_estimator = any classification algorithms
                                                                                         #default= decision tree
#abcl= AdaBoostClassifier(n_estimators=50)
abcl.fit(train_set,train_labels)


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'))

In [24]:
test_pred = abcl.predict(test_set) # ***Every records in the test set will go through all the 50 trees and predict its own....
                                   #...classification and we will take its majority vote***

abcl.score(test_set , test_labels)


0.66

In [25]:
#ENSEMBLE LEARNING --GRADIENT BOOSTING

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

#gbcl=GradientBoostingClassifier(n_estimators = 50 ,learning_rate = 0.09 , max_depth = 5)
gbcl=GradientBoostingClassifier(n_estimators = 50)

gbcl.fit(train_set,train_labels)

GradientBoostingClassifier(n_estimators=50)

In [27]:
test_pred = gbcl.predict(test_set)
gbcl.score(test_set,test_labels)

0.7566666666666667

In [28]:
#ENSEMBLE-RANDOMFOREST CLASSIFIER


In [35]:
from sklearn.ensemble import RandomForestClassifier
rfcl=RandomForestClassifier(n_estimators=6) #n_estimators(no.of tree)=6(different trees) --all this 6 trees will do different kind of errors 
rfcl=rfcl.fit(train_set,train_labels)

#shift but.+ 2*tab but.--> to see the various parameters that are used

In [36]:
test_pred=rfcl.predict(test_set)
rfcl.score(test_set,test_labels)

0.7133333333333334

In [None]:
#*** Ensemble models is a black box technique because we dont know internally how to interpret the model***.
#*** Simple models is a white box technique***.