<a href="https://colab.research.google.com/github/Siddha-Regilla/Data-Science-Classroom-notebooks/blob/main/17.%20Bagging%2C_Random_Forest%2C_Boosting_%26_Stacking_14_03_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
warnings.filterwarnings('ignore')
from pandas import read_csv
# We need to break our data set into sub-samples. To do that we use Kfolds.
# Kfold means how many folds you want to divide that dataset (k = no. of folds)
from sklearn.model_selection import KFold
# We need to find out the accuracy on each sub-sample. We use the cross-validation technique for that.
# The cross-validation score is used to estimate the accuracy of the model
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagged Decision Trees for Classification

In [3]:
# Bagged decision trees evaluated with 10-fold cross-validation.
# Upload the dataset to the working directory first, then read it.
filename = 'pima-indians-diabetes.data.csv'
# Column names are supplied explicitly when reading the file.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
# Work on the underlying array of values.
array = dataframe.values
# Split into independent variables (columns 0-7) and the target variable (column 8).
X = array[:,0:8]
Y = array[:,8]
# Number of folds for cross-validation: n_splits = 10.
# shuffle=True assigns the rows to folds at random; random_state=42 fixes that
# shuffle so every run uses the same train/test splits and reports the same accuracies.
kfold = KFold(n_splits=10, random_state=42, shuffle = True)
# Base learner: a plain (CART) decision tree.
cart = DecisionTreeClassifier()
# Number of trees in the bagged ensemble.
num_trees = 500
# Build the bagging ensemble on top of the decision tree.
# The base learner is passed positionally on purpose: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, but it is the first positional parameter in every version.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=42)
# Fit and score the ensemble on every fold.
results = cross_val_score(model, X, Y, cv=kfold)
# Print the mean accuracy across all folds.
print(results.mean())

0.7655502392344498


# Random Forest Classification

In [4]:
# Importing Libraries
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Random forest evaluated with 10-fold cross-validation.
# Reuses `array`, which must already be loaded by an earlier cell.
# Columns 0-7 are the independent variables, column 8 is the target.
X = array[:, :8]
Y = array[:, 8]
# 10 folds; shuffle=True assigns rows to folds at random and random_state=42
# fixes that shuffle so repeated runs give the same accuracies.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# Number of trees in the forest.
num_trees = 100
# Number of features considered when looking for the best split.
# Rule of thumb: √n, where n is the number of predictor columns.
# There are 8 predictors (9 columns minus the target), and √8 ≈ 2.83, so use 3.
max_features = 3
# Build the forest with the chosen tree count and feature limit.
model = RandomForestClassifier(
    random_state=42,
    max_features=max_features,
    n_estimators=num_trees,
)
# Fit and score the forest on every fold.
results = cross_val_score(model, X, Y, cv=kfold)
# Print the mean accuracy across all folds.
print(results.mean())

0.7694634313055365


# AdaBoost Classification

In [6]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [7]:
# AdaBoost evaluated with 10-fold cross-validation.
# Upload the dataset to the working directory first, then read it.
filename = 'pima-indians-diabetes.data.csv'
# Column names are supplied explicitly when reading the file.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# Columns 0-7 are the independent variables, column 8 is the target.
X = array[:, :8]
Y = array[:, 8]
# 10 folds; shuffle=True assigns rows to folds at random and random_state=42
# fixes that shuffle so repeated runs give the same accuracies.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# Number of boosting rounds (weak learners). Try other values to see how
# the accuracy is affected.
num_trees = 10
# Build the boosting ensemble. No base estimator is passed, so scikit-learn's
# default weak learner is used (a depth-1 decision tree according to its docs).
model = AdaBoostClassifier(
    random_state=42,
    n_estimators=num_trees,
)
# Fit and score the ensemble on every fold.
results = cross_val_score(model, X, Y, cv=kfold)
# Print the mean accuracy across all folds.
print(results.mean())


0.7577751196172249


# Stacking Ensemble for Classification

In [9]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC # for support vector machine model
from sklearn.ensemble import VotingClassifier # combines the sub-models by voting (not a trained meta model)

In [10]:
# Ensemble of three different classifiers evaluated with 10-fold cross-validation.
# Upload the dataset to the working directory first, then read it.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# Columns 0-7 are the independent variables, column 8 is the target.
X = array[:, :8]
Y = array[:, 8]
# 10 folds, shuffled with a fixed seed so the splits are reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Create the three sub-models.
# Model 1 - logistic regression
model1 = LogisticRegression(max_iter=500, random_state=42)
# Model 2 - decision tree (CART)
model2 = DecisionTreeClassifier(random_state=42)
# Model 3 - support vector machine
model3 = SVC(random_state=42)
# Collect them as (name, model) pairs, in the order they are combined.
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# Create the ensemble model.
# NOTE: VotingClassifier combines the sub-models' predictions by voting
# (majority vote with its default settings); it does not train a meta-model
# on their outputs. scikit-learn's StackingClassifier is the class that
# implements stacking in the strict sense.
ensemble = VotingClassifier(estimators)
# Fit and score the ensemble on every fold.
results = cross_val_score(ensemble, X, Y, cv=kfold)
# Print the mean accuracy across all folds.
print(results.mean())

0.7708133971291866


In [11]:
# Hyperparameter tuning - trying out (tuning) different numbers of trees, values of max_features, or numbers of folds to improve the accuracy

In [12]:
# Most of the time we need to apply these models to our datasets and tune the hyperparameters.
# Check which model works best.
# In most cases the random forest model is highly effective. If you tune hyperparameters such as max_features and the no. of trees, it will give us high accuracy.
# The stacking model is also effective. It also gives us high levels of accuracy.
# Therefore, we can apply these models in our project and select the best one among them.

In [13]:
# So far we have seen the following 4 types of ensemble techniques -
# 1. Bagging
# 2. Random forests - a type of bagging
# 3. Boosting - the AdaBoost technique
# 4. Stacking

In [None]:
# Further, we'll be seeing 2 types of advanced boosting techniques -
# 1. XGBoost - eXtreme Gradient Boosting
# 2. LGBM (LightGBM) - Light Gradient Boosting Machine