# Table of Contents

>[1. Data Preparation](#scrollTo=UxfZrSmk_Jgq)

>[2. Voting Classifier](#scrollTo=bfO6AiFH1Pn9)

>[3. Bagging (Bootstrap Aggregating)](#scrollTo=yTeBlcv_1Pn-)

>[4. Random Forests](#scrollTo=50rM43e91Pn-)



In [2]:
import numpy as np
import pandas as pd

# 1. Data Preparation

In [3]:
# 1.1 Prepare Data
# ========================
import os

from sklearn.model_selection import train_test_split

# Location of the Iris CSV. The default is the original author's local path;
# set the IRIS_CSV_PATH environment variable to run this notebook on another
# machine without editing the cell.
IRIS_CSV_PATH = os.environ.get(
    "IRIS_CSV_PATH",
    "C:/Users/Vic/Desktop/Data Scienece/Datasets-20231016/Dataset_Iris.csv",
)

# read csv file
iris_df = pd.read_csv(IRIS_CSV_PATH)

# Features: every column except the target ("Species") and the "Id" column
X = iris_df.drop(columns=["Species", "Id"])
# Target: the species label
y = iris_df["Species"]

# Use stratified sampling to split up the dataset (stratify=y), with a fixed
# random_state so the split is reproducible.
# NOTE(review): test_size=0.8 holds out 80% of the rows for testing, so the
# models below are trained on only 20% of the data - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, stratify=y, random_state=1
)

<a class="anchor" id="chapter_1"></a>

# 2. Voting Classifier
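- N classifiers each make a prediction: with hard voting the majority class wins; with soft voting (used below) the class with the highest average predicted probability wins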
- Source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html


In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# 2.1 Initialize Models
# ========================
# Three base learners; random_state is pinned on lr and dt
lr = LogisticRegression(max_iter=500, random_state=42)
knn = KNeighborsClassifier(n_neighbors=27)
dt = DecisionTreeClassifier(random_state=42)

# (display name, estimator) pairs, iterated below
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

# 2.2 Train Models and make Predictions
# =====================================
# For every base learner: fit on the training split, predict the test split,
# and print the resulting accuracy to three decimals
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{clf_name:s} : {accuracy:.3f}')


Logistic Regression : 0.933
K Nearest Neighbours : 0.642
Classification Tree : 0.950


In [6]:
# 2.3 Make a Vote
# =====================================
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# Build a soft-voting ensemble from the (name, estimator) pairs in `classifiers`
vc = VotingClassifier(voting="soft", estimators=classifiers)

# Train the ensemble on the training split
vc.fit(X_train, y_train)

# Label the held-out rows
y_pred = vc.predict(X_test)

# Report overall accuracy, then the per-class precision/recall/f1 table
accuracy = accuracy_score(y_test, y_pred)
print(f'Voting Classifier: {accuracy:.3f}')

print(classification_report(y_test, y_pred))


Voting Classifier: 0.950
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        40
Iris-versicolor       0.97      0.88      0.92        40
 Iris-virginica       0.89      0.97      0.93        40

       accuracy                           0.95       120
      macro avg       0.95      0.95      0.95       120
   weighted avg       0.95      0.95      0.95       120



<a class="anchor" id="chapter_2"></a>

# 3. Bagging (Bootstrap Aggregating)

In [7]:
# Imports used by this cell
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier

# Base estimator: a single decision tree with a fixed seed
dt = DecisionTreeClassifier(random_state=42)

# Bagging ensemble built from 50 copies of the base estimator (n_estimators=50)
bc = BaggingClassifier(estimator=dt, n_estimators=50, random_state=42)

# Train on the training split only
bc.fit(X_train, y_train)

# Label the held-out rows
y_pred = bc.predict(X_test)

# Report accuracy to two decimals, a blank line, then the per-class report
acc_test = accuracy_score(y_test, y_pred)
print(f'Test set accuracy of bagging classifier: {acc_test:.2f}')
print()
print(classification_report(y_test, y_pred))

Test set accuracy of bagging classifier: 0.93

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        40
Iris-versicolor       0.92      0.88      0.90        40
 Iris-virginica       0.88      0.93      0.90        40

       accuracy                           0.93       120
      macro avg       0.93      0.93      0.93       120
   weighted avg       0.93      0.93      0.93       120



<a class="anchor" id="chapter_3"></a>

# 4. Random Forests

In [8]:
# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees with a fixed seed
rf = RandomForestClassifier(random_state=42, n_estimators=50)

# Train on the training split
rf.fit(X_train, y_train)

# Label the held-out rows
y_pred = rf.predict(X_test)

# Accuracy on the test split
acc_test = accuracy_score(y_test, y_pred)

# Print the accuracy to two decimals, a blank line, then the per-class report
print(f'Test set accuracy of rf: {acc_test:.2f}')
print()
print(classification_report(y_test, y_pred))

Test set accuracy of rf: 0.94

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        40
Iris-versicolor       0.95      0.88      0.91        40
 Iris-virginica       0.88      0.95      0.92        40

       accuracy                           0.94       120
      macro avg       0.94      0.94      0.94       120
   weighted avg       0.94      0.94      0.94       120

