# Answer to graded assignment 2 in DTE-2501 (AI Methods and Applications) about ensemble methods by Abdullah Karagøz

## 1. Bootstrapping

In [172]:
import pandas as pd
import numpy as np
from math import sqrt
from math import pi
from math import exp
from math import log
from math import fsum
import platform
platform.architecture()
import random

# Seed shared by the random operations in this notebook (it is also passed as
# `random_state` to the pandas sampling calls below). None means "not fixed".
seed = None

# Seed Python's built-in `random` module, which the ensemble uses to break
# ties between equally frequent votes.
random.seed(seed)

In [173]:

# Load the data from the local file 'iris.data'. The file is read without a
# header row, so the column names are supplied here; the last column ('class')
# holds the species name of each sample.
dataset = pd.read_csv('iris.data', header=None, names=['sepal length', 'sepal width',
                                                     'petal length', 'petal width',
                                                     'class'])

In [174]:
# Replace each species name in the 'class' column by an integer label:
# Iris-setosa -> 0, Iris-versicolor -> 1, Iris-virginica -> 2.
class_column = dataset.columns == 'class'
for label, species in enumerate(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')):
    dataset.loc[dataset['class'] == species, class_column] = label

In [176]:
# Scratch inspection: values of the first column for the rows labelled class 0,
# followed by a display of the whole training frame.
# NOTE(review): in this file `train_set` is first assigned in the
# "Testing and validation" cell further down, so this cell only works after
# that cell has been executed.
train_set[train_set['class']==0].iloc[:,0].values.tolist()

train_set

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.7,2.8,4.5,1.3,1
1,6.4,2.8,5.6,2.1,2
2,5.7,3.0,4.2,1.2,1
3,6.7,3.1,4.7,1.5,1
4,5.0,2.3,3.3,1.0,1
...,...,...,...,...,...
115,4.9,3.1,1.5,0.1,0
116,6.4,3.2,5.3,2.3,2
117,4.4,3.2,1.3,0.2,0
118,5.4,3.9,1.7,0.4,0


### Naive Bayes Classifier

In [177]:
# Naive Bayes Classifier
class NaiveBayesClassifier():
    """Gaussian naive Bayes classifier for integer class labels 0..k-1.

    Data is passed as a pandas DataFrame whose last column is named 'class'
    and holds the integer label; all other columns are numeric attributes.
    For every class the classifier stores the prior probability and, per
    attribute, the sample mean and sample standard deviation.
    """

    # Lower bound for standard deviations, so that a constant attribute (or a
    # class with a single sample) cannot cause a division by zero.
    MIN_STD = 1e-9
    # Lower bound for probabilities before taking their logarithm in the
    # cross-entropy loss, so that a probability of exactly 0 cannot crash it.
    MIN_PROB = 1e-15

    def __init__(self):
        self.dataset = 0
        self.mean_values = 0
        self.std_values = 0
        self.prior_class_probs = 0
        # Older attribute name, kept as an alias of `prior_class_probs`.
        self.prior_class_probabilities = 0
        self.nr_of_classes = 0
        self.nr_of_attributes = 0

    def fit(self, train_set, in_ensemble=False, nr_of_classes = 0,
            nr_of_attributes=0):
        """Estimate priors, means and standard deviations from `train_set`.

        Parameters:
            train_set: DataFrame with attribute columns followed by 'class'.
            in_ensemble: when True, `nr_of_classes` and `nr_of_attributes`
                are taken from the arguments instead of from `train_set`
                (a bootstrap sample may not contain every class).
            nr_of_classes: number of classes, used only when `in_ensemble`.
            nr_of_attributes: number of attributes, used only when `in_ensemble`.
        """
        self.dataset = train_set

        # if not in ensemble, we derive these values from the data itself
        if not in_ensemble:
            self.nr_of_classes = train_set.groupby('class').ngroups
            self.nr_of_attributes = len(train_set.iloc[0, :-1])
        else:
            self.nr_of_classes = nr_of_classes
            self.nr_of_attributes = nr_of_attributes

        # Here we keep mean, std and prior class probability values
        self.mean_values = list()
        self.std_values = list()
        self.prior_class_probs = list()
        self.prior_class_probabilities = self.prior_class_probs

        for i in range(self.nr_of_classes):
            class_values = train_set[train_set['class'] == i]
            self.prior_class_probs.append(len(class_values) / len(train_set))
            mean_values = list()
            std_values = list()
            # A class without samples gets prior 0 and no statistics;
            # pred_row never evaluates the likelihood of such a class.
            if len(class_values) > 0:
                for j in range(self.nr_of_attributes):
                    values = class_values.iloc[:, j].values.tolist()
                    std, mean = self.std_and_mean(values)
                    mean_values.append(mean)
                    std_values.append(std)
            self.mean_values.append(mean_values)
            self.std_values.append(std_values)

    def mean(self, val_list):
        """Return the arithmetic mean of a non-empty list of numbers."""
        return sum(val_list) / len(val_list)

    def std_and_mean(self, val_list):
        """Return (sample standard deviation, mean) of `val_list`.

        The standard deviation uses the n-1 denominator; for a single value
        it is reported as 0.0 instead of dividing by zero.
        """
        mu = self.mean(val_list)
        if len(val_list) < 2:
            return 0.0, mu
        std = sqrt(fsum((x - mu)**2 for x in val_list) / (len(val_list) - 1))
        return std, mu

    def gaussian_pdf(self, x, mu, sd, log_prob):
        """Return the normal density of `x` (its logarithm when `log_prob`).

        `sd` is clamped to at least MIN_STD to avoid a division by zero.
        """
        sd = max(sd, self.MIN_STD)
        z = (x - mu) / sd
        norm = sd * sqrt(2 * pi)
        if log_prob:
            return (-0.5 * z**2) - log(norm)
        return (1 / norm) * exp(-0.5 * z**2)

    def pred_row(self, row, log_prob = False):
        """Classify one sample.

        Parameters:
            row: list of attribute values, in training column order.
            log_prob: accumulate log-probabilities instead of multiplying
                probabilities; the result is the same distribution but it
                does not underflow for very unlikely samples.

        Returns:
            (probs, prediction): normalised class probabilities and the
            index of the most probable class.

        With `log_prob=False` a ZeroDivisionError is raised when the
        likelihood of every class underflows to 0.
        """
        if log_prob:
            scores = []
            for i in range(self.nr_of_classes):
                prior = self.prior_class_probs[i]
                if prior == 0:
                    # log(0): this class can never be chosen
                    scores.append(float('-inf'))
                    continue
                score = log(prior)
                for j, x in enumerate(row):
                    score += self.gaussian_pdf(x, self.mean_values[i][j],
                                               self.std_values[i][j], log_prob)
                scores.append(score)
            # Shift by the largest score before exponentiating so the best
            # class maps to exp(0) = 1 and nothing underflows to all zeros.
            top = max(scores)
            probs = [exp(s - top) for s in scores]
        else:
            probs = []
            for i in range(self.nr_of_classes):
                prob = self.prior_class_probs[i]
                if prob != 0:
                    for j, x in enumerate(row):
                        prob *= self.gaussian_pdf(x, self.mean_values[i][j],
                                                  self.std_values[i][j], log_prob)
                probs.append(prob)

        total = fsum(probs)
        probs = [p / total for p in probs]
        prediction = probs.index(max(probs))
        return probs, prediction

    def predict(self, test_set, log_prob = False):
        """Classify every row of `test_set` and score the result.

        Parameters:
            test_set: DataFrame with attribute columns followed by the label.
            log_prob: forwarded to `pred_row`.

        Returns:
            (probabilities, accuracy, predictions, cel) where `cel` is the
            cross-entropy loss summed over all rows; probabilities are
            clamped to MIN_PROB before the logarithm is taken.
        """
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()
        probabilities = list()
        predictions = list()
        corrects = 0
        cel = 0 # Cross entropy loss
        for x, y in zip(X_test, Y_test):
            prob, pred = self.pred_row(x, log_prob)
            # int(): the label is used as a list index
            cel += -log(max(prob[int(y)], self.MIN_PROB))
            probabilities.append(prob)
            predictions.append(pred)
            if pred == y:
                corrects += 1
        accuracy = corrects / len(Y_test)
        return probabilities, accuracy, predictions, cel

### Ensemble Classifier

In [178]:
class EnsembledNBClassifier():
    """Bagging ensemble of NaiveBayesClassifier models.

    Every base classifier is trained on its own bootstrap sample of the
    training set; predictions are combined either by averaging the class
    probabilities or by majority vote.
    """

    # Lower bound for probabilities before taking their logarithm in the
    # cross-entropy loss, so that a probability of exactly 0 cannot crash it.
    MIN_PROB = 1e-15

    def __init__(self, nr_of_classes):
        """Create the base classifiers.

        NOTE: despite its name, `nr_of_classes` is the number of base
        classifiers in the ensemble. The name is kept so existing callers
        keep working.
        """
        self.classifiers = [NaiveBayesClassifier() for _ in range(nr_of_classes)]

    def fit(self, train_set, seed=None):
        """Train every base classifier on a bootstrap sample of `train_set`.

        Parameters:
            train_set: DataFrame with attribute columns followed by 'class'.
            seed: seed for the bootstrap sampling; None for a random run.
        """
        self.nr_of_classes = train_set.groupby('class').ngroups
        self.nr_of_attributes = len(train_set.iloc[0, :-1])
        # A single generator is shared by all draws. Passing the integer seed
        # to every `sample` call would hand the identical bag to every
        # classifier whenever a seed is set, which defeats bagging.
        rng = np.random.RandomState(seed)
        for cl in self.classifiers:
            bag = train_set.sample(frac=1, replace=True, random_state=rng).reset_index(drop=True)
            cl.fit(bag, True, self.nr_of_classes, self.nr_of_attributes)

    def predict(self, test_set, log_prob = False, majority_vote = False):
        """Classify every row of `test_set` and score the result.

        Parameters:
            test_set: DataFrame with attribute columns followed by the label.
            log_prob: forwarded to each base classifier's `pred_row`.
            majority_vote: combine the base predictions by vote (ties broken
                at random) instead of averaging the class probabilities.

        Returns:
            (accuracy, cel). `cel` is the summed cross-entropy loss of the
            averaged probabilities; it is only accumulated when
            `majority_vote` is False and is 0 otherwise.
        """
        # splitting the data
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()

        corrects = 0 # nr of correct predictions
        cel = 0 # Cross entropy loss
        for x, y in zip(X_test, Y_test):
            preds = list()
            probs = list()

            for cl in self.classifiers:
                prob, pred = cl.pred_row(x, log_prob) # predict each row
                preds.append(pred)
                probs.append(prob)

            if majority_vote:
                # Vote counts are integers, so adding a random value below
                # 0.1 only decides between classes with equal counts.
                prediction = max(set(preds),
                                 key=lambda c: preds.count(c) + 0.1*random.random())
            else:
                # arithmetic mean of the class probabilities over all classifiers
                agg_probs = [fsum(col) / len(probs) for col in zip(*probs)]
                prediction = agg_probs.index(max(agg_probs))
                # int(): the label is used as a list index
                cel += -log(max(agg_probs[int(y)], self.MIN_PROB))

            if prediction == y:
                corrects += 1

        # count accuracy
        accuracy = corrects / len(Y_test)
        return accuracy, cel
        
        

## Testing and validation

In [183]:
# Fraction of the data set used for training; the rest is the test set.
train_test_split = 0.8

# Shuffle and split data into training 80% and testing 20%
train_set = dataset.sample(frac=train_test_split, random_state=seed)
test_set = dataset.drop(train_set.index)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

# Single hand-written naive Bayes classifier, evaluated with and without
# log-probabilities.
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

probabilities, accuracy, preds, cel = nbc.predict(test_set, log_prob=True)
preds = np.array(preds)

probabilities2, accuracy2, preds2, cel2 = nbc.predict(test_set, log_prob=False)
preds2 = np.array(preds2)

from sklearn.naive_bayes import GaussianNB

# The same split as numpy arrays for scikit-learn: attributes in X, integer
# class labels in Y.
X_train = train_set.iloc[:,:-1].to_numpy()
Y_train = train_set.iloc[:,-1].to_numpy().astype('int')
X_test = test_set.iloc[:,:-1].to_numpy()
Y_test = test_set.iloc[:,-1].to_numpy().astype('int')

# Reference result: scikit-learn's Gaussian naive Bayes on the same split.
gnb = GaussianNB()
gnb.fit(X_train, Y_train)

Y_pred = gnb.predict(X_test)

print("Accuracy with sklearn classifier", (Y_test == Y_pred).sum() / len(Y_test))
print("Accuracy with my classifier with log prob", accuracy, "Cross entropy loss", cel)
print("Accuracy with my classifier without log prob", accuracy2, "Cross entropy loss", cel2)
print("Predictions with sklearn classifier", Y_pred)
print("Predictions with log prob", preds)
print("Predictions without log prob", preds2)
print("Values from test set", Y_test)


# Reference result for the ensemble: scikit-learn bagging of 100 GaussianNB models.
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(GaussianNB(), n_estimators=100)

bagging.fit(X_train, Y_train)

Y_pred = bagging.predict(X_test)

print("Accuracy with sklearn ensembled classifier", (Y_test == Y_pred).sum() / len(Y_test))

# Hand-written ensemble of 100 naive Bayes classifiers, evaluated with the four
# combinations of probability averaging / majority vote and log_prob on / off.
ens_nbc = EnsembledNBClassifier(100)
ens_nbc.fit(train_set, seed)

accuracy, cel = ens_nbc.predict(test_set)
print("Ensemble accuracy with aggregation", accuracy, "Cross entropy loss", cel)

accuracy, cel = ens_nbc.predict(test_set, log_prob=True)
print("Ensemble accuracy with log and aggregation", accuracy, "Cross entropy loss", cel)

# In majority-vote mode the returned loss is not accumulated, so only the
# accuracy is printed.
accuracy, cel = ens_nbc.predict(test_set, majority_vote=True)
print("Accuracy with majority vote", accuracy)

accuracy, cel = ens_nbc.predict(test_set, log_prob=True, majority_vote=True)
print("Accuracy with log and majority vote", accuracy)

Accuracy with sklearn classifier 0.9333333333333333
Accuracy with my classifier with log prob 0.9333333333333333 Cross entropy loss 7.56394488432537
Accuracy with my classifier without log prob 0.9333333333333333 Cross entropy loss 7.56394488432537
Predictions with sklearn classifier [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2]
Predictions with log prob [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2]
Predictions without log prob [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2]
Values from test set [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2]
Accuracy with sklearn ensembled classifier 0.9333333333333333
Ensemble accuracy with aggregation 0.9333333333333333 Cross entropy loss 6.843444364366109
Ensemble accuracy with log and aggregation 0.9333333333333333 Cross entropy loss 6.843444364366109
Accuracy with majority vote 0.9333333333333333
Accuracy with log and majority vote 0.9333333333333333


In [184]:
# Repeat the scikit-learn experiment n times on fresh train/test splits and
# report the average accuracy of a single GaussianNB versus a bagged ensemble.

n = 1000
single_accuracies = list()
ensemble_accuracies = list()

nr_of_classifiers = 1000

# The split fraction is the same for every repetition.
train_test_split = 0.8

# One generator for all repetitions: a fixed `seed` makes the whole experiment
# reproducible while every repetition still draws a different split. Passing
# `seed` itself to each `sample` call would repeat the same split n times
# whenever a seed is set.
split_rng = np.random.RandomState(seed)

for i in range(n):
    # Shuffle and split data into training 80% and testing 20%
    train_set = dataset.sample(frac=train_test_split, random_state=split_rng)
    test_set = dataset.drop(train_set.index)
    train_set.reset_index(drop=True, inplace=True)
    test_set.reset_index(drop=True, inplace=True)

    X_train = train_set.iloc[:,:-1].to_numpy()
    Y_train = train_set.iloc[:,-1].to_numpy().astype('int')
    X_test = test_set.iloc[:,:-1].to_numpy()
    Y_test = test_set.iloc[:,-1].to_numpy().astype('int')

    # single classifier
    gnb = GaussianNB()
    gnb.fit(X_train, Y_train)

    Y_pred = gnb.predict(X_test)

    single_accuracies.append((Y_test == Y_pred).sum() / len(Y_test))

    # bagged ensemble
    bagging = BaggingClassifier(GaussianNB(), n_estimators=nr_of_classifiers)

    bagging.fit(X_train, Y_train)

    Y_pred = bagging.predict(X_test)

    ensemble_accuracies.append((Y_test == Y_pred).sum() / len(Y_test))


mean_single_accuracy = fsum(single_accuracies) / n

mean_ensemble_accuracy = fsum(ensemble_accuracies) / n

print("Single accuracy", mean_single_accuracy)
print("Ensemble accuracy", mean_ensemble_accuracy)


Single accuracy 0.9525333333333333
Ensemble accuracy 0.9522333333333334


In [185]:
# Repeat the full comparison n times on fresh train/test splits and report the
# average accuracy and cross-entropy loss (cel) of every classifier variant.

n = 1000

nr_of_classifiers = 1000

# Suffixes: _skl = scikit-learn reference; for the single classifier
# _1 = with log_prob, _2 = without; for the ensemble _1 = averaging,
# _2 = averaging with log_prob, _3 = majority vote, _4 = majority vote with log_prob.
single_accuracies_skl = list()
ensemble_accuracies_skl = list()
single_accuracies_1 = list()
single_accuracies_2 = list()
ensemble_accuracies_1 = list()
ensemble_accuracies_2 = list()
ensemble_accuracies_3 = list()
ensemble_accuracies_4 = list()
single_cels_1 = list()
single_cels_2 = list()
ensemble_cels_1 = list()
ensemble_cels_2 = list()

# The split fraction is the same for every repetition.
train_test_split = 0.8

# One generator for all repetitions: a fixed `seed` makes the whole experiment
# reproducible while every repetition still draws a different split. Passing
# `seed` itself to each `sample` call would repeat the same split n times
# whenever a seed is set.
split_rng = np.random.RandomState(seed)

for i in range(n):
    # Shuffle and split data into training 80% and testing 20%
    train_set = dataset.sample(frac=train_test_split, random_state=split_rng)
    test_set = dataset.drop(train_set.index)
    train_set.reset_index(drop=True, inplace=True)
    test_set.reset_index(drop=True, inplace=True)

    X_train = train_set.iloc[:,:-1].to_numpy()
    Y_train = train_set.iloc[:,-1].to_numpy().astype('int')
    X_test = test_set.iloc[:,:-1].to_numpy()
    Y_test = test_set.iloc[:,-1].to_numpy().astype('int')

    # scikit-learn single classifier
    gnb = GaussianNB()
    gnb.fit(X_train, Y_train)

    Y_pred = gnb.predict(X_test)

    single_accuracies_skl.append((Y_test == Y_pred).sum() / len(Y_test))

    # hand-written single classifier, with and without log-probabilities
    nbc = NaiveBayesClassifier()
    nbc.fit(train_set)

    _, accuracy1, _, cel1 = nbc.predict(test_set, log_prob=True)

    _, accuracy2, _, cel2 = nbc.predict(test_set, log_prob=False)

    single_accuracies_1.append(accuracy1)
    single_accuracies_2.append(accuracy2)
    single_cels_1.append(cel1)
    single_cels_2.append(cel2)

    # scikit-learn bagged ensemble
    bagging = BaggingClassifier(GaussianNB(), n_estimators=nr_of_classifiers)

    bagging.fit(X_train, Y_train)

    Y_pred = bagging.predict(X_test)

    ensemble_accuracies_skl.append((Y_test == Y_pred).sum() / len(Y_test))

    # hand-written ensemble in its four prediction modes
    ens_nbc = EnsembledNBClassifier(nr_of_classifiers)
    ens_nbc.fit(train_set, seed)

    accuracy, cel = ens_nbc.predict(test_set)
    ensemble_accuracies_1.append(accuracy)
    ensemble_cels_1.append(cel)

    accuracy, cel = ens_nbc.predict(test_set, log_prob=True)
    ensemble_accuracies_2.append(accuracy)
    ensemble_cels_2.append(cel)

    # the loss is not accumulated in majority-vote mode, so it is discarded
    accuracy, _ = ens_nbc.predict(test_set, majority_vote=True)
    ensemble_accuracies_3.append(accuracy)

    accuracy, _ = ens_nbc.predict(test_set, log_prob=True, majority_vote=True)
    ensemble_accuracies_4.append(accuracy)


mean_single_accuracy_skl = fsum(single_accuracies_skl) / n
mean_single_accuracy_1 = fsum(single_accuracies_1) / n
mean_single_accuracy_2 = fsum(single_accuracies_2) / n
mean_single_cel_1 = fsum(single_cels_1) / n
mean_single_cel_2 = fsum(single_cels_2) / n

mean_ensemble_accuracy_skl = fsum(ensemble_accuracies_skl) / n
mean_ensemble_accuracy_1 = fsum(ensemble_accuracies_1) / n
mean_ensemble_accuracy_2 = fsum(ensemble_accuracies_2) / n
mean_ensemble_accuracy_3 = fsum(ensemble_accuracies_3) / n
mean_ensemble_accuracy_4 = fsum(ensemble_accuracies_4) / n
mean_ensemble_cel_1 = fsum(ensemble_cels_1) / n
mean_ensemble_cel_2 = fsum(ensemble_cels_2) / n

print("Single accuracy sklearn", mean_single_accuracy_skl)
print("Single accuracy with log", mean_single_accuracy_1)
print("Single cel with log", mean_single_cel_1)
print("Single accuracy without log", mean_single_accuracy_2)
print("Single cel without log", mean_single_cel_2)
print("Ensemble accuracy sklearn", mean_ensemble_accuracy_skl)
print("Ensemble accuracy", mean_ensemble_accuracy_1)
print("Ensemble cel", mean_ensemble_cel_1)
print("Ensemble accuracy with log", mean_ensemble_accuracy_2)
print("Ensemble cel with log", mean_ensemble_cel_2)
print("Ensemble accuracy with majority vote", mean_ensemble_accuracy_3)
print("Ensemble accuracy with log and majority vote", mean_ensemble_accuracy_4)


Single accuracy sklearn 0.9525666666666667
Single accuracy with log 0.9528333333333334
Single cel with log 4.138827005881541
Single accuracy without log 0.9528333333333334
Single cel without log 4.138827005881541
Ensemble accuracy sklearn 0.9521333333333333
Ensemble accuracy 0.9524666666666667
Ensemble cel 3.9306941522192274
Ensemble accuracy with log 0.9524666666666667
Ensemble cel with log 3.9306941522192274
Ensemble accuracy with majority vote 0.9525666666666667
Ensemble accuracy with log and majority vote 0.9526


In [1267]:
# Scratch cell: every element gets a random key, so `max` returns a randomly
# chosen element of `arr`; `index` then gives the first position of that value.
# NOTE(review): relies on `arr` and on `random` being the function imported by
# `from random import random` in the scratch cell shown below.
arr.index(max(arr, key=lambda x: random()))

3

In [77]:
# NOTE(review): this import rebinds the name `random` from the module to the
# function `random.random`. Code in this notebook that calls `random.seed(...)`
# or `random.random()` stops working in the same kernel once this cell has run.
from random import random
arr = [0.0000000001, 0.00000000011, 0.0000000002, 0.00000000012, 0.0000000002]
# Scratch check of random tie-breaking: each distinct value is ranked by how
# often it occurs in `arr` plus a random number below 1, so the most frequent
# value wins and equal counts are decided at random.
max(set(arr), key = lambda x: arr.count(x) + random())

int