# Answer to graded assignment 2 in DTE-2501 (AI Methods and Applications) about ensemble methods by Abdullah Karagøz

## 1. Bootstrapping

In [28]:
import pandas as pd
import numpy as np
from math import sqrt
from math import pi
from math import exp
from math import log
from math import fsum
import platform
platform.architecture()
import random

# Seed used for the `random` module here and passed to the pandas sampling
# calls further down. None means "no fixed seed", so results differ between
# runs; set it to an integer to make a run repeatable.
seed = None

random.seed(seed)

In [29]:

# Read the Iris data file. It has no header row, so the four measurement
# columns and the trailing class column are named explicitly here.
dataset = pd.read_csv(
    'iris.data',
    header=None,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)

In [30]:
# Replace each species name in the 'class' column by an integer label:
# 0 = Iris-setosa, 1 = Iris-versicolor, 2 = Iris-virginica.
for species_id, species_name in enumerate(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')):
    dataset.loc[dataset['class'] == species_name, dataset.columns == 'class'] = species_id

In [31]:
# Scratch cell: list the first-column values of the class-0 training rows,
# then display the whole training frame.
# NOTE: `train_set` is only created in the "Testing and validation" cell
# further down, so that cell has to be executed before this one.
train_set[train_set['class']==0].iloc[:,0].values.tolist()

train_set

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.7,4.4,1.5,0.4,0
1,6.4,2.8,5.6,2.2,2
2,4.8,3.1,1.6,0.2,0
3,6.5,3.0,5.2,2.0,2
4,7.0,3.2,4.7,1.4,1
...,...,...,...,...,...
115,6.3,2.9,5.6,1.8,2
116,5.8,2.7,5.1,1.9,2
117,6.2,2.9,4.3,1.3,1
118,5.6,2.9,3.6,1.3,1


### Naive Bayes Classifier

In [35]:
# Naive Bayes Classifier
class NaiveBayesClassifier():
    """Gaussian naive Bayes classifier working on pandas DataFrames.

    The frames passed to ``fit`` and ``predict`` must hold the attribute
    columns first and the label column last; ``fit`` additionally requires
    the label column to be named ``'class'``. Class labels are expected to
    be the integers ``0 .. nr_of_classes - 1``, because the rows of class
    ``i`` are selected with ``train_set['class'] == i``.
    """

    def __init__(self):
        self.dataset = 0
        self.mean_values = 0
        self.std_values = 0
        # `prior_class_probs` is the name used by fit/pred_row; the longer
        # name is kept as an alias of the same list for older code.
        self.prior_class_probs = list()
        self.prior_class_probabilities = self.prior_class_probs
        self.nr_of_classes = 0
        self.nr_of_attributes = 0

    def fit(self, train_set, in_ensemble=False, nr_of_classes = 0,
            nr_of_attributes=0):
        """Estimate per-class priors, means and standard deviations.

        Parameters:
            train_set: DataFrame with attribute columns followed by 'class'.
            in_ensemble: when True, the class and attribute counts are taken
                from the two following arguments instead of being derived
                from ``train_set`` (a bootstrap bag may miss a class).
            nr_of_classes: number of classes, used only if ``in_ensemble``.
            nr_of_attributes: number of attributes, used only if
                ``in_ensemble``.

        A class without any row in ``train_set`` gets prior 0 and NaN
        statistics; ``pred_row`` then assigns it probability 0. A class with
        exactly one row still raises ZeroDivisionError, because the sample
        standard deviation is undefined for a single value.
        """
        self.dataset = train_set

        if not in_ensemble:
            self.nr_of_classes = train_set.groupby('class').ngroups
            self.nr_of_attributes = len(train_set.iloc[0, :-1])
        else:
            self.nr_of_classes = nr_of_classes
            self.nr_of_attributes = nr_of_attributes

        # One entry per class; mean/std entries are lists with one value
        # per attribute.
        self.mean_values = list()
        self.std_values = list()
        self.prior_class_probs = list()
        self.prior_class_probabilities = self.prior_class_probs

        for i in range(self.nr_of_classes):
            class_values = train_set[train_set['class'] == i]
            self.prior_class_probs.append(len(class_values) / len(train_set))

            if len(class_values) == 0:
                # Class absent from this training set: no statistics can be
                # computed, and the zero prior makes pred_row skip them.
                self.mean_values.append([float('nan')] * self.nr_of_attributes)
                self.std_values.append([float('nan')] * self.nr_of_attributes)
                continue

            mean_values = list()
            std_values = list()
            for j in range(self.nr_of_attributes):
                values = class_values.iloc[:, j].values.tolist()
                std, mean = self.std_and_mean(values)
                mean_values.append(mean)
                std_values.append(std)
            self.mean_values.append(mean_values)
            self.std_values.append(std_values)

    def mean(self, val_list):
        """Return the arithmetic mean of ``val_list``."""
        return sum(val_list) / len(val_list)

    def std_and_mean(self, val_list):
        """Return ``(std, mean)`` using the sample (n - 1) standard deviation."""
        mu = self.mean(val_list)
        std = sqrt(fsum([(x - mu)**2 for x in val_list]) / (len(val_list)-1))
        return std, mu

    def gaussian_pdf(self, x, mu, sd, log_prob):
        """Return the normal density of ``x``, or its logarithm if ``log_prob``."""
        if log_prob:
            return_val = (-0.5*((x-mu)/sd)**2) - log(sd*sqrt(2*pi))
        else:
            return_val = (1 / (sd*sqrt(2*pi))) * exp(-0.5*((x-mu)/sd)**2)
        return return_val

    def _log_joint(self, row):
        """Return log(prior * likelihood) per class; -inf for a zero prior."""
        scores = list()
        for i in range(self.nr_of_classes):
            prior = self.prior_class_probs[i]
            if prior == 0:
                scores.append(float('-inf'))
                continue
            score = log(prior)
            for x, mu, sd in zip(row, self.mean_values[i], self.std_values[i]):
                score += self.gaussian_pdf(x, mu, sd, True)
            scores.append(score)
        return scores

    def _normalize_log_scores(self, scores):
        """Turn log scores into probabilities that sum to 1.

        The largest score is subtracted before exponentiating so that at
        least one term equals exp(0) = 1 and the sum can never underflow
        to zero.
        """
        top = max(scores)
        shifted = [exp(s - top) for s in scores]
        total = fsum(shifted)
        return [v / total for v in shifted]

    def pred_row(self, row, log_prob = False):
        """Classify one row of attribute values.

        Parameters:
            row: sequence of attribute values (without the class label).
            log_prob: accumulate log densities instead of multiplying
                densities.

        Returns:
            ``(probs, prediction)``: the normalised class probabilities and
            the index of the most probable class.
        """
        if log_prob:
            probs = self._normalize_log_scores(self._log_joint(row))
        else:
            probs = list()
            for i in range(self.nr_of_classes):
                joint = self.prior_class_probs[i]
                if joint > 0:
                    for x, mu, sd in zip(row, self.mean_values[i], self.std_values[i]):
                        joint *= self.gaussian_pdf(x, mu, sd, False)
                probs.append(joint)
            total = fsum(probs)
            if total > 0:
                probs = [p / total for p in probs]
            else:
                # Every product underflowed to 0; redo the computation in
                # the log domain instead of dividing by zero.
                probs = self._normalize_log_scores(self._log_joint(row))

        prediction = probs.index(max(probs))

        return probs, prediction

    def predict(self, test_set, log_prob = False):
        """Classify every row of ``test_set`` and measure the accuracy.

        The last column of ``test_set`` is taken as the true label, all
        other columns as attributes.

        Returns:
            ``(probabilities, accuracy, predictions)``: per-row class
            probabilities, the fraction of correct predictions and the
            per-row predicted class. Raises ZeroDivisionError for an empty
            ``test_set``.
        """
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()
        probabilities = list()
        predictions = list()
        corrects = 0
        for x, y in zip(X_test, Y_test):
            prob, pred = self.pred_row(x, log_prob)
            probabilities.append(prob)
            predictions.append(pred)
            if pred == y:
                corrects += 1
        accuracy = corrects / len(Y_test)
        return probabilities, accuracy, predictions

### Ensembled Classifier

In [36]:
class EnsembledNBClassifier():
    """Bagging ensemble of ``NaiveBayesClassifier`` models.

    Each base classifier is trained on a bootstrap sample (same size as the
    training set, drawn with replacement); predictions are combined either
    by averaging the class probabilities or by majority vote.
    """

    def __init__(self, nr_of_classes):
        """Create the base classifiers.

        NOTE: despite its name, ``nr_of_classes`` is the number of base
        classifiers in the ensemble, not the number of target classes. The
        parameter name is kept for compatibility with existing callers.
        """
        self.classifiers = [NaiveBayesClassifier() for _ in range(nr_of_classes)]

    def fit(self, train_set, seed=None):
        """Train every base classifier on its own bootstrap bag.

        Parameters:
            train_set: DataFrame with attribute columns followed by 'class'.
            seed: None for unseeded sampling, or a seed for repeatable bags.
        """
        self.nr_of_classes = train_set.groupby('class').ngroups
        self.nr_of_attributes = len(train_set.iloc[0, :-1])

        # A plain seed handed to every sample() call would produce the very
        # same bag for each classifier. One shared generator, seeded once,
        # keeps the run repeatable while giving each classifier its own bag.
        if seed is None or isinstance(seed, (np.random.RandomState, np.random.Generator)):
            rng = seed
        else:
            rng = np.random.RandomState(seed)

        for cl in self.classifiers:
            bag = train_set.sample(frac=1, replace=True, random_state=rng).reset_index(drop=True)
            cl.fit(bag, True, self.nr_of_classes, self.nr_of_attributes)

    def predict(self, test_set, log_prob = False, majority_vote = False):
        """Return the accuracy of the ensemble on ``test_set``.

        Parameters:
            test_set: DataFrame whose last column is the true label and
                whose other columns are the attributes.
            log_prob: forwarded to each base classifier's ``pred_row``.
            majority_vote: if True, the most frequently predicted class
                wins (ties are broken at random); otherwise the class with
                the highest arithmetic-mean probability wins.

        Returns:
            Fraction of correctly classified rows. Raises ZeroDivisionError
            for an empty ``test_set``.
        """
        # Imported locally: a later notebook cell runs
        # `from random import random`, which rebinds the global name
        # `random` to a function and would break `random.<anything>` here.
        import random as rnd
        from collections import Counter

        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()

        corrects = 0  # nr of correct predictions
        for x, y in zip(X_test, Y_test):
            results = [cl.pred_row(x, log_prob) for cl in self.classifiers]

            if majority_vote:
                votes = Counter(pred for _, pred in results)
                top = max(votes.values())
                tied = [label for label, count in votes.items() if count == top]
                # Only draw a random number when there really is a tie.
                prediction = tied[0] if len(tied) == 1 else rnd.choice(tied)
            else:
                probs = [prob for prob, _ in results]
                # Arithmetic mean of the per-classifier probabilities,
                # computed class by class.
                agg_probs = [fsum(column) / len(probs) for column in zip(*probs)]
                prediction = agg_probs.index(max(agg_probs))

            if prediction == y:
                corrects += 1

        accuracy = corrects / len(Y_test)
        return accuracy
        
        

## Testing and validation

In [37]:
# Fraction of the data used for training; the remainder is the test set.
train_test_split = 0.8

# Shuffle and split data into training 80% and testing 20%
train_set = dataset.sample(frac=train_test_split, random_state=seed)
test_set = dataset.drop(train_set.index)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

# --- Single naive Bayes classifier, with and without log probabilities ---
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

probabilities, accuracy, preds = nbc.predict(test_set, log_prob=True)
preds = np.array(preds)

probabilities2, accuracy2, preds2 = nbc.predict(test_set, log_prob=False)
# Convert the predictions of THIS run (the original converted `preds`
# again, so the "without log prob" printout showed the log-prob results).
preds2 = np.array(preds2)

# --- Reference: scikit-learn's Gaussian naive Bayes on the same split ---
from sklearn.naive_bayes import GaussianNB

X_train = train_set.iloc[:,:-1].to_numpy()
Y_train = train_set.iloc[:,-1].to_numpy().astype('int')
X_test = test_set.iloc[:,:-1].to_numpy()
Y_test = test_set.iloc[:,-1].to_numpy().astype('int')

gnb = GaussianNB()
gnb.fit(X_train, Y_train)

Y_pred = gnb.predict(X_test)

print("Accuracy with sklearn classifier", (Y_test == Y_pred).sum() / len(Y_test))
print("Accuracy with my classifier with log prob", accuracy)
print("Accuracy with my classifier without log prob", accuracy2)
print("Predictions with sklearn classifier", Y_pred)
print("Predictions with log prob", preds)
print("Predictions without log prob", preds2)
print("Values from test set", Y_test)


# --- Reference: scikit-learn bagging of 10 Gaussian naive Bayes models ---
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(GaussianNB(), n_estimators=10)

bagging.fit(X_train, Y_train)

Y_pred = bagging.predict(X_test)

print("Accuracy with sklearn ensembled classifier", (Y_test == Y_pred).sum() / len(Y_test))

# --- Own ensemble of 10 classifiers, all four combination modes ---
ens_nbc = EnsembledNBClassifier(10)
ens_nbc.fit(train_set, seed)

accuracy = ens_nbc.predict(test_set)
print("Ensemble accuracy with aggregation", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True)
print("Ensemble accuracy with log and aggregation", accuracy)

accuracy = ens_nbc.predict(test_set, majority_vote=True)
print("Accuracy with majority vote", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True, majority_vote=True)
print("Accuracy with log and majority vote", accuracy)

probs [0.9999999999999989, 1.1014533204928002e-15, 2.937269768647707e-24]
probs [1.0, 4.070172260962441e-17, 5.228286905491405e-25]
probs [0.9999999999991893, 8.106886385287642e-13, 4.431231441656734e-20]
probs [0.9999999999999104, 8.964644105585103e-14, 2.146864381771565e-21]
probs [0.9999999999980904, 1.9096910816883832e-12, 5.0312263490143836e-20]
probs [0.9999999999999961, 3.889185995774775e-15, 2.5408117280881083e-23]
probs [0.9999999999999917, 8.365076303466438e-15, 4.520796481896116e-23]
probs [0.9999999999991939, 8.061518959675969e-13, 6.387456308476276e-21]
probs [0.9999999999999996, 4.944948430830768e-16, 2.6788650365926155e-24]
probs [0.9999999999999999, 6.971644457536595e-17, 1.8693663086298545e-25]
probs [0.9999999999081146, 9.1885328859068e-11, 5.03244601302297e-18]
probs [0.9999999999999883, 1.1670384217606378e-14, 2.0982692451986738e-23]
probs [8.176464218964133e-110, 0.9979290081088095, 0.002070991891190595]
probs [3.339400334055089e-101, 0.9926128652320344, 0.00738713

In [1178]:
# Scratch cell: pick the most frequent value of `arr`. A random number in
# [0, 1) is added to every count, which can only change the winner among
# values whose counts are equal, i.e. it breaks ties at random.
# NOTE(review): this import rebinds the global name `random` from the module
# (imported at the top of the notebook) to the function, so code that calls
# `random.random()` or `random.seed()` through the module name fails after
# this cell has run, until `import random` is executed again.
from random import random
arr = [0.0000000001, 0.00000000011, 0.0000000002, 0.00000000012, 0.0000000002]
max(set(arr), key = lambda x: arr.count(x) + random())

1e-10

In [1267]:
# Scratch cell: choose an element of `arr` by giving every element a random
# key, then look up its position. `arr.index` returns the first occurrence,
# and 0.0000000002 appears twice (positions 2 and 4), so position 4 can
# never be returned. Relies on `random` being the function imported by the
# previous scratch cell.
arr.index(max(arr, key=lambda x: random()))

3

int