# Answer to graded assignment 2 in DTE-2501 (AI Methods and Applications) about ensemble methods by Abdullah Karagøz

## 1. Bootstrapping

In [247]:
import pandas as pd
from math import sqrt
from math import pi
from math import exp
from math import log

seed = 15

# Load the data
dataset = pd.read_csv('iris.data', header=None, names=['sepal length', 'sepal width',
                                                     'petal length', 'petal width',
                                                     'class'])

In [248]:
#Assign label to each class
dataset.loc[dataset['class'] == 'Iris-setosa', dataset.columns == 'class'] = 0
dataset.loc[dataset['class'] == 'Iris-versicolor', dataset.columns == 'class'] = 1
dataset.loc[dataset['class'] == 'Iris-virginica', dataset.columns == 'class'] = 2

In [252]:
# Shuffle and split data into training 80% and testing 20%
train_set = dataset.sample(frac=0.8, random_state=seed)
test_set = dataset.drop(train_set.index)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)



In [253]:
train_set[train_set['class']==0].iloc[:,0].values.tolist()

test_set

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,4.9,3.0,1.4,0.2,0
1,5.4,3.7,1.5,0.2,0
2,5.7,4.4,1.5,0.4,0
3,5.1,3.5,1.4,0.3,0
4,5.1,3.8,1.5,0.3,0
5,5.1,3.3,1.7,0.5,0
6,5.0,3.4,1.6,0.4,0
7,4.9,3.1,1.5,0.1,0
8,4.9,3.1,1.5,0.1,0
9,5.1,3.4,1.5,0.2,0


### Naive Bayes Classifier

In [254]:
# Naive Bayes Classifier
class NaiveBayesClassifier():
    def __init__(self):
        self.dataset = 0
        self.mean_values = 0
        self.std_values = 0
        self.prior_class_probabilities = 0
        self.nr_of_classes = 0
        self.nr_of_attributes = 0
        
    def fit(self, train_set, in_ensemble=False, nr_of_classes = 0,
            nr_of_attributes=0):
        self.dataset = train_set
        
        # calculating mean, std values and prior probabilities
        
        # if not in ensemble, we calculate these values
        if not in_ensemble:
            self.nr_of_classes=train_set.groupby('class').ngroups
            self.nr_of_attributes=len(train_set.iloc[0,:-1])
        else:
            self.nr_of_classes = nr_of_classes
            self.nr_of_attributes = nr_of_attributes
        
        # Here we keep mean, std and prior class probability values
        self.mean_values = list()
        self.std_values = list()
        self.prior_class_probs = list()
        
        for i in range(self.nr_of_classes):
            class_values = train_set[train_set['class'] == i]
            prior_class_prob = len(class_values) / len(train_set)
            self.prior_class_probs.append(prior_class_prob)
            mean_values = list()
            std_values = list()   
            for j in range(self.nr_of_attributes):
                values = class_values.iloc[:,j].values.tolist()
                std, mean = self.std_and_mean(values)
                mean_values.append(mean)
                std_values.append(std)
            self.mean_values.append(mean_values)
            self.std_values.append(std_values)
            


    def mean(self, val_list):
        return sum(val_list) / len(val_list)
    
    def std_and_mean(self, val_list):
        mu = self.mean(val_list)
        std = sqrt(sum([(x - mu)**2 for x in val_list]) / (len(val_list)-1))
        return std, mu
        
    
    def gaussian_pdf(self, x, mu, sd):
        return (1 / (sd*sqrt(2*pi))) * exp(-0.5*((x-mu)/sd)**2)
    
    
    def pred_row(self, row, log_prob = False):
        probs = self.nr_of_classes*[0]  
        if log_prob:
            for i in range(self.nr_of_classes):
                probs[i] = log(self.prior_class_probs[i])
                for j in range(len(row)):
                    probs[i] += log(self.gaussian_pdf(row[j], self.mean_values[i][j], self.std_values[i][j]))
        
        else: 
            for i in range(self.nr_of_classes):
                probs[i] = self.prior_class_probs[i]
                for j in range(len(row)):
                    probs[i] *= self.gaussian_pdf(row[j], self.mean_values[i][j], self.std_values[i][j])
        
        prediction = probs.index(max(probs))
        return probs, prediction   
    
    def predict(self, test_set, log_prob = False):
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()
        probabilities = list()
        corrects = 0
        for x, y in zip(X_test, Y_test):
            prob, pred = self.pred_row(x, log_prob)
            probabilities.append(prob)
            if pred == y:
                corrects += 1    
        accuracy = corrects / len(Y_test)
        return probabilities, accuracy

In [255]:
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

In [257]:
probabilities, accuracy = nbc.predict(test_set)
print(accuracy)

0.9333333333333333


### Ensembled Classifier

In [298]:
class EnsembledNBClassifier():
    def __init__(self, nr_of_classes):
        self.classifiers = list()
        for i in range(nr_of_classes):
            self.classifiers.append(NaiveBayesClassifier())
    
    def fit(self, train_set, seed=None):
        self.nr_of_classes=train_set.groupby('class').ngroups
        self.nr_of_attributes=len(train_set.iloc[0,:-1])
        for cl in self.classifiers:
            bag = train_set.sample(frac=1, replace=True, random_state=seed).reset_index(drop=True)
            cl.fit(bag, True, self.nr_of_classes, self.nr_of_attributes)
        
    
    def predict(self, test_set, log_prob = False, majority_vote = False):
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()

        corrects = 0
        for x, y in zip(X_test, Y_test):
            preds = list()
            probs = list()

            for cl in self.classifiers:
                prob, pred = cl.pred_row(x, log_prob)
                preds.append(pred)
                probs.append(prob)
            if majority_vote:
                prediction = max(set(preds), key = preds.count)
            else:
                if log_prob:           
                    agg_probs = [sum(x)/len(probs) for x in zip(*probs)]
                else:
                    agg_probs = self.nr_of_classes*[1]
                    for i in range(len(probs[0])):
                        for j in range(len(probs)):
                            agg_probs[i] *= probs[j][i]
                        agg_probs[i] = agg_probs[i]**(1/len(probs))
                    print("agg_probs", agg_probs)
                prediction = agg_probs.index(max(agg_probs))
            if prediction == y:
                corrects += 1
                
                
        accuracy = corrects / len(Y_test)
        return accuracy 
        
        

## Testing and validation

In [302]:
# Testing simple Naive Bayes Classifier
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

_, accuracy = nbc.predict(test_set)
print("Accuracy", accuracy)

_, accuracy_log = nbc.predict(test_set, log_prob=True)
print("Accuracy with log", accuracy_log)

Accuracy 0.9333333333333333
Accuracy with log 0.9333333333333333


In [304]:
ens_nbc = EnsembledNBClassifier(10)
ens_nbc.fit(train_set, seed)

accuracy = ens_nbc.predict(test_set)
print("Accuracy without log", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True)
print("Accuracy with log", accuracy)

accuracy = ens_nbc.predict(test_set, majority_vote=True)
print("Accuracy with majority vote", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True, majority_vote=True)
print("Accuracy with log, majority vote", accuracy)

agg_probs [1.7370661585038423, 5.248602708495512e-15, 6.913955165261866e-26]
agg_probs [0.4556951768206084, 5.092903442876646e-16, 2.3554178533491195e-26]
agg_probs [0.00012547034155329742, 1.5446348466918402e-18, 1.4085222093854465e-28]
agg_probs [2.120856008874183, 7.427839002151295e-15, 2.6140705285082213e-25]
agg_probs [0.7262546003232074, 7.887542389585376e-16, 4.27661868228808e-26]
agg_probs [0.17269359016976543, 4.0552164622019316e-11, 7.723162028063709e-21]
agg_probs [1.1724325691663677, 7.514075604998595e-13, 6.929531260386569e-23]
agg_probs [1.2548728532949982, 6.834146286820386e-16, 1.1231647720274061e-26]
agg_probs [1.2548728532949982, 6.834146286820386e-16, 1.1231647720274061e-26]
agg_probs [2.5690554438428657, 3.801200951610037e-15, 1.237183896006798e-25]
agg_probs [0.0366892810084327, 2.5841234882158393e-13, 8.103402162664932e-23]
agg_probs [0.0, 0.012011921504527533, 0.0037405565010077343]
agg_probs [0.0, 0.05698265614018797, 1.8775603647895137e-06]
agg_probs [0.0, 0.03