# Answer to graded assignment 2 in DTE-2501 (AI Methods and Applications) about ensemble methods by Abdullah Karagøz

## 1. Bootstrapping

In [247]:
import pandas as pd
from math import sqrt
from math import pi
from math import exp
from math import log

seed = 15

# Load the data
dataset = pd.read_csv('iris.data', header=None, names=['sepal length', 'sepal width',
                                                     'petal length', 'petal width',
                                                     'class'])

In [248]:
#Assign label to each class
dataset.loc[dataset['class'] == 'Iris-setosa', dataset.columns == 'class'] = 0
dataset.loc[dataset['class'] == 'Iris-versicolor', dataset.columns == 'class'] = 1
dataset.loc[dataset['class'] == 'Iris-virginica', dataset.columns == 'class'] = 2

In [252]:
# Shuffle and split data into training 80% and testing 20%
train_set = dataset.sample(frac=0.8, random_state=seed)
test_set = dataset.drop(train_set.index)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)



In [253]:
train_set[train_set['class']==0].iloc[:,0].values.tolist()

test_set

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,4.9,3.0,1.4,0.2,0
1,5.4,3.7,1.5,0.2,0
2,5.7,4.4,1.5,0.4,0
3,5.1,3.5,1.4,0.3,0
4,5.1,3.8,1.5,0.3,0
5,5.1,3.3,1.7,0.5,0
6,5.0,3.4,1.6,0.4,0
7,4.9,3.1,1.5,0.1,0
8,4.9,3.1,1.5,0.1,0
9,5.1,3.4,1.5,0.2,0


### Naive Bayes Classifier

In [421]:
# Naive Bayes Classifier
class NaiveBayesClassifier():
    def __init__(self):
        self.dataset = 0
        self.mean_values = 0
        self.std_values = 0
        self.prior_class_probabilities = 0
        self.nr_of_classes = 0
        self.nr_of_attributes = 0
        
    def fit(self, train_set, in_ensemble=False, nr_of_classes = 0,
            nr_of_attributes=0):
        self.dataset = train_set
        
        # calculating mean, std values and prior probabilities
        
        # if not in ensemble, we calculate these values
        if not in_ensemble:
            self.nr_of_classes=train_set.groupby('class').ngroups
            self.nr_of_attributes=len(train_set.iloc[0,:-1])
        else:
            self.nr_of_classes = nr_of_classes
            self.nr_of_attributes = nr_of_attributes
        
        # Here we keep mean, std and prior class probability values
        self.mean_values = list()
        self.std_values = list()
        self.prior_class_probs = list()
        
        for i in range(self.nr_of_classes):
            class_values = train_set[train_set['class'] == i]
            prior_class_prob = len(class_values) / len(train_set)
            self.prior_class_probs.append(prior_class_prob)
            mean_values = list()
            std_values = list()   
            for j in range(self.nr_of_attributes):
                values = class_values.iloc[:,j].values.tolist()
                std, mean = self.std_and_mean(values)
                mean_values.append(mean)
                std_values.append(std)
            self.mean_values.append(mean_values)
            self.std_values.append(std_values)
        
        print(self.std_values)


    def mean(self, val_list):
        return sum(val_list) / len(val_list)
    
    def std_and_mean(self, val_list):
        mu = self.mean(val_list)
        std = sqrt(sum([(x - mu)**2 for x in val_list]) / (len(val_list)-1))
        return std, mu
        
    
    def gaussian_pdf(self, x, mu, sd):
        return_val = (1 / (sd*sqrt(2*pi))) * (exp(-0.5*((x-mu)/sd)**2))
        return return_val
    
    
    def pred_row(self, row, log_prob = False):
        probs = self.nr_of_classes*[0]  
        if log_prob:
            for i in range(self.nr_of_classes):
                probs[i] = 0 if self.prior_class_probs[i] == 0 else log(self.prior_class_probs[i])
                for j in range(len(row)):
                    gaussian_val = self.gaussian_pdf(row[j], self.mean_values[i][j], self.std_values[i][j])
                    probs[i] += -float('inf') if gaussian_val == 0 else log(gaussian_val)
        
        else: 
            for i in range(self.nr_of_classes):
                probs[i] = self.prior_class_probs[i]
                for j in range(len(row)):
                    probs[i] *= self.gaussian_pdf(row[j], self.mean_values[i][j], self.std_values[i][j])
        
        prediction = probs.index(max(probs))
        return probs, prediction   
    
    def predict(self, test_set, log_prob = False):
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()
        probabilities = list()
        corrects = 0
        for x, y in zip(X_test, Y_test):
            prob, pred = self.pred_row(x, log_prob)
            probabilities.append(prob)
            if pred == y:
                corrects += 1    
        accuracy = corrects / len(Y_test)
        return probabilities, accuracy

In [422]:
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

[[0.37467034543220634, 0.3755338080994054, 0.1757958575614411, 0.0982299486257503], [0.508704715728956, 0.28200069192096405, 0.4903955596027037, 0.1959342846528882], [0.6435279365561414, 0.32800875535531615, 0.5297314268715654, 0.2609646797989415]]


In [423]:
probabilities, accuracy = nbc.predict(test_set)
print(accuracy)

0.9333333333333333


### Ensembled Classifier

In [424]:
class EnsembledNBClassifier():
    def __init__(self, nr_of_classes):
        self.classifiers = list()
        for i in range(nr_of_classes):
            self.classifiers.append(NaiveBayesClassifier())
    
    def fit(self, train_set, seed=None):
        self.nr_of_classes=train_set.groupby('class').ngroups
        self.nr_of_attributes=len(train_set.iloc[0,:-1])
        for cl in self.classifiers:
            bag = train_set.sample(frac=1, replace=True, random_state=seed).reset_index(drop=True)
            cl.fit(bag, True, self.nr_of_classes, self.nr_of_attributes)
        
    
    def predict(self, test_set, log_prob = False, majority_vote = False):
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()

        corrects = 0
        for x, y in zip(X_test, Y_test):
            preds = list()
            probs = list()

            for cl in self.classifiers:
                prob, pred = cl.pred_row(x, log_prob)
                preds.append(pred)
                probs.append(prob)
            if majority_vote:
                prediction = max(set(preds), key = preds.count)
            else:
                if log_prob:           
                    agg_probs = [sum(x)/len(probs) for x in zip(*probs)]
                else:
                    agg_probs = self.nr_of_classes*[1]
                    for i in range(len(probs[0])):
                        for j in range(len(probs)):
                            agg_probs[i] *= probs[j][i]
                        agg_probs[i] = agg_probs[i]**(1/len(probs))
                prediction = agg_probs.index(max(agg_probs))
            if prediction == y:
                corrects += 1
                
                
        accuracy = corrects / len(Y_test)
        return accuracy 
        
        

## Testing and validation

In [425]:
# Testing simple Naive Bayes Classifier
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

_, accuracy = nbc.predict(test_set)
print("Accuracy", accuracy)

_, accuracy_log = nbc.predict(test_set, log_prob=True)
print("Accuracy with log", accuracy_log)

[[0.37467034543220634, 0.3755338080994054, 0.1757958575614411, 0.0982299486257503], [0.508704715728956, 0.28200069192096405, 0.4903955596027037, 0.1959342846528882], [0.6435279365561414, 0.32800875535531615, 0.5297314268715654, 0.2609646797989415]]
Accuracy 0.9333333333333333
Accuracy with log 0.9333333333333333


In [426]:
ens_nbc = EnsembledNBClassifier(10)
ens_nbc.fit(train_set)

accuracy = ens_nbc.predict(test_set)
print("Accuracy without log", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True)
print("Accuracy with log", accuracy)

accuracy = ens_nbc.predict(test_set, majority_vote=True)
print("Accuracy with majority vote", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True, majority_vote=True)
print("Accuracy with log, majority vote", accuracy)

[[0.4148333322533124, 0.37759852378060266, 0.16408961949436793, 0.11482806221027914], [0.4651135476901291, 0.28943999413364013, 0.4625792816769194, 0.19609452114857717], [0.7491104254964566, 0.3114231089271097, 0.6097885742313953, 0.2901047046965206]]
[[0.40153645798805887, 0.33601548169821976, 0.19752158951649598, 0.12034433356286307], [0.5413776239648991, 0.2873529065051644, 0.4313672516869322, 0.17527565180754215], [0.5933208224128562, 0.30681994361721093, 0.47000967108038394, 0.282842712474619]]
[[0.32697588446014214, 0.35863658367003626, 0.18440246046438766, 0.10630881770867996], [0.48691659034936047, 0.29023712486944575, 0.4895313651307836, 0.17030167225975226], [0.5911106196862668, 0.29234133835384435, 0.56486554486795, 0.2055599716890044]]
[[0.3938460484142909, 0.3161850873249617, 0.1896441728097696, 0.1163432958237426], [0.5392546334218883, 0.32465736581877913, 0.5370285187699513, 0.2048216209100724], [0.8381500722853957, 0.41543311162174956, 0.6179792842174607, 0.257161247680

In [427]:
arr = [1,2,3,4]
arr[2] = arr[2]**2
arr

[1, 2, 9, 4]

0.0