# Answer to graded assignment 2 in DTE-2501 (AI Methods and Applications) about ensemble methods by Abdullah Karagøz

## 1. Bootstrapping

In [247]:
import pandas as pd
from math import sqrt
from math import pi
from math import exp
from math import log

seed = 15

# Load the data
dataset = pd.read_csv('iris.data', header=None, names=['sepal length', 'sepal width',
                                                     'petal length', 'petal width',
                                                     'class'])

In [248]:
#Assign label to each class
dataset.loc[dataset['class'] == 'Iris-setosa', dataset.columns == 'class'] = 0
dataset.loc[dataset['class'] == 'Iris-versicolor', dataset.columns == 'class'] = 1
dataset.loc[dataset['class'] == 'Iris-virginica', dataset.columns == 'class'] = 2

In [252]:
# Shuffle and split data into training 80% and testing 20%
train_set = dataset.sample(frac=0.8, random_state=seed)
test_set = dataset.drop(train_set.index)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)



In [253]:
train_set[train_set['class']==0].iloc[:,0].values.tolist()

test_set

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,4.9,3.0,1.4,0.2,0
1,5.4,3.7,1.5,0.2,0
2,5.7,4.4,1.5,0.4,0
3,5.1,3.5,1.4,0.3,0
4,5.1,3.8,1.5,0.3,0
5,5.1,3.3,1.7,0.5,0
6,5.0,3.4,1.6,0.4,0
7,4.9,3.1,1.5,0.1,0
8,4.9,3.1,1.5,0.1,0
9,5.1,3.4,1.5,0.2,0


### Naive Bayes Classifier

In [254]:
# Naive Bayes Classifier
class NaiveBayesClassifier():
    def __init__(self):
        self.dataset = 0
        self.mean_values = 0
        self.std_values = 0
        self.prior_class_probabilities = 0
        self.nr_of_classes = 0
        self.nr_of_attributes = 0
        
    def fit(self, train_set, in_ensemble=False, nr_of_classes = 0,
            nr_of_attributes=0):
        self.dataset = train_set
        
        # calculating mean, std values and prior probabilities
        
        # if not in ensemble, we calculate these values
        if not in_ensemble:
            self.nr_of_classes=train_set.groupby('class').ngroups
            self.nr_of_attributes=len(train_set.iloc[0,:-1])
        else:
            self.nr_of_classes = nr_of_classes
            self.nr_of_attributes = nr_of_attributes
        
        # Here we keep mean, std and prior class probability values
        self.mean_values = list()
        self.std_values = list()
        self.prior_class_probs = list()
        
        for i in range(self.nr_of_classes):
            class_values = train_set[train_set['class'] == i]
            prior_class_prob = len(class_values) / len(train_set)
            self.prior_class_probs.append(prior_class_prob)
            mean_values = list()
            std_values = list()   
            for j in range(self.nr_of_attributes):
                values = class_values.iloc[:,j].values.tolist()
                std, mean = self.std_and_mean(values)
                mean_values.append(mean)
                std_values.append(std)
            self.mean_values.append(mean_values)
            self.std_values.append(std_values)
            


    def mean(self, val_list):
        return sum(val_list) / len(val_list)
    
    def std_and_mean(self, val_list):
        mu = self.mean(val_list)
        std = sqrt(sum([(x - mu)**2 for x in val_list]) / (len(val_list)-1))
        return std, mu
        
    
    def gaussian_pdf(self, x, mu, sd):
        return (1 / (sd*sqrt(2*pi))) * exp(-0.5*((x-mu)/sd)**2)
    
    
    def pred_row(self, row, log_prob = False):
        probs = self.nr_of_classes*[0]  
        if log_prob:
            for i in range(self.nr_of_classes):
                probs[i] = log(self.prior_class_probs[i])
                for j in range(len(row)):
                    probs[i] += log(self.gaussian_pdf(row[j], self.mean_values[i][j], self.std_values[i][j]))
        
        else: 
            for i in range(self.nr_of_classes):
                probs[i] = self.prior_class_probs[i]
                for j in range(len(row)):
                    probs[i] *= self.gaussian_pdf(row[j], self.mean_values[i][j], self.std_values[i][j])
        
        prediction = probs.index(max(probs))
        return probs, prediction   
    
    def predict(self, test_set, log_prob = False):
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()
        probabilities = list()
        corrects = 0
        for x, y in zip(X_test, Y_test):
            prob, pred = self.pred_row(x, log_prob)
            probabilities.append(prob)
            if pred == y:
                corrects += 1    
        accuracy = corrects / len(Y_test)
        return probabilities, accuracy

In [255]:
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

In [257]:
probabilities, accuracy = nbc.predict(test_set)
print(accuracy)

0.9333333333333333


### Ensembled Classifier

In [290]:
class EnsembledNBClassifier():
    def __init__(self, nr_of_classes):
        self.classifiers = list()
        for i in range(nr_of_classes):
            self.classifiers.append(NaiveBayesClassifier())
    
    def fit(self, train_set, seed=None):
        self.nr_of_classes=train_set.groupby('class').ngroups
        self.nr_of_attributes=len(train_set.iloc[0,:-1])
        for cl in self.classifiers:
            bag = train_set.sample(frac=1, replace=True, random_state=seed).reset_index(drop=True)
            cl.fit(bag, True, self.nr_of_classes, self.nr_of_attributes)
        
    
    def predict(self, test_set, log_prob = False, majority_vote = False):
        X_test = test_set.iloc[:, :-1].values.tolist()
        Y_test = test_set.iloc[:, -1].values.tolist()

        corrects = 0
        for x, y in zip(X_test, Y_test):
            preds = list()
            probs = list()

            for cl in self.classifiers:
                prob, pred = cl.pred_row(x, log_prob)
                preds.append(pred)
                probs.append(prob)
            if majority_vote:
                prediction = max(set(preds), key = preds.count)
            else:
                if log_prob:           
                    agg_probs = [sum(x)/len(probs) for x in zip(*probs)]
                else:
                    agg_probs = self.nr_of_classes*[1]
                    for i in range(len(probs[0])):
                        for j in range(len(probs)):
                            agg_probs[i] *= probs[j][i]
                        agg_probs[i] = agg_probs[i]**(1/len(probs))
                    print("agg_probs", agg_probs)
                prediction = agg_probs.index(max(agg_probs))
            if prediction == y:
                corrects += 1
                
                
        accuracy = corrects / len(Y_test)
        return accuracy 
        
        

## Testing and validation

In [296]:
# Testing simple Naive Bayes Classifier
nbc = NaiveBayesClassifier()
nbc.fit(train_set)

_, accuracy = nbc.predict(test_set)
print("Accuracy", accuracy)

_, accuracy_log = nbc.predict(test_set, log_prob=True)
print("Accuracy with log", accuracy_log)

Accuracy 0.9333333333333333
Accuracy with log 0.9333333333333333


In [294]:
ens_nbc = EnsembledNBClassifier(10)
ens_nbc.fit(train_set)

accuracy = ens_nbc.predict(test_set)
print("Accuracy without log", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True)
print("Accuracy with log", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=False, majority_vote=True)
print("Accuracy with majority vote", accuracy)

accuracy = ens_nbc.predict(test_set, log_prob=True, majority_vote=True)
print("Accuracy with log, majority vote", accuracy)

agg_probs [1.9157695676670417, 3.4977221165893038e-19, 3.299040510336957e-27]
agg_probs [0.9313989305836752, 1.5190240218083504e-20, 9.03529068652365e-27]
agg_probs [0.0009847571288435434, 2.5794377381435236e-23, 4.4874800181531494e-27]
agg_probs [2.46145292645995, 5.134649200519097e-19, 3.7304969046409744e-26]
agg_probs [1.2626721263179725, 4.1841137794207576e-20, 2.76042823449782e-26]
agg_probs [0.014586182891401851, 5.275802932126866e-14, 8.203526507226078e-22]
agg_probs [0.4537379984346911, 2.9431892341491976e-16, 9.242627273148057e-24]
agg_probs [0.8719209038493538, 2.4925772737691313e-20, 7.70390078517229e-28]
agg_probs [0.8719209038493538, 2.4925772737691313e-20, 7.70390078517229e-28]
agg_probs [3.018114555394789, 2.1442121857769893e-19, 1.5528871426139556e-26]
agg_probs [0.010197279647599228, 1.2880159129091077e-16, 8.888334035358874e-23]
agg_probs [0.0, 0.006497933239253412, 0.0026152566311896525]
agg_probs [0.0, 0.116881532143028, 1.4140909626304866e-06]
agg_probs [0.0, 0.010

## 2. Implement and train Naïve Bayes classifiers

In [198]:
def func(x, y=2*x):
    return x + y

func(x=2)
        

NameError: name 'x' is not defined

In [206]:
# Should I make a more generalized class, or specified for this assignment?
class EnsempledNBClassifiers:
    def __init__(self, datasets):
        self.classifiers = list()
        for dataset in datasets:
            nbc = NaiveBayesClassifier(dataset)
            self.classifiers.append(nbc)
        
    def predict(self, test_set):
        sum_probs = 0
        predictions = test_set.apply(lambda x: f(x[:4]), axis=1).to_numpy()

    
        

In [208]:
nbc.predict([1,2,3,4])

test [[1.01615499e-028 1.02917199e-003 2.21118356e-017 1.10939129e-266]
 [1.07387771e-020 6.26298008e-002 2.33170738e-002 3.98989135e-040]
 [1.06694610e-017 1.29321441e-002 1.64439215e-005 8.80657963e-012]]
prob [2.56541398e-314 6.25706943e-063 1.99813890e-035]
class prob [0.33333333 0.33333333 0.33333333]
results [8.55137994e-315 2.08568981e-063 6.66046298e-036]


2

In [143]:
mu = nbc.mean_values
mu

array([[5.006, 3.418, 1.464, 0.244],
       [5.936, 2.77 , 4.26 , 1.326],
       [6.588, 2.974, 5.552, 2.026]])

In [156]:
sd = nbc.std_values
sd

array([[0.35248969, 0.3810244 , 0.17351116, 0.1072095 ],
       [0.51617115, 0.31379832, 0.46991098, 0.19775268],
       [0.63587959, 0.32249664, 0.5518947 , 0.27465006]])

In [167]:
sd[0,0]

0.3524896872134513

In [172]:
p = nbc.gaussian_pdf([1,2,3,4], mu[0,:], sd[0,:])
p

array([1.01615499e-028, 1.02917199e-003, 2.21118356e-017, 1.10939129e-266])

In [173]:
np.prod(p)

2.565413981e-314

In [299]:
def f(x):
    return np.log(x*x)

data = [[5, 10, 5], [5, 15, 9], [5, 14, 19]]

testdf = pd.DataFrame(data, columns = ['Name', 'Age', 'Trt'])

data2 = pd.DataFrame([1,2,3,4,5,6,7])
data3 = pd.DataFrame([7,6,3,4,3,2,7])

(data2 == data3).sum().values

sum(data2[0] == data3[0])

3

In [191]:
arr = 4*[0]
arr[1] = 5
arr

[0, 5, 0, 0]

In [289]:
data = [[5, 10, 5], [5, 15, 9], [5, 14, 19]]

agg = 3*[1]

for i in range(len(data[0])):
    for j in range(len(data)):
        agg[i] *= data[j][i]
    agg[i] = agg[i]**(1/len(data))

agg

[5.0, 12.80579164987494, 9.49121995802933]

In [285]:
27**(1/25)

1.1409183116147532