In [21]:
import numpy as np
import math
from random import randrange

fr = open("project3_dataset2.txt");
stringArr = [line.strip().split('\t') for line in fr.readlines()]
input_data = np.array(stringArr)


In [22]:
input_data.shape

(462, 10)

In [23]:
input_data

array([['132', '6.20', '6.47', ..., '14.14', '45', '0'],
       ['123', '0.05', '4.61', ..., '2.78', '16', '0'],
       ['128', '0.50', '3.70', ..., '22.73', '28', '0'],
       ...,
       ['138', '4.50', '2.85', ..., '24.89', '56', '1'],
       ['170', '7.60', '5.50', ..., '6.17', '54', '1'],
       ['128', '0.00', '10.58', ..., '14.66', '48', '0']], dtype='<U7')

In [24]:

def calculate_mean(numbers):
    return sum(numbers) / float(len(numbers))

def calculate_standard_deviation(numbers):
    avg = calculate_mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def get_map(dataset):
    values = [(calculate_mean(attribute), calculate_standard_deviation(attribute)) for attribute in zip(*dataset)]
    return values

def dictionary(data):
    dict = {}
    for i in range(len(data)):
        vector = data[i]
        float_vector = vector[:-1].astype(np.float)
        if (vector[-1] not in dict):
            dict[vector[-1]] = []
        dict[vector[-1]].append(float_vector)
    return dict

def get_class_value(data):
    values = {}
    map = dictionary(data)
#             print(map)
    for classValue, instances in map.items():
        values[classValue] = get_map(instances)
    return values

def get_continuous_val(trainingdata,index):
        if len(index) > 0:
            for i in range(len(index)):
                trainingdata = np.delete(trainingdata,index[i],axis=1)
        value = get_class_value(trainingdata)
        return value

In [25]:
def get_continuous_probability(testdata,summary,index):
    if len(index) > 0:
        for i in range(len(index)):
            testdata = np.delete(testdata,index[i],axis=1)

    testdata = testdata[:,:-1].astype(np.float)
    continuous_matrix = []
    
    def classwiseprobability(input_vec, summary):
        probability = {}
        for classval, classSummary in summary.items():
            probability[classval] = 1
            for i in range(len(classSummary)):
                mean, standard_deviation = classSummary[i]
                x = input_vec[i]
                probability[classval] *= calculateProbability(x, mean, standard_deviation)
        return probability

    def calculateProbability(x, mean, standard_deviation):
        exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(standard_deviation, 2))))
        return (1 / (math.sqrt(2 * math.pi) * standard_deviation)) * exponent

    for i in range(len(testdata)):
        getprob = []
        input_vec = testdata[i]
        prob_map = classwiseprobability(input_vec,summary)
        getprob.append(prob_map['0'])
        getprob.append(prob_map['1'])
        continuous_matrix.append(getprob)

    return continuous_matrix


In [26]:
def get_categorical_val(trainingdata,testdata,index):
    if len(index) > 0 :
        final_catmatrix = 1
        for j in range(len(index)):
            train_data = trainingdata[:,index[j]]
            label_train = trainingdata[:,-1]
            dict = {}
            for i in range(len(label_train)):
                if label_train[i] not in dict:
                    dict[label_train[i]] = []
                dict[label_train[i]].append(train_data[i])
            cat_matrix1,prior = get_categorical_probability(testdata,dict,index[j])
            final_catmatrix *= cat_matrix1
        final_catmatrix = np.multiply(final_catmatrix,prior)
        return final_catmatrix
    else: return 1

def get_categorical_probability(testdata,summary,index):
    label = testdata[:,-1]
    testdata = testdata[:,index]
    categorical_matrix = []
    def get_classwise_probability(summary,input_vector,label):
        prob1 = {}
        for classval, data1 in summary.items():
            prob1[classval] = 1
            if classval not in prob1:
                prob1[classval] = []
            prior = len(summary[classval])/len(label)
            posterior = data1.count(input_vector) / len(summary[classval])
            prob1[classval] *= posterior
        return prob1,prior

    for i in range(len(testdata)):
        getprob = []
        input_vector = testdata[i]
        prob,prior = get_classwise_probability(summary,input_vector,label)
        getprob.append(prob['0'])
        getprob.append(prob['1'])
        categorical_matrix.append(getprob)

    return categorical_matrix,prior


In [27]:
def getindex(data):
    index = []
    for i in range(data.shape[1]):
        try:
            data[0,i].astype(np.float)
        except ValueError:
            index.append(i)
    return index

def getpredictedlabels(x):
    predicted_labels = []
    for i in range(len(x)):
        for j in range(len(x[i])):
            if x[i][j] == np.max(x[i]):
                predicted_labels.append(j)

    return predicted_labels

In [28]:
def naivebayes(trainingdata,testingdata):
    print(trainingdata)

    x = getindex(trainingdata)
#     print(x)
    value = get_continuous_val(trainingdata, x)
#     print(value)
    continuous_matrix = get_continuous_probability(testdata, value, x)
    print(continuous_matrix)
    get_categorical_value = get_categorical_val(trainingdata, testdata, x)
    predicted_matrix = continuous_matrix * get_categorical_value
    predicted_labels = getpredictedlabels(predicted_matrix)

    return predicted_labels

def calculate_performance_metrics(actual_values, predicted_values):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for i in range(len(actual_values)):

        if actual_values[i] == 1 and predicted_values[i] == 1:
            tp = tp + 1
        if actual_values[i] == 1 and predicted_values[i] == 0:
            fn = fn + 1
        if actual_values[i] == 0 and predicted_values[i] == 1:
            fp = fp + 1
        if actual_values[i] == 0 and predicted_values[i] == 0:
            tn = tn + 1
    precision = 0
    recall = 0
    F1 = 0
    accuracy = 0

    if tp + fp != 0:
        precision = tp / float(tp + fp)
    if tp + fn != 0:
        recall = tp / float(fn + tp)
    if tp + tn + fp + fn != 0:
        accuracy = (tp + tn) / float(tp + tn + fp + fn)
    if precision + recall != 0:
        F1 = (2 * precision * recall) / float(precision + recall)

    precision = precision * 100
    recall = recall * 100
    accuracy = accuracy * 100
    F1 = F1 * 100

    return {'precision': precision, 'recall': recall, 'accuracy': accuracy, 'F1': F1}

def convert_labels(actual_labels):
    final_actual_labels = []
    for i in range(len(actual_labels)):
        final_actual_labels.append(int(actual_labels[i]))

    return final_actual_labels

def naive_bayes_demo(dataset,testdata):
    print(testdata)
    def getindex(dataset):
        index = []
        for i in range(dataset.shape[1]):
            try:
                dataset[0,i].astype(np.float)
            except ValueError:
                index.append(i)
        return index

    traindata = dataset
    index = getindex(dataset)
    print(index)
    labels = list(dataset[:,-1])

    prior_0 = labels.count("0")/len(labels)
    prior_1 = labels.count("1")/len(labels)


    denominator = 1
    for i in range(len(testdata)):
        val = list(dataset[:,i])
        print(testdata[i])
        print(val)
        num = val.count(testdata[i])
        print(num)
        den = num/len(dataset)
        denominator = denominator*den
    post_0 = 1
    post_1 = 1
    for i in range(len(index)):
        train_data = traindata[:,index[i]]
        label_train = traindata[:, -1]
        dict = {}
        for j in range(len(label_train)):
            if label_train[j] not in dict:
                dict[label_train[j]] = []
            dict[label_train[j]].append(train_data[j])
        posterior_0 = dict['0'].count(testdata[i])/len(dict['0'])
        posterior_1 = dict['1'].count(testdata[i]) / len(dict['1'])
        post_0 = post_0 * posterior_0
        post_1 = post_1 * posterior_1
    print(prior_0)
    print(prior_1)
    print(post_0)
    print(post_1)
    prob_list = []
    prob_0 = (post_0*prior_0)/denominator
    prob_1 = (post_1*prior_1)/denominator
    prob_list.append(prob_0)
    prob_list.append(prob_1)
    val = prob_list.index(max(prob_list))

#     print("probability for class 0 : ", prob_0)
#     print("probability for class 1 : ", prob_1)
    print("X : ", testdata)
    print("p(H0|X) : ", prob_0)
    print("p(H1|X) : ", prob_1)
    print("The input will get classified to class : ", val)

    return prob_0,prob_1,val


In [250]:
testdata = ['sunny', 'cool', 'high','weak']
naive_bayes_demo(input_data, testdata)

['sunny', 'cool', 'high', 'weak']
[4]
sunny
['132', '123', '128', '114', '150', '136', '144', '134', '126', '164', '178', '136', '128', '146', '112', '174', '162', '216', '142', '160', '134', '138', '178', '118', '128', '176', '170', '146', '132', '150', '156', '138', '168', '168', '112', '110', '132', '136', '128', '108', '134', '130', '128', '130', '126', '114', '136', '148', '118', '116', '140', '138', '158', '140', '152', '136', '128', '128', '132', '162', '118', '132', '134', '124', '153', '122', '130', '134', '154', '154', '117', '124', '134', '148', '158', '154', '164', '114', '130', '134', '164', '142', '134', '136', '130', '136', '174', '132', '132', '132', '142', '146', '118', '128', '127', '158', '148', '128', '148', '120', '138', '116', '150', '134', '123', '160', '138', '121', '158', '168', '138', '114', '130', '156', '126', '128', '134', '124', '190', '124', '128', '130', '132', '126', '128', '160', '166', '124', '156', '130', '124', '200', '148', '120', '176', '124', '13

ZeroDivisionError: float division by zero

In [30]:
kfold_validation = 10
initial_set = 0

total_sets = len(input_data) // kfold_validation
remaining_sets = len(input_data) % kfold_validation
accuracy = []
precision = []
recall = []
fmeasure = []

for i in range(0,kfold_validation):

    testdata = input_data
    trainingdata = input_data
    test_labels = testdata[:,-1]
    final_set = initial_set + total_sets

    if i == kfold_validation-1:
        final_set = final_set + remaining_sets

    testdata = testdata[initial_set:final_set]
    test_labels = test_labels[initial_set:final_set]
    trainingdata = np.delete(trainingdata, np.s_[initial_set:final_set], axis=0)

    predicted_labels = naivebayes(trainingdata, testdata)
    actual = convert_labels(test_labels)
    metrics_dict = calculate_performance_metrics(actual, predicted_labels)
    
    print("\n ***  Fold "+str(i+1)+" Performance:***************")
    print('Accuracy:',metrics_dict['accuracy'], 'Precision:', metrics_dict['precision'],'Recall:', metrics_dict['recall'], 'F1-Score:', metrics_dict['F1'] ,'\n')

    recall.append(metrics_dict['recall'])
    precision.append(metrics_dict['precision'])
    fmeasure.append(metrics_dict['F1'])
    accuracy.append(metrics_dict['accuracy'])

    initial_set = final_set

mean_accuracy = np.mean(accuracy)
mean_precision = np.mean(precision)
mean_recall = np.mean(recall)
mean_fmeasure = np.mean(fmeasure)

print("Mean accuracy is : ",mean_accuracy)
print("Mean precision is : ",mean_precision)
print("Mean recall is : ",mean_recall)
print("Mean fmeasure is : ",mean_fmeasure)

[['136' '6.60' '6.08' ... '2.72' '49' '0']
 ['148' '5.50' '7.10' ... '3.60' '48' '0']
 ['118' '0.00' '3.89' ... '0.00' '16' '0']
 ...
 ['138' '4.50' '2.85' ... '24.89' '56' '1']
 ['170' '7.60' '5.50' ... '6.17' '54' '1']
 ['128' '0.00' '10.58' ... '14.66' '48' '0']]
[[1.1962439385947202e-12, 5.0403008627415286e-12], [2.904208118817051e-12, 4.236175476903895e-15], [1.9015022974445637e-12, 3.092055500595684e-14], [8.182930912548609e-13, 1.9664870690949102e-12], [2.5341173658184626e-12, 5.5511494494444165e-12], [2.541226192474397e-13, 1.0428269967296292e-12], [1.6379157296078963e-15, 9.269415724104541e-14], [7.056931590249152e-14, 5.982647621368937e-13], [9.377177986027666e-13, 3.553185318057814e-12], [1.1407808902606553e-12, 2.9951350845346893e-12], [1.372497435486478e-20, 3.496168521416386e-15], [1.1786183825055063e-11, 6.533655892826011e-13], [4.877507641999702e-12, 1.1711237288062618e-12], [1.8147873876671006e-11, 7.506607670515785e-12], [8.03583728758018e-13, 5.042767306525589e-13], 