In [21]:
import numpy as np
from scipy.spatial import distance
import pandas as pd
from scipy import stats
import operator
from sklearn.model_selection import KFold
from statistics import mean

# Print floats in fixed-point form instead of scientific (exponential) notation.
np.set_printoptions(suppress=True)

def read_data(file):
    """Load a tab-separated dataset and encode every field as a float.

    Fields that parse with ``float()`` are kept as numbers. Any other field
    is treated as a nominal (categorical) value and replaced by its index in
    that column's list of categories, in order of first appearance.

    Args:
        file: Path of the tab-separated text file to read.

    Returns:
        A tuple ``(nominal, matrix)`` where ``nominal`` maps a column index
        to the list of category strings seen in that column, and ``matrix``
        is a float NumPy array with one row per non-blank line. An empty
        file yields an empty mapping and an empty array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the rows do not all have the same number of fields
            (raised by NumPy when building the array).
    """
    nominal = dict()
    all_genes_list = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(file) as gene_data:
        for line in gene_data:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing newline) hold no record.
                continue
            gene = stripped.split("\t")
            for f, raw in enumerate(gene):
                try:
                    gene[f] = float(raw)
                except ValueError:
                    # Create the category list lazily so a column whose first
                    # row happened to look numeric does not raise KeyError.
                    category = nominal.setdefault(f, [])
                    if raw not in category:
                        category.append(raw)
                    gene[f] = float(category.index(raw))
            all_genes_list.append(gene)
    return nominal, np.asarray(all_genes_list, dtype=float)

def compute_naive_bayes(train_data, test_data, train_classes, nominal_dict):
    """Classify a single record with a naive Bayes model built on the fly.

    Nominal features use the relative frequency of the record's category
    within each class (no smoothing is applied). All other features use a
    normal density parameterised by the per-class mean and standard
    deviation. Intermediate counts, the normalised posteriors and the
    predicted class are printed.

    Args:
        train_data: Sequence of training rows holding float-encoded features.
        test_data: One record to classify. Nominal features are given as
            their category string; numeric features may be numbers or
            numeric strings.
        train_classes: Class label for each training row.
        nominal_dict: Maps the index of each nominal feature to its list of
            category strings; a category's position is its encoded value.

    Returns:
        The label with the largest class score, or ``-1`` when no training
        labels were supplied.

    Raises:
        ValueError: If a numeric feature of the record cannot be converted
            to ``float``.
    """
    # Group the training rows by class label.
    class_dict = dict()
    for i, label in enumerate(train_classes):
        class_dict.setdefault(label, []).append(train_data[i])

    test_record = test_data
    total_rows = len(train_data)
    max_prob = -1
    predicted_class = -1
    prob_dict = dict()
    den = 0.0
    for k, rows in class_dict.items():
        # Build the class sub-matrix once instead of once per feature.
        members = np.asarray(rows)
        class_size = len(rows)
        m = np.mean(members, axis=0)
        sigma = np.std(members, axis=0)
        prob = 1.0
        for feature in range(len(test_record)):
            value = test_record[feature]
            if feature in nominal_dict:
                categories = nominal_dict[feature]
                try:
                    code = categories.index(value)
                except ValueError:
                    # A category never seen in training has zero frequency.
                    count = 0
                else:
                    count = int(np.count_nonzero(members[:, feature] == code))
                print("count for feature: ", value, " is: ", count)
                prob *= count / class_size
            else:
                # The record may arrive as text (e.g. split user input), so
                # convert before evaluating the density.
                # NOTE(review): a feature that is constant within a class has
                # sigma == 0, which likely makes the density undefined.
                prob *= stats.norm.pdf(
                    float(value), loc=m[feature], scale=sigma[feature]
                )
        # Weight the likelihood by the class prior.
        prob *= class_size / total_rows
        prob_dict[k] = prob
        den += prob
        if prob > max_prob:
            max_prob = prob
            predicted_class = k

    for c, joint in prob_dict.items():
        # With no smoothing every class can score 0; avoid dividing by zero.
        posterior = joint / den if den else 0.0
        print("Posterior probability for class given test record: ", int(c), " is: ", posterior)
    print("predicted class for test record is: ", int(predicted_class))
    return predicted_class
def calculate_metrics(predicted_classes, ground_truth):
    """Compute binary-classification metrics with 1 as the positive class.

    Each pair is counted as a true positive (1, 1), false positive (1, 0) or
    false negative (0, 1); every other combination counts as a true negative.

    Args:
        predicted_classes: Sequence of predicted labels.
        ground_truth: Sequence of true labels, indexed in step with
            ``predicted_classes``.

    Returns:
        A tuple ``(accuracy, precision, recall, f_1_measure)``. A metric
        whose denominator is zero (including all metrics for empty input)
        is reported as ``0.0``.

    Raises:
        IndexError: If ``ground_truth`` is shorter than ``predicted_classes``.
    """
    tp = fp = tn = fn = 0
    for i, predicted in enumerate(predicted_classes):
        actual = ground_truth[i]
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1
        else:
            tn += 1

    total = tp + tn + fp + fn
    # Default each metric to 0.0 so it is always bound, even when the
    # corresponding denominator is zero.
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1_denominator = (2 * tp) + fp + fn
    f_1_measure = (2 * tp) / f1_denominator if f1_denominator else 0.0
    return accuracy, precision, recall, f_1_measure

# Load the dataset: the last column is the class label, the rest are features.
nominal_dict, data = read_data("project3_dataset4.txt")
classes = data[:, -1]
feature_data = data[:, :-1]

# Interactive mode: train on the whole dataset and classify one typed record.
train_data = feature_data
train_classes = classes
print(len(train_classes))
test_data = input("Enter test record(comma seperated): ")
# Strip each field so that input such as "sunny, cool" still matches the
# stored category strings.
test_record = [field.strip() for field in test_data.split(",")]
predicted_classes = compute_naive_bayes(train_data, np.asarray(test_record), train_classes, nominal_dict)

# Disabled 10-fold cross-validation, kept for reference.
# NOTE(review): compute_naive_bayes currently scores a single record, so this
# loop would have to call it once per test row before being re-enabled.
# kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# accuracy_list = []
# precision_list = []
# recall_list = []
# f_1_measure_list = []
# for train, test in kfold.split(feature_data):
#     train_data = feature_data[train]
#     test_data = feature_data[test]
#     train_classes = classes[train]
#     test_classes = classes[test]
#     predicted_classes = [
#         compute_naive_bayes(train_data, record, train_classes, nominal_dict)
#         for record in test_data
#     ]
#     accuracy, precision, recall, f_1_measure = calculate_metrics(predicted_classes, test_classes)
#     accuracy_list.append(accuracy)
#     precision_list.append(precision)
#     recall_list.append(recall)
#     f_1_measure_list.append(f_1_measure)
# print("Accuracy: ", mean(accuracy_list))
# print("Precision: ", mean(precision_list))
# print("Recall: ", mean(recall_list))
# print("F_1_measure: ", mean(f_1_measure_list))

14
Enter test record(comma seperated): sunny,cool,high,weak
count for feature:  sunny  is:  3
count for feature:  cool  is:  1
count for feature:  high  is:  4
count for feature:  weak  is:  2
count for feature:  sunny  is:  2
count for feature:  cool  is:  3
count for feature:  high  is:  3
count for feature:  weak  is:  6
Posterior probability for class given test record:  0  is:  0.5644599303135889
Posterior probability for class given test record:  1  is:  0.4355400696864111
predicted class for test record is:  0
