In [23]:
import numpy as np
from scipy.spatial import distance
import pandas as pd
from scipy import stats
import operator
from sklearn.model_selection import KFold
from statistics import mean

np.set_printoptions(suppress=True) #prevent numpy exponential

def read_data(file):
    nominal = dict()
    gene_data = open(file)
    gene_seq = gene_data.readlines()
    all_genes_list = []
    first_row = gene_seq[0].split("\t")
    for f in range(len(first_row)):
        try:
            float(first_row[f])
        except:
            nominal[f] = []
    for line in gene_seq:
        gene = line.strip().split("\t")
        for f in range(len(gene)):
            try:
                gene[f] = float(gene[f])
            except:
                category = nominal[f]
                if gene[f] in category:
                    gene[f] = float(category.index(gene[f]))
                else:
                    category.append(gene[f])
                    gene[f] = float(category.index(gene[f]))
        all_genes_list.append(gene)
    return nominal, np.asarray(all_genes_list, dtype = float)

def compute_naive_bayes(train_data, test_data, train_classes, nominal_dict):
    class_dict = dict()
    stat_dict = dict()
    predicted_classes = []
    for i in range(len(train_classes)):
        class_dict.setdefault(train_classes[i],[]).append(train_data[i])
    for k in class_dict:
        mean = np.mean(class_dict[k], axis = 0)
        std = np.std(class_dict[k], axis = 0)
        stat_dict[k] = (mean, std)
    
    for i in range(len(test_data)):
        test_record = test_data[i]
        max_prob = -1
        label = -1
        for k in class_dict:
            prob = 1.0
            m = stat_dict[k][0]
            sigma = stat_dict[k][1]
            for feature in range(len(test_record)):
                if feature in nominal_dict:
                    print()
                else:
                    prob *= stats.norm(m[feature], sigma[feature]).pdf(test_record[feature])
            prob *= len(class_dict[k])/len(train_data)
            if prob > max_prob:
                max_prob = prob
                label = k
        predicted_classes.append(label)
    print(predicted_classes)
            
            

nominal_dict, data = read_data("project3_dataset2.txt")
classes = data[:,len(data[0])-1]
feature_data = data[:,:len(data[0])-1]
kfold = KFold(2, True, 1)
accuracy_list = []
precision_list = []
recall_list = []
f_1_measure_list = []
for train, test in kfold.split(feature_data):
    train_data = feature_data[train]
    test_data = feature_data[test]
    train_classes = classes[train]
    test_classes = classes[test]
    print(len(train_classes), len(test_classes))
    predicted_classes = compute_naive_bayes(train_data, test_data, train_classes, nominal_dict)
#     accuracy, precision, recall, f_1_measure = calculate_metrics(predicted_classes, test_classes)
#     accuracy_list.append(accuracy)
#     precision_list.append(precision)
#     recall_list.append(recall)
#     f_1_measure_list.append(f_1_measure)
# print("Accuracy: ", mean(accuracy_list))
# print("Precision: ", mean(precision_list))
# print("Recall: ", mean(recall_list))
# print("F_1_measure: ", mean(f_1_measure_list))

231 231














































































































































































































































































































































































































































































[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0,