In [138]:
import numpy as np
import math
from sklearn.model_selection import KFold
import random

In [139]:
#Read the data
file=open('hayes-roth.data','r')
rows=file.read()
rows=rows.split('\n')
#deleting unwanted dat strings
rows.pop()
print(rows[120:])
print('data has been loaded successfully')

['14,1,2,2,1,2', '38,2,1,1,4,3', '71,3,1,2,2,2', '43,3,2,2,4,3', '131,2,3,1,3,1', '17,2,1,1,2,1', '12,3,4,1,3,3', '44,1,1,4,3,3', '40,2,1,2,1,1', '90,1,2,1,2,2', '21,1,2,2,1,2', '9,3,1,1,2,1']
data has been loaded successfully


In [140]:
#data preprocessing
dataset=[]
for row in rows:
    row_data=list(map(int,row.split(',')))
    dataset.append(row_data) 
len_dataset=len(dataset)
print(dataset[120:])
print('data is preprocessed')

[[14, 1, 2, 2, 1, 2], [38, 2, 1, 1, 4, 3], [71, 3, 1, 2, 2, 2], [43, 3, 2, 2, 4, 3], [131, 2, 3, 1, 3, 1], [17, 2, 1, 1, 2, 1], [12, 3, 4, 1, 3, 3], [44, 1, 1, 4, 3, 3], [40, 2, 1, 2, 1, 1], [90, 1, 2, 1, 2, 2], [21, 1, 2, 2, 1, 2], [9, 3, 1, 1, 2, 1]]
data is preprocessed


In [141]:
#separate data by class
def separate_data_by_class(dataset):
    data_seg = {}
    for dt in dataset: 
        cls = dt[-1]
        if cls not in data_seg:
            data_seg[cls] = []
        data_seg[cls].append(dt)
    return data_seg

In [142]:
# data summarization
def summarize_data(dataset):
    summary = [(np.mean(col),np.std(col),len(col)) for col in zip(*dataset)]
    del summary[-1]
    return summary

In [143]:
# data summarizatioinstd by class
def summarizing_data_by_cls(dataset):
    data_seg = separate_data_by_class(dataset)
    data_summaries ={}
    for cls , row in data_seg.items():
        data_summaries[cls] = summarize_data(row)
    return data_summaries

In [144]:
# gaussian implementation
def gaussian(x,mean,std):
    try:
        gaussian=(1/(std*math.sqrt(2*math.pi)))*(math.exp(-(x-mean)**2/(2*std**2)))
        return gaussian
    except:
        return 0.01

In [145]:
# gaussian by class 
def gaussian_by_cls(data_summaries,row):
    probs={}
    no_of_rows= sum([data_summaries[x][0][2] for x in data_summaries])
    for cls,cls_summaries in data_summaries.items():
        probs[cls]= 1
        for i in range(len(cls_summaries)):
            probs[cls]+=math.log(gaussian(row[i],cls_summaries[i][0],cls_summaries[i][1]))
    return probs


In [146]:
# predicting cls row wise
def predict(cls_summaries, row):
    probs=gaussian_by_cls(cls_summaries,row)
    best_cls, best_prob = None ,-1
    for cls, prob in probs.items():
        if best_cls==None or prob>best_prob:
            best_cls,best_prob=cls,prob 
    return best_cls

In [147]:
#acuuracy of the value
def accuracy(actual,predict):
    crct_count= len([i for i,j in zip(actual,predict) if i==j])
    acc=crct_count/float(len(actual))*100
    return acc

In [148]:
#navie_bayes
def navie_bayes(train, test):
    cls_summaries = summarizing_data_by_cls(train)
    predictions=[]
    for r in test:
        predictions.append(predict(cls_summaries,row))
    return predictions

In [149]:
#K_folds
def K_fold_splits(dataset, no_of_splits):
    size_of_fold=int(len(dataset)/no_of_splits)
    copy_dataset= list(dataset)
    folds=[]
    for _ in range(no_of_splits):
        fold=[]
        while len(fold)<size_of_fold:
            r_idx= random.randrange(len(copy_dataset))
            fold.append(copy_dataset.pop(r_idx))
        folds.append(fold)
    folds[-1]+=copy_dataset
    return folds

In [150]:
print('dataset length:',len(dataset))
no_of_folds =10
print('In',no_of_folds,'folds test')
folds =K_fold_splits(dataset, no_of_folds)
accuracies =[]
for fold in folds:
    train = list(folds)
    train.remove (fold)
    train_set = []
    for idx in range(len(train)):
        train_set+= train[idx]
    actual_set= []
    for row in fold:
        actual_set.append(row[-1])
    test_set=[]
    for row in fold:
        test_set.append(row)
    print('length of train data:',len(train_set))
    print('length of test data:',len(test_set))
    predict_set =  navie_bayes(train_set, test_set)
    accuracies.append(accuracy(actual_set,predict_set))
print('model_accuracy:', np.mean(accuracies))

dataset length: 132
In 10 folds test
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 119
length of test data: 13
length of train data: 117
length of test data: 15
model_accuracy: 35.84615384615385
