In [1]:
class NaiveBayes:
    def __init__(self, dataset, attr):
        self.dataset = dataset
        self.attr = attr
        self.classGroups = self.dataset.groupby('class').size()
        self.nClass = self.dataset['class'].unique()
        self.nProb = {}
        self.d = 0
        for attr in self.attr:
            self.d += len(self.dataset[attr].unique())
            
    def calcProb(self, column, name, className):
        if (name, className) in self.nProb:
            return self.nProb[(name,className)]
        group = self.dataset.groupby(['class', column]).size()
        try:
            x = (float(group[className][name])+1) / (float(self.classGroups[className]) + self.d)
        except:
            x = 1/(float(self.classGroups[className]) + self.d)
        self.nProb[(name, className)] = x
        return x
    
    def createDict(self, name):
        if name in self.nProb:
            return self.nProb[name]
        x = float(self.classGroups[name]) / float(len(self.dataset))
        self.nProb[name] = x
        return x
    
    def test(self, row):
        maxPP = 0
        res = None
        for name in self.nClass:
            tmp = self.createDict(name)
            for attr in self.attr:
                tmp *= self.calcProb(attr, row[attr], name)
            if tmp > maxPP:
                maxPP = tmp
                res = name
        return res
                

In [3]:
import pandas
from sklearn.model_selection import KFold
import random,time

random.seed(time.time())

car = "car.txt"
car_names = ['buying','maint','doors','persons','lug_boot','safety','class']

chess = "kr-vs-kp.data.txt"
chess_names = ['bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq', 'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp', 'dwipd', 'hdchk', 'katri', 'mulch', 'qxmsq', 'r2ar8', 'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl', 'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk', 'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg', 'class']

connect = "connect-4.txt"
connect_names = ['a1','a2','a3','a4','a5','a6','b1','b2','b3','b4','b5','b6','c1','c2','c3','c4','c5','c6','d1','d2','d3','d4','d5','d6','e1','e2','e3','e4','e5','e6','f1','f2','f3','f4','f5','f6','g1','g2','g3','g4','g5','g6','class']

mushroom = "agaricus-lepiota.txt"
mushroom_names = ['class','cap-shape','cap-surface','cap-color','bruises?','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring ','stalk-color-below-ring ','veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat']

nursery = "nursery.txt"
nursery_names = ['parents','has_nurs','form','children','housing','finance','social','health','class']

abalone = "abalone.txt"
abalone_names = ['class','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']

balance = "balance.txt"
balance_names = ['class','Left-Weight','Left-Distance','Right-Weight','Right-Distance']

dataset = connect
nameSet = connect_names

dataset = pandas.read_csv(dataset, names = nameSet)
# dataset = dataset.head(1000)
dataset_split = KFold(n_splits=10,random_state=random.randint(1,100),shuffle=True)
count = 1

avg = 0
for train, test in dataset_split.split(dataset):
    newNames = nameSet[:]
    newNames.remove('class')
    tree = NaiveBayes(dataset.iloc[train], newNames)
    testSet = dataset.iloc[test]
    valid, invalid, total = 0,0,0
    for index, row in testSet.iterrows():
        total += 1
        if tree.test(row) == row['class']:
            valid += 1
        else:
            invalid += 1
    
    print(str(count)+".Accuracy: "+str((valid/float(total))*100))
    avg += ((valid/float(total))*100)
    count += 1
    
avg /= count-1
print("@@@@@@@@@@@@@@@@@@@@@@@@@\nAverage Accuracy: "+str(avg))

1.Accuracy: 66.63706335109532
2.Accuracy: 24.40793368857312


KeyboardInterrupt: 