In [4]:
import csv
import random
import math

def loadCsv(filename) -> [[]]:
    """Load a CSV file into a list of numeric rows.

    The first line of the file is treated as a header and skipped. For every
    remaining record the first column (index 0) is dropped, the value in
    column index 11 is encoded as a class label (``'benign'`` -> 0.0, any
    other value -> 1.0), and every other kept value is converted with
    ``float``. Completely blank lines are ignored so that a trailing newline
    does not produce an empty row.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a kept, non-label cell cannot be converted to float.
    """
    dt = []
    # The context manager guarantees the handle is closed; newline="" is the
    # mode the csv module asks for so it can do its own newline handling.
    with open(filename, "r", newline="") as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header line (no-op on an empty file)
        for record in reader:
            if not record:
                # csv.reader yields [] for a blank line; such a row carries
                # no label and would only break later processing.
                continue
            rows = []
            for c, x in enumerate(record):
                if c == 0:
                    continue  # leading column is not part of the data
                if c == 11:
                    x = 0 if x == 'benign' else 1
                rows.append(float(x))
            dt.append(rows)
    print(('Loaded data file {0} with {1} rows').format(filename, len(dt)))
    return dt


def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training and a test part.

    ``int(len(dataset) * splitRatio)`` rows are drawn without replacement
    (using the module-level ``random`` generator) to form the training set;
    the rows that were not drawn, in their original relative order, form the
    test set. The input list itself is not modified.

    Returns:
        A two-element list ``[trainSet, testSet]``.
    """
    remaining = list(dataset)
    wanted = int(len(dataset) * splitRatio)
    picked = []
    for _ in range(wanted):
        # Pop a uniformly chosen row out of the pool of not-yet-picked rows.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]

def separateByClass(dataset):
    """Group the rows of ``dataset`` by their last element.

    Returns:
        A dict mapping each distinct last-column value to the list of rows
        carrying that value, with rows kept in their original order.
    """
    groups = {}
    for row in dataset:
        # The last element of a row is used as its class label.
        groups.setdefault(row[-1], []).append(row)
    return groups

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises ZeroDivisionError when ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are divided by ``len(numbers) - 1``,
    so a single-element input raises ZeroDivisionError.
    """
    centre = mean(numbers)
    squaredDeviations = sum(pow(value - centre, 2) for value in numbers)
    degreesOfFreedom = float(len(numbers) - 1)
    return math.sqrt(squaredDeviations / degreesOfFreedom)

def summarize(dataset):
    """Compute ``(mean, stdev)`` for every column of ``dataset`` but the last.

    Statistics are computed for all columns and the entry for the final
    column is then discarded.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per remaining column.
    """
    summaries = []
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column)))
    # Drop the statistics of the last column.
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    """Build per-class column statistics for ``dataset``.

    Rows are grouped with ``separateByClass`` and each group is passed to
    ``summarize``.

    Returns:
        A dict mapping each class value to that class's list of
        ``(mean, stdev)`` tuples.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; a value of zero
            raises ZeroDivisionError.

    Returns:
        The density value as a float.
    """
    squaredDistance = math.pow(x - mean, 2)
    twiceVariance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squaredDistance / twiceVariance))
    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class in ``summaries``.

    For each class, the densities returned by ``calculateProbability`` for
    the leading elements of ``inputVector`` (one per ``(mean, stdev)`` entry
    of that class) are multiplied together.

    Returns:
        A dict mapping each class value to the resulting product.
    """
    likelihoods = {}
    for label, featureStats in summaries.items():
        product = 1
        for idx, (mu, sigma) in enumerate(featureStats):
            product *= calculateProbability(inputVector[idx], mu, sigma)
        likelihoods[label] = product
    return likelihoods

def predict(summaries, inputVector):
    """Return the class whose score for ``inputVector`` is highest.

    Scores come from ``calculateClassProbabilities``. On a tie the class
    encountered first is kept; ``None`` is returned when there are no classes.
    """
    winner = None
    winnerScore = -1
    scores = calculateClassProbabilities(summaries, inputVector)
    for label, score in scores.items():
        # Keep the current winner unless there is none yet or it is beaten.
        if winner is not None and not score > winnerScore:
            continue
        winner = label
        winnerScore = score
    return winner

def getPredictions(summaries, testSet):
    """Return the list of ``predict`` results, one per row of ``testSet``."""
    return [predict(summaries, row) for row in testSet]

def accuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    ``predictions[i]`` is compared with the last element of ``testSet[i]``.
    Raises ZeroDivisionError when ``testSet`` is empty.
    """
    hits = sum(
        1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
    )
    return (hits / float(len(testSet))) * 100.0

if (__name__ == '__main__'):
    # Script entry point: load the data, hold out 30% for testing, fit the
    # per-class statistics on the rest and report accuracy on the held-out rows.
    # NOTE: the input path is hard-coded to an absolute location; adjust it
    # before running on another machine.
    dt = loadCsv("/Users/vigneshkumarthangarajan/Documents/255-Data-Mining/homework-1/file.txt")
    train, test = splitDataset(dt, 0.7)
    # Separator added so the two counts no longer run together in the output.
    print('train len: ' + str(len(train)) + ', test len: ' + str(len(test)))
    summ = summarizeByClass(train)
    print(summ)
    predictions = getPredictions(summ, test)
    acc = accuracy(test, predictions)
    print(('Accuracy: {0}%').format(acc))

Loaded data file /Users/vigneshkumarthangarajan/Documents/255-Data-Mining/homework-1/file.txt with 683 rows
train len: 478test len: 205
{0.0: [(1140948.5777777778, 848772.4837441165), (2.9746031746031747, 1.7096548850877868), (1.3111111111111111, 0.8549427498080815), (1.4063492063492065, 0.9341151022028169), (1.3587301587301588, 0.9814578589346687), (2.107936507936508, 0.8145378094905574), (1.3142857142857143, 1.1618754253768468), (2.0984126984126985, 1.0587055044067568), (1.253968253968254, 0.9471420969713158), (1.0634920634920635, 0.4878464267403312)], 1.0: [(1028378.7423312883, 306064.68484477384), (7.141104294478527, 2.5600860822079357), (6.656441717791411, 2.7383431236680766), (6.58282208588957, 2.598232901202647), (5.52760736196319, 3.1958941318000265), (5.3128834355828225, 2.4101674353819873), (7.791411042944786, 3.0132505473101574), (6.061349693251533, 2.2185893959504948), (5.644171779141105, 3.4060240417206273), (2.6748466257668713, 2.581710208328569)]}
Accuracy: 96.5853658536