In [1]:
import numpy as np
from math import log

In [2]:
filename = "./spambase(1).data"
data = np.loadtxt(filename, delimiter=",")

import random

np.random.seed(0)
np.random.shuffle(data)

a= data.shape

X= data[:,:-1]
Y= data[:, -1]



X_train = X[:(round(2/3*a[0]))]
X_test = X[(round(2/3*a[0])):]
Y_train = Y[:(round(2/3*a[0]))]
Y_test = Y[(round(2/3*a[0])):]

print(np.shape(X_train))

YTrain = np.reshape(Y_train, (-1,1))
YTest = np.reshape(Y_test, (-1,1))

#Standardize the X_train
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0, ddof=1)
std_X_train = (X_train-mean)/std

#Standardize the X_test
std_X_test =  (X_test-mean)/std

Train = np.hstack((std_X_train, YTrain))
Test = np.hstack((std_X_test, YTest))

(3067, 57)


In [3]:
# make binary data of Train

std_X_train = np.where(std_X_train <= 0, 0, std_X_train)
std_X_train = np.where(std_X_train > 0, 1, std_X_train)

training_data = np.hstack((std_X_train, YTrain))

In [4]:
# make binary data of Testing

std_X_test = np.where(std_X_test <= 0, 0, std_X_test)
std_X_test = np.where(std_X_test > 0, 1, std_X_test)

testing_data = np.hstack((std_X_test, YTest))

In [5]:
def YCounts(data):
    Y_count = {}  
    for i in data:
        Y = i[-1]
        if Y not in Y_count:
            Y_count[Y] = 0
        Y_count[Y] += 1
    return Y_count

In [6]:
print(YCounts(training_data))

{1.0: 1237, 0.0: 1830}


In [7]:
def entropy(data):
    counts = YCounts(data)
    entropy = 1
    for i in counts:
        P = counts[i] / float(len(data))
        entropy -= P * log(P,2)
    return entropy

In [8]:
entropy(training_data)

1.9728628061025462

In [9]:
def left_right(data, attri, value):
    left =[]
    right =[]
    for i in data:
        if i[attri] >= value:
            left.append(i) 
        else:
            right.append(i)

    return left, right

In [10]:
def IG(data, attri, value):

    left = left_right(data, attri, value)[0]
    right = left_right(data, attri, value)[1]
    
    p = float(len(left)) / (len(left) + len(right))

    return ((entropy(data) - p * entropy(left) - (1 - p) * entropy(right)),left,right)

In [11]:
class DTL:
    attribute = None 
    value = None 
    left = None
    right = None
    IG = IG
    Y_count = None


    def __init__(self, attribute = None, value = None, left = None, right = None, IG=IG, Y_count = None):
        self.attribute = attribute
        self.value = value
        self.left = left
        self.right = right
        self.Y_count = Y_count
        self.IG = IG

    def tree_building(self, data):
        best_info_gain = 0
        best_attri = None
        best_left_right = None
        
        for attri in range(len(X[0])):
            for i in [0,1]:
                info_gain = self.IG(data, attri, i)[0]
                L = self.IG(data, attri, i)[1]
                R = self.IG(data, attri, i)[2]
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_attri = (attri, i)
                    best_left_right = (L, R)

        if best_info_gain > 0:
            left = self.tree_building(best_left_right[0])
            right = self.tree_building(best_left_right[1])
            return DTL(attribute = best_attri[0], value = best_attri[1], left = left, right = right)
        else:
            return DTL(Y_count = YCounts(data))
        
    
    def classifier(self, decision_tree, testing):
        
        if decision_tree.Y_count == None:
            node = None            
            if testing[decision_tree.attribute] >= decision_tree.value: 
                node = decision_tree.left
            else : node = decision_tree.right  
            return self.classifier(node, testing)
            
        else:   
            return list(decision_tree.Y_count.keys())[0]

In [12]:
trained_decision_tree = DTL().tree_building(training_data)

In [13]:
Y_predicted = []
for i in testing_data:
    kk = DTL().classifier(trained_decision_tree, i)
    Y_predicted.append(kk)

In [14]:
Y_predicted = np.array(Y_predicted)
Y_predicted =np.reshape(Y_predicted, (-1,1))
Y_test_Y_predicted = np.hstack((YTest, Y_predicted))

In [15]:
TP=0
FP=0
FN=0
TN=0
for a,b in Y_test_Y_predicted:
    if a==0 and b==0:
        TP+=1
    elif a==0 and b==1:
        FN+=1
    elif a==1 and b==0:
        FP+=1
    else:
        TN+=1


print("TP: ", TP)
print("FP: ", FP)
print("FN: ", FN)
print("TN: ", TN)

Accuracy = (TP+TN)/(TP+FP+FN+TN)
Precision = TP/(TP+FP)
Recall = TP/(TP+FN)
F_measure = (2*Precision*Recall)/(Precision+Recall)

print("Precision: ", Precision)
print("Recall: ", Recall)
print("F_measure: ", F_measure)
print("Accuracy: ", Accuracy)

TP:  866
FP:  57
FN:  92
TN:  519
Precision:  0.9382448537378115
Recall:  0.9039665970772442
F_measure:  0.9207868155236577
Accuracy:  0.9028683181225554
