In [1]:
from math import log
from collections import defaultdict
import json
import pprint
import pandas as pd
import numpy as np

In [6]:

def calculateEntropy(dataset):
    counter= defaultdict(int)   # number of unique labels and their frequency
    for record in dataset:      
        label = record[-1]      # always assuming last column is the label column 
        counter[label] += 1
    entropy = 0.0
    for key in counter:
        probability = counter[key]/len(dataset)           # len(dataSet) = number of entries   
        entropy -= probability * log(probability,2)       # log base 2
    return entropy

def splitDataset(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]     # chop out axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataset):
    baseEntropy = calculateEntropy(dataset)
    bestInfoGain = 0.0; bestFeature = -1
    
    numFeat = len(dataset[0]) - 1          # do not include last label column     
    for indx in range(numFeat):            # iterate over all the features index
        featValues = {record[indx] for record in dataset}     # put feature values into a set
        featEntropy = 0.0
        for value in featValues:
            subDataset = splitDataset(dataset, indx, value)      # split based on feature index and value
            probability = len(subDataset)/float(len(dataset))
            featEntropy += probability * calculateEntropy(subDataset) # sum Entropy for all feature values

        infoGain = baseEntropy - featEntropy    # calculate the info gain; ie reduction in Entropy
        if infoGain > bestInfoGain:             # compare this to the best gain so far
            bestInfoGain = infoGain             # if better than current best, set it to best
            bestFeature = indx
    return bestFeature                          # return an best feature index


def createTree(dataset, features):
    labels = [record[-1] for record in dataset]
    
    # Terminating condition #1
    if labels.count(labels[0]) == len(labels):   # stop splitting when all of the labels are same
        return labels[0]            
    # Terminating condition #2
    if len(dataset[0]) == 1:                     # stop splitting when there are no more features in dataset
        mjcount = max(labels,key=labels.count)   # select majority count
        return (mjcount) 
    
    bestFeat = chooseBestFeatureToSplit(dataset)
    bestFeatLabel = features[bestFeat]
    featValues = {record[bestFeat] for record in dataset}     # put feature values into a set
    subLabels = features[:]             # make a copy of features
    del(subLabels[bestFeat])            # remove bestFeature from labels list
    
    myTree = {bestFeatLabel:{}}         # value is empty dict
    for value in featValues:
        subDataset = splitDataset(dataset, bestFeat, value)
        subTree = createTree(subDataset, subLabels)
        myTree[bestFeatLabel].update({value: subTree})  # add (key,val) item into empty dict
    return myTree                            


def predict(inputTree, features, testVec):
    
    def classify (inputTree, testDict):
        (key, subtree), = inputTree.items()
        testValue = testDict.pop(key)
        if len(testDict) == 0:
            return subtree[testValue]
        else:
            return classify(subtree[testValue], testDict)
            
    testDict = dict(zip(features, testVec))
    return classify(inputTree, testDict)
    

def pprintTree(tree):
    pprint.pprint (tree)
    tree_str = json.dumps(tree, indent=4)
    tree_str = tree_str.replace("\n    ", "\n")
    tree_str = tree_str.replace('"', "")
    tree_str = tree_str.replace(',', "")
    tree_str = tree_str.replace("{", "")
    tree_str = tree_str.replace("}", "")
    tree_str = tree_str.replace("    ", " | ")
    tree_str = tree_str.replace("  ", " ")    
    print (tree_str)


def createDataset():
    dataset = [[1, 1, 'yes'], [1, 1, 'yes'],
               [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no'],
               [1, 1, 'maybe'], [0, 0, 'maybe']]
    
    features = ['non-surfacing','flippers']
    label = ['isfish']
    return dataset, features


def roughthery(data, para):
    all = data
    all_1 = all[all['readmitted']==0].drop(['readmitted'],axis = 1)
    all_2 = all[all['readmitted']==1].drop(['readmitted'], axis = 1)
    all_1 = all_1.to_numpy()
    all_2 = all_2.to_numpy()
    score = np.zeros(all_1.shape[1])    
    for i in all_1:
        
        score = (~(i==all_2)).sum(0) + score
    
    select = score>((all_1.shape[0]*all_2.shape[0])/para)
    select = np.append(select, True)
    data = data.loc[:, select]
    return data
    
'''    
def main():

    data = pd.read_csv('all.csv')
    data = roughthery(data, 2)
    
    dataset = np.array(data).tolist()
    features = data.columns.tolist()
    print(data.shape)
    tree = createTree(dataset, features)
    pprintTree (tree) 
    
    #testVectors = [(0,0), (0,1),(1,0),(1,1)]
    #for vec in testVectors:
        #pred = predict(tree, features, vec)
        #print (pred, end =',')


main()
'''

"    \ndef main():\n\n    data = pd.read_csv('all.csv')\n    data = roughthery(data, 2)\n    \n    dataset = np.array(data).tolist()\n    features = data.columns.tolist()\n    print(data.shape)\n    tree = createTree(dataset, features)\n    pprintTree (tree) \n    \n    #testVectors = [(0,0), (0,1),(1,0),(1,1)]\n    #for vec in testVectors:\n        #pred = predict(tree, features, vec)\n        #print (pred, end =',')\n\n\nmain()\n"

In [9]:
from sklearn.model_selection import train_test_split

In [8]:
data = pd.read_csv('all.csv')
data = roughthery(data, 2)

train, test = train_test_split(data, test_size = 0.3, random_state = 30)




In [9]:
dataset = np.array(data).tolist()
features = data.columns.tolist()
print(data.shape)
tree = createTree(dataset, features)

testVectors = np.array(test)
i = 1
for vec in testVectors:
    i = i+1
    pred = predict(tree, features, vec)
    print (pred, end =',')
    if i == 10:
        break

(101763, 13)


AttributeError: 'int' object has no attribute 'items'

In [5]:
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
X, y = iris.data, iris.target
#clf = tree.DecisionTreeClassifier()
#clf = clf.fit(X, y)

In [9]:
data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)],axis = 1)

In [14]:
data.columns = ['f1','f2','f3','f4','target']

In [40]:
data

Unnamed: 0,f1,f2,f3,f4,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [62]:

from sys import exit, argv
import time

try: # attempt to import dependencies
    import pandas as pd # used to containerize examples
    import numpy as np # used for some mathematics
except ImportError: # if not found
    print("\nModules could not be loaded.")
    print("Ensure both `pandas` and `numpy` are installed before execution.\n")
    exit(1)


    
def id3(df, t, f):

    root, ig = {}, {} # root node, IG dict
    attr = df.columns.drop(t) # get attribute set of df
    for a in attr:
        ig[a] = find_information_gain(df, t, a) # find IG of attr
    highest_ig = max(ig, key=lambda key: ig[key]) # return key of highest val
    s = make_split(df, highest_ig) # find splits on highest IG attr
    root = {highest_ig:{}} # found root for further branches	
    for v in s.keys(): # for each outcome of root
        df_branch = df.where(df[highest_ig] == v).dropna() # remove root node
        # if entropy of potential branch is zero, all outcomes same = term leaf
        if find_entropy(df_branch[t]) == 0:
              # add leaf branch
            root[highest_ig][v] = s[v][t].value_counts().idxmax()
        else: # otherwise branch has further subbranches = decision
            if len(attr) - 1 == 0: # if no more attr to divide on
                # entropy not 0, next branch isn't pure, imperfect decision
                root[highest_ig][v] = s[v][t].value_counts().idxmax()
                return root
            else: # if more attr to split on, can recurse
                # recurse on split, dropping root attr
                root[highest_ig][v] = id3(s[v].drop(highest_ig, axis=1), t, f)
    return root

def find_entropy(t):#### ##

    h = 0
    v, n = np.unique(t, return_counts = True) # get values and distinct v
    for x in range(len(v)):
        px = n[x]/np.sum(n)
        h += -px * np.log2(px)
        
        
    
    return h


'''
def find_entropy(t):#######
    h = 0
    v, n = np.unique(t, return_counts = True) # get values and distinct v
    s = n.sum()
    for x in range(len(v)):
        px = (n[x]*(s-n[x]))/s
        
        h = px +h
        
    return h
'''
def find_information_gain(df, t, s):

    total_h = find_entropy(df[t]) # find entropy of entire system
    split_h = 0 # entropy after potential split
    v, n = np.unique(df[s], return_counts = True) # get values and distinct v
    for x in range(len(v)):
        pt = n[x]/np.sum(n)
        split = df.where(df[s] == v[x]).dropna()[t] # remove missing attrs
        split_h += pt * find_entropy(split)
    return total_h - split_h

def make_split(df, t):

    new_df = {}
    for df_key in df.groupby(t).groups.keys():
        new_df[df_key] = df.groupby(t).get_group(df_key)
    return new_df

def find_accuracy(dt, t):

    correct, total = 0, 0
    for _, e in t.iterrows():
        total += 1 # TP+TN+FP+FN
        if e[len(e)-1] == predict_decision(dt, e):
            correct += 1 # TP+TN
    return round(((correct/total)*100), 1)

def predict_decision(dt, e):

	split = list(dt.keys())[0]
	try:
		branch = dt[split][e[split]]
	except KeyError:
		# attribute not found in split
		return None
	if not isinstance(branch, dict): # terminal leaf node/decision
		return branch
	return predict_decision(branch, e) # recurse into sub-dict

def holdout(df, p):

	if 0.00 < p < 1.00:
		d = df.copy()
		train = d.sample(frac=p) # split, and randomize
		test = d.drop(train.index) # remove train data from df
		if len(test) == 0:
			print("Proportion of training examples is too high.\n")
			exit(1)
		return train, test
	else:
		print("\nThe proportion of training examples must be (0.00..1.00).\n")
		exit(1)

def count_leaves(dt, c=[0,0]):

	c[0] += 1
	leaves = dt.keys()
	for leaf in leaves:
		branches = dt[leaf].values()
		for branch in branches:
			if isinstance(branch, dict):
				count_leaves(branch, c)
			else:
				c[1] += 1
	return c

def print_tree(dt, indent=0):

	for key, value in dt.items():
		print("  " * indent + str(key))
		if isinstance(value, dict): # if subdict
			print_tree(value, indent+1)
		else: # otherwise value
			print("  " * (indent+1) + str(value))

def print_statistics(dt, t, tr, te, trs, tes):

	s, d = count_leaves(dt) # splits and decisions
	print(f"Using {trs} training examples and {tes} testing examples.")
	print(f"Tree contains {s} non-leaf nodes and {d} leaf nodes.")
	print("Took {:.2f} seconds to generate.".format(t))
	print(f"Was able to classify {tr}% of training data.")
	print(f"Was able to classify {te}% of testing data.\n")
	


def get_data(h, data):

	try: # singular set of examples
		h = float(h)
		train, test = holdout(data, h)
		print(f"\nUsing holdout style training, {h*100}% training data.")
	except ValueError: # separate train/test examples
		train, test = load_csv(argv[1]), load_csv(argv[2])
		print("\nUsing separate training and testing sets.")
	return train, test


print()

#data = pd.read_csv('./all.csv')
new_data = roughthery(data, 2)
train, test = get_data(0.7, data)
decision_name = train.columns[len(train.columns)-1]
start_time = time.time()
dt = id3(train, decision_name, train.columns[:-1]) # get decision tree
end_time = time.time();
'''
if int(argv[3]) == 1:
    print("\n", dt, "\n") # print decision tree as dict
    print_tree(dt) # print decision tree
    print()
'''
t = end_time-start_time
tr_size = len(train)
te_size = len(test)
tr_ability = find_accuracy(dt, train)
te_ability = find_accuracy(dt, test)
print_statistics(dt, t, tr_ability, te_ability, tr_size, te_size)




Using holdout style training, 70.0% training data.
Using 105 training examples and 45 testing examples.
Tree contains 3 non-leaf nodes and 46 leaf nodes.
Took 0.69 seconds to generate.
Was able to classify 100.0% of training data.
Was able to classify 75.6% of testing data.



In [36]:
def roughthery(data, para): 
    all = data
    all_1 = all[all['target']==0].drop(['target'],axis = 1)
    all_2 = all[all['target']==1].drop(['target'], axis = 1)
    all_1 = all_1.to_numpy()
    all_2 = all_2.to_numpy()
    score = np.zeros(all_1.shape[1])    
    for i in all_1:
        
        score = (~(i==all_2)).sum(0) + score
    
    select = score>((all_1.shape[0]*all_2.shape[0])/para)
    select = np.append(select, True)
    data = data.loc[:, select]
    return data

In [5]:
new_data.head(10)

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_inpatient,number_diagnoses,admission_source_id,diag_1,diag_2,diag_3,insulin,readmitted
0,5,1,41,0,1,0,1,2,1,7,7,1,0
1,15,3,59,0,18,0,9,0,7,1,7,3,1
2,25,2,11,5,13,1,6,0,7,1,7,1,0
3,35,2,44,1,16,0,7,0,7,1,0,3,0
4,45,1,51,0,8,0,5,0,6,6,1,2,0
5,55,3,31,6,16,0,9,2,0,0,1,2,1
6,65,4,70,1,21,0,7,2,0,0,7,2,0
7,75,5,73,0,12,0,8,0,0,8,1,1,1
8,85,13,68,2,28,0,8,1,0,0,7,2,0
9,95,12,33,3,18,0,8,1,0,6,8,2,0


In [10]:
train, test = train_test_split(new_data, test_size = 0.3)

In [15]:
print_tree(dt)

number_inpatient
  0
    number_diagnoses
      1
        num_lab_procedures
          3
            0
          11
            0
          12
            0
          13
            time_in_hospital
              2
                0
              4
                1
          17
            0
          22
            0
          24
            0
          26
            0
          28
            1
          31
            0
          32
            0
          33
            0
          34
            0
          35
            num_medications
              4
                1
              5
                0
              8
                0
          40
            age
              5
                1
              15
                0
          43
            0
          44
            0
          45
            0
          46
            0
          47
            num_medications
              2
                0
              3
                0
              5
                

                0
              16
                0
              17
                0
              20
                0
              21
                0
              23
                0
              27
                1
          24
            num_medications
              5
                0
              7
                0
              9
                0
              10
                age
                  15
                    0
                  75
                    1
              11
                0
              14
                0
              17
                0
              21
                0
          25
            num_medications
              2
                0
              8
                0
              11
                0
              12
                1
              13
                0
              14
                0
              16
                0
              18
                1
              22
                0
          26

                  7
                    0
              1
                time_in_hospital
                  1
                    0
                  2
                    0
                  3
                    1
                  4
                    0
                  5
                    age
                      75
                        1
                      85
                        0
                  7
                    0
                  10
                    1
              2
                1
              3
                1
              4
                0
              6
                1
              7
                0
              8
                1
          31
            num_medications
              5
                0
              6
                0
              7
                1
              8
                age
                  55
                    1
                  75
                    0
                  85
                    

              17
                0
              18
                age
                  55
                    0
                  65
                    1
                  75
                    0
              19
                age
                  55
                    1
                  65
                    1
                  75
                    0
              21
                age
                  55
                    1
                  75
                    0
              22
                0
              24
                0
              25
                0
              28
                age
                  35
                    1
                  55
                    0
              30
                1
          47
            num_medications
              2
                1
              3
                0
              6
                0
              7
                age
                  75
                    0
                  85
    

            num_medications
              2
                age
                  35
                    1
                  95
                    0
              3
                0
              5
                0
              7
                1
              8
                0
              9
                0
              10
                0
              11
                0
              12
                1
              13
                0
              18
                time_in_hospital
                  1
                    1
                  10
                    0
              20
                1
              23
                0
              25
                0
              26
                0
              35
                1
          29
            num_medications
              5
                0
              6
                age
                  65
                    1
                  85
                    0
              7
                1

                    1
                  75
                    1
              8
                0
              11
                age
                  45
                    0
                  65
                    1
              12
                1
              13
                0
          63
            num_medications
              5
                0
              8
                0
              10
                0
              11
                num_procedures
                  0
                    0
                  1
                    1
              12
                1
              13
                0
              14
                time_in_hospital
                  6
                    0
                  7
                    1
                  11
                    1
              15
                1
              16
                0
              17
                1
              18
                age
                  55
                    1


              4
                0
              6
                1
              7
                0
              8
                age
                  75
                    1
                  85
                    0
              9
                age
                  35
                    0
                  55
                    0
                  75
                    1
              10
                age
                  65
                    0
                  75
                    1
              11
                0
              12
                1
              13
                time_in_hospital
                  3
                    1
                  5
                    1
                  7
                    0
              14
                0
              15
                1
              16
                diag_1
                  0
                    0
                  2
                    0
                  4
                    1
      

                0
              25
                1
          30
            num_medications
              4
                0
              5
                1
              7
                0
              8
                0
              9
                num_procedures
                  0
                    0
                  1
                    1
                  2
                    0
              10
                age
                  35
                    1
                  45
                    1
                  55
                    0
                  75
                    0
                  85
                    0
              12
                0
              14
                0
              15
                time_in_hospital
                  1
                    1
                  2
                    0
                  3
                    1
              16
                1
              17
                1
              18
            

                diag_1
                  0
                    1
                  1
                    age
                      65
                        1
                      75
                        0
                  4
                    0
                  7
                    0
                  8
                    0
              6
                num_procedures
                  0
                    insulin
                      0
                        1
                      1
                        age
                          55
                            0
                          65
                            1
                          75
                            0
                          85
                            0
                      2
                        0
                  1
                    1
                  2
                    1
                  3
                    0
                  4
                    1
            

              14
                age
                  55
                    1
                  65
                    1
                  75
                    0
                  95
                    0
              15
                age
                  55
                    1
                  65
                    0
                  75
                    1
                  85
                    1
              16
                0
              17
                age
                  55
                    1
                  65
                    0
                  75
                    1
                  85
                    0
              19
                1
              21
                0
              22
                1
              24
                0
              28
                age
                  65
                    0
                  75
                    1
          23
            num_medications
              6
                0


                    1
              31
                1
              32
                1
          36
            num_medications
              3
                0
              5
                0
              6
                age
                  25
                    1
                  65
                    0
                  75
                    0
              7
                0
              8
                diag_3
                  0
                    num_procedures
                      0
                        1
                      4
                        0
                      6
                        0
                  2
                    1
                  3
                    0
                  6
                    0
                  7
                    0
              9
                time_in_hospital
                  1
                    0
                  7
                    1
                  11
                    1
            

                age
                  35
                    1
                  65
                    0
                  85
                    0
              27
                age
                  65
                    0
                  75
                    0
                  85
                    1
              28
                age
                  55
                    1
                  75
                    1
                  85
                    0
              29
                age
                  55
                    1
                  85
                    0
              30
                age
                  55
                    1
                  75
                    0
              31
                time_in_hospital
                  5
                    0
                  6
                    0
                  7
                    1
                  8
                    0
              32
                0
              34
   

                    0
              11
                diag_2
                  0
                    0
                  1
                    1
                  3
                    1
                  7
                    0
                  8
                    1
              12
                diag_2
                  0
                    0
                  1
                    0
                  7
                    1
                  8
                    1
              13
                time_in_hospital
                  1
                    1
                  2
                    0
                  3
                    age
                      65
                        0
                      85
                        1
                  4
                    0
                  6
                    0
                  8
                    0
              14
                num_procedures
                  0
                    1
                  1
    

                      3
                        0
                  9
                    age
                      55
                        0
                      85
                        1
              16
                age
                  55
                    0
                  65
                    0
                  85
                    1
                  95
                    1
              17
                num_procedures
                  0
                    0
                  1
                    time_in_hospital
                      3
                        1
                      4
                        0
                  2
                    1
                  3
                    0
                  4
                    1
              18
                diag_1
                  0
                    0
                  4
                    0
                  7
                    1
                  8
                    1
              

              33
                num_procedures
                  0
                    1
                  5
                    0
                  6
                    0
              34
                0
              36
                time_in_hospital
                  10
                    0
                  13
                    1
              64
                0
          76
            num_medications
              7
                0
              9
                0
              10
                1
              11
                age
                  35
                    0
                  75
                    1
                  85
                    0
              12
                age
                  25
                    1
                  55
                    1
                  85
                    0
              13
                age
                  55
                    0
                  65
                    0
                  85


                1
              75
                0
          21
            0
          23
            0
          28
            0
          42
            0
      12
        num_medications
          3
            0
          5
            0
          6
            1
          7
            1
          9
            age
              45
                0
              75
                1
              85
                1
          10
            time_in_hospital
              1
                0
              2
                0
              3
                1
              4
                0
          11
            age
              45
                1
              75
                0
          14
            age
              65
                0
              85
                1
          15
            0
          16
            age
              65
                0
              75
                1
          17
            0
          21
            1
          23


                0
          15
            time_in_hospital
              2
                number_diagnoses
                  4
                    1
                  9
                    0
              3
                age
                  45
                    0
                  65
                    0
                  85
                    1
              4
                0
              5
                1
              6
                1
              8
                age
                  65
                    1
                  85
                    0
              9
                1
              12
                1
              14
                1
          16
            time_in_hospital
              1
                0
              4
                1
          17
            number_diagnoses
              2
                1
              4
                1
              6
                1
              9
                0
          18
            a

                0
              2
                1
              3
                0
              4
                age
                  55
                    0
                  65
                    1
              5
                1
              6
                1
              8
                0
          21
            num_procedures
              0
                0
              1
                0
              2
                0
              4
                1
              5
                1
          22
            age
              45
                0
              55
                1
              65
                1
              75
                0
              95
                1
          23
            time_in_hospital
              3
                1
              13
                0
          24
            0
          25
            1
          26
            1
          27
            0
          28
            time_in_hospital
              4

              65
                1
              75
                1
              85
                0
              95
                1
          22
            time_in_hospital
              5
                1
              6
                0
          23
            1
          24
            0
          25
            time_in_hospital
              4
                0
              5
                1
              7
                0
              11
                0
          26
            0
          27
            1
          28
            1
          29
            age
              65
                0
              75
                1
              85
                0
          30
            age
              55
                1
              95
                0
          33
            0
          34
            0
          37
            0
          40
            0
      59
        num_medications
          3
            0
          4
            1
          

            age
              75
                1
              85
                0
          14
            age
              45
                0
              55
                1
          15
            1
          17
            1
          19
            0
          20
            1
          26
            1
      17
        diag_3
          0
            1
          1
            1
          2
            1
          3
            1
          5
            1
          6
            0
          7
            0
          8
            1
      18
        num_medications
          2
            0
          9
            age
              75
                0
              95
                1
          11
            1
          13
            1
          15
            0
          17
            0
          18
            0
          20
            1
          25
            0
          37
            0
      19
        age
          35
            1
          45
            1


                1
              85
                0
          11
            age
              55
                0
              75
                1
          12
            0
      68
        num_medications
          7
            0
          9
            0
          11
            time_in_hospital
              3
                0
              5
                1
              6
                0
          12
            1
          14
            1
          15
            1
          16
            0
          17
            time_in_hospital
              3
                0
              9
                1
              13
                1
          18
            age
              65
                0
              75
                1
          19
            time_in_hospital
              3
                0
              4
                1
              5
                1
              9
                0
          20
            1
          21
            1
        

          35
            1
          75
            0
      29
        age
          45
            1
          55
            1
          75
            0
      30
        num_medications
          5
            age
              45
                1
              65
                0
          10
            1
          11
            0
          12
            1
          13
            0
          14
            1
          17
            1
          21
            0
      31
        age
          45
            1
          55
            1
          65
            0
          75
            1
          85
            1
          95
            1
      32
        1
      33
        time_in_hospital
          1
            1
          3
            1
          6
            0
          7
            1
      34
        age
          45
            1
          75
            1
          85
            0
      35
        1
      36
        diag_3
          0
            1
          1
 