In [67]:
##Sheing Jing NG DECISION TREE 10/13/2016
##Objective: Perform machine learning experiments, supervised learning and implementation of 
##           decision tree algorithm on German Credit Data sets.
##Goals: To get a good prediction on "Creditability" after implementing decision tree training.

import pandas
import random
import math
from sklearn import cross_validation
from collections import Counter

#the class with functions to be used as entry points when
#either training (fit) or predicting (predict) with the
#decision tree algorithm
class DTree:
    def fit(self,predictor_columns_data,target_column_data):
        self.__root_node = DNode(predictor_columns_data,target_column_data)
        self.__root_node.train()
        
    def predict(self,df_of_new_examples):
        #apply the predict function to the whole series, one at a time, this returns the series with the return vals
        predictions = df_of_new_examples.apply(self.__root_node.predict,axis=1)
        return predictions
        
    def print_tree(self):
        self.__root_node.print_node()
        

#A class for representing non-leaf nodes in the decision tree
class DNode:
    
    #when we create this node, we pass it training examples to be used at this point
    #the predictor columns of these training examples is in predictor_columns_data
    #the corresponding target values to those predictor columns are in target_column_data
    def __init__(self,predictor_columns_data,target_column_data):
        
        self.__attribute = ''  #the attribute used to sort examples at this node
        self.__predictor_columns = predictor_columns_data #the training examples that have been sorted to this node
        self.__target_column = target_column_data #the corresponding target values for the training examples
        self.__child_nodes = {} #dictionary of the child nodes of this node, indexed by the value they have for self.__attribute
        self.__most_common_value_here = '' #for keeping track of which target value is most common among the examples at this node. This is used to make a decision when there's no appropriate child node to follow
        
    #this should use the training data to determine the best attribute to use
    #as is, it just chooses one at random, but you will fix it to use information gain
    def choose_attribute(self):
        self.__attribute = random.choice(self.__predictor_columns.columns.values) #what a terrible way to choose the attribute!
        
               
    def train(self):
        self.choose_attribute() #'best' attribute at this node
        
        #in case we need to make a decision here because we don't have any children with a particular attribute value    
        self.__most_common_value_here = self.__target_column.value_counts().idxmax()
        
        #gets all the values that these examples have in our chosen column
        attribute_values_here = self.__predictor_columns[self.__attribute].unique()

        #going through all possible values this attribute can have
        #and creating the appropriate child node
        for value in attribute_values_here: 
             
            #the subset of examples with the given value
            examples_for_child_predictor_cols = self.__predictor_columns[self.__predictor_columns[self.__attribute] == value] 
            examples_for_child_target_col = self.__target_column[self.__predictor_columns[self.__attribute] == value] #target values corresponding to the subset of examples with the given value
            
            #we grabbed the values from the examples themselves, so there should
            #be at least one example that has each value, but just in case there isn't
            #I don't want to crash the program
            if examples_for_child_target_col.empty:
                print("error: we shouldn't get here")
                
            #there are no columns left to use for decisions at the child
            #so lets make a leage node based on the most common target value in those examples
            elif len(examples_for_child_predictor_cols.columns.values) == 1:  
                #create a child with the most common target value here
                leaf_child = DLeaf( self.__most_common_value_here )
                self.__child_nodes[value] = leaf_child
                
            #if all child examples have the same target value, we make a leaf node
            elif len(examples_for_child_target_col.unique()) == 1: #all child examples have same class
                leaf_child = DLeaf( examples_for_child_target_col.unique()[0] ) #make leaf with that class
                self.__child_nodes[value] = leaf_child #put the leaf in the dictionary of children nodes
                
            else: #we have a regular decision node for this attribute value
                #get rid of the column for this attribute so it can't be selected again
                examples_for_child_predictor_cols = examples_for_child_predictor_cols.drop(self.__attribute,1) 
                
                new_child = DNode(examples_for_child_predictor_cols,examples_for_child_target_col)
                new_child.train() #generate the rest of the subtree for this child
                self.__child_nodes[value] = new_child #put the new child node in the dictionary of children nodes
            

    #print out the tree - not the prettiest, but you can see it.
    def print_node(self,num_indents = 0):
        for i in range(num_indents): 
            print(" ",end=''), #print with no newline
        print(self.__attribute)
        for attr in self.__child_nodes.keys():
            for i in range(num_indents): 
                print("|", end='')
            print(":"+attr)
            self.__child_nodes[attr].print_node(num_indents+1)
            
    #make a prediction for a single new example
    #this only makes sense to call after the tree has been build (with train())
    def predict(self,new_example):
        #look up the right branch in our dictionary of children
        if new_example[self.__attribute] in self.__child_nodes:
            node_on_corresponding_branch = self.__child_nodes[new_example[self.__attribute]]
            return node_on_corresponding_branch.predict(new_example) #recursively call predict on the child node
        else:
            return self.__most_common_value_here #there was no child, so we predict the most common class of the examples at this node
        
#class for representing a leaf node in the tree
class DLeaf:
    
    #when we create the node, all we need to know is what we're going to predict if we get here
    def __init__(self,val_in_target_col):
        self.__target_value = val_in_target_col
    
    #just returns the prediction for a new example, 
    #this was probably called from predict() of a regular node one level up in the tree
    def predict(self,new_example):
        return self.__target_value
    
    #for displaying the tree
    def print_node(self,num_indents = 0):
        for i in range(num_indents): 
            print(" ",end='')
        print("LEAF:",self.__target_value)
        
    
#simply compares two Pandas series and returns the proportion that match
#this can be used to compute the accuracy of the prediction list against
#the actual target column
def accuracy(series1, series2):
    correct = 0.0
    for index, value in series1.iteritems():
        if value == series2.loc[index]:
            correct += 1
    return (correct/len(series1))
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
###Question1:
###(10 points) In class, an example was shown for how to use the decision tree, using only 3 columns for prediction. Update it to use all of the categorical columns. 
###Discuss what you did and record how well the algorithm performs in your write-up (be specific - what was the actual accuracy, not a vague opinion of how well you thought it did or "about" what the accuracy was).
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


import math
credit_data= pandas.read_csv('german_credit.csv')

train_data, test_data = cross_validation.train_test_split(credit_data,test_size = 0.1)
    
attributes_to_use=['Status of existing checking account','Credit history',\
                   'Purpose','Savings account/bonds','Present employment since',\
                    'Personal status and sex','Other debtors / guarantors',\
                    'Property','Other installment plans','Housing',\
                    'Job','Telephone','foreign worker',\
                    ]
                    

my_tree = DTree() 
my_tree.fit(train_data[attributes_to_use],train_data['Creditability'])
my_tree.print_tree()
predictions = my_tree.predict(test_data[attributes_to_use])
print(accuracy(test_data['Creditability'],predictions))


Job
:A174
 Present employment since
|:A72
  Purpose
||:A40
   Status of existing checking account
|||:A11
    LEAF: 2
|||:A14
    LEAF: 1
|||:A12
    LEAF: 2
||:A42
   Telephone
|||:A192
    Property
||||:A122
     LEAF: 1
||||:A124
     LEAF: 2
||||:A123
     LEAF: 1
|||:A191
    LEAF: 1
||:A41
   Status of existing checking account
|||:A14
    LEAF: 1
|||:A12
    LEAF: 2
||:A43
   Personal status and sex
|||:A93
    LEAF: 2
|||:A94
    LEAF: 1
|||:A92
    LEAF: 1
|:A74
  Telephone
||:A192
   Status of existing checking account
|||:A11
    Personal status and sex
||||:A93
     LEAF: 1
||||:A94
     LEAF: 1
||||:A92
     LEAF: 2
|||:A14
    LEAF: 1
|||:A12
    Housing
||||:A151
     LEAF: 1
||||:A152
     Personal status and sex
|||||:A93
      LEAF: 2
|||||:A91
      LEAF: 1
|||||:A92
      LEAF: 1
||:A191
   LEAF: 1
|:A75
  Purpose
||:A40
   Other debtors / guarantors
|||:A101
    Telephone
||||:A192
     Status of existing checking account
|||||:A11
      Savings account/bonds
|||||

In [95]:
import pandas
import random
import math
from sklearn import cross_validation
from collections import Counter

#the class with functions to be used as entry points when
#either training (fit) or predicting (predict) with the
#decision tree algorithm
class DTree:
    def fit(self,Data,predictor_columns_data,target_column_data):
        self.__root_node = DNode(predictor_columns_data,target_column_data)
        self.__root_node.train()
        
    def predict(self,df_of_new_examples):
        #apply the predict function to the whole series, one at a time, this returns the series with the return vals
        predictions = df_of_new_examples.apply(self.__root_node.predict,axis=1)
        return predictions
        
    def print_tree(self):
        self.__root_node.print_node()
        

#A class for representing non-leaf nodes in the decision tree
class DNode:
    
    #when we create this node, we pass it training examples to be used at this point
    #the predictor columns of these training examples is in predictor_columns_data
    #the corresponding target values to those predictor columns are in target_column_data
    def __init__(self,predictor_columns_data,target_column_data):
        
        self.__attribute = ''  #the attribute used to sort examples at this node
        self.__predictor_columns = predictor_columns_data #the training examples that have been sorted to this node
        self.__target_column = target_column_data #the corresponding target values for the training examples
        self.__child_nodes = {} #dictionary of the child nodes of this node, indexed by the value they have for self.__attribute
        self.__most_common_value_here = '' #for keeping track of which target value is most common among the examples at this node. This is used to make a decision when there's no appropriate child node to follow
        
    #this should use the training data to determine the best attribute to use
    #as is, it just chooses one at random, but you will fix it to use information gain
    def choose_attribute(self):
        #self.__attribute = random.choice(self.__predictor_columns.columns.values) #what a terrible way to choose the attribute!
        #Me,Gary,Yee Jun discussed and came out with this code. However, we have problems while integrating it into 
        #Choose_Attribute. It works perfectly fine when we run it seperately and the highest information gain attribute should be
        #Status of Existing Checking Account.
        
        
        min_exp=100
        for i in self.__predictor_columns:
            print(i)
            p_1 = Counter(subset1(i))
            p_2 = Counter(subset2(i))
            print(p_1)
            print(p_2)

            expected_entropy=0
    
            for k in p_1.keys():
                x = p_1.get(k)
                y = p_2.get(k)
                print(x)
                print(y)

                entropy = -(x/(x+y)) * math.log((x/(x+y)), 2) - (y/(x+y)) * math.log((y/(x+y)), 2)
                print(entropy)
                expected_entropy += ((x+y)/len(self.__target_column)) * entropy
                #expected_count += expected_entropy
    
                print(expected_entropy)
                print("@@@@@@@@@@@@@@@@@@@@@@@@@@")
            if expected_entropy<min_exp:
                min_exp=expected_entropy
                print(min_exp)
                print("######################")
                self.__attribute=i
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
       
    """
    Oct09-12 2016
    My previous choose_attribute
    
   
    def choose_attribute(self):
            smallestExpEnt=100
            expEntropy=0
        #for i in a:
        #self.__attribute = random.choice(self.__predictor_columns.columns.values) #what a terrible way to choose the attribute!
            for i in self.__predictor_columns:
                exp_count=Counter(self.__predictor_columns[i]) #count the frequencies in the column data
        
                exp_values=exp_count.values() #Get the freq values in the column data
                print(exp_values)
                length_numerator=len(exp_values) #Length of [freq values] eg: {3,4,5}, length = 3
        
                expEntropy+=(length_numerator/len(self.__target_column))*(Entropy(self.__target_column))#calling entropy function
                print(expEntropy)
            if expEntropy < smallestExpEnt:
                smallestExpEnt=expEntropy
                self.__attribute=i
                print(self.__attribute)
                print("@@@@@@@@@@@@@")
                
    Comment: It works fine, but I didn't subset the attributes_to_use columns based on "Creditability"==1 or "Creditability==2",
    the concept of expected entropy is Wrong here.
    """            
                                        
    def train(self):
        self.choose_attribute() #'best' attribute at this node
        
        #in case we need to make a decision here because we don't have any children with a particular attribute value    
        self.__most_common_value_here = self.__target_column.value_counts().idxmax()
        
        #gets all the values that these examples have in our chosen column
        attribute_values_here = self.__predictor_columns[self.__attribute].unique()

        #going through all possible values this attribute can have
        #and creating the appropriate child node
        for value in attribute_values_here: 
             
            #the subset of examples with the given value
            examples_for_child_predictor_cols = self.__predictor_columns[self.__predictor_columns[self.__attribute] == value] 
            examples_for_child_target_col = self.__target_column[self.__predictor_columns[self.__attribute] == value] #target values corresponding to the subset of examples with the given value
            
            #we grabbed the values from the examples themselves, so there should
            #be at least one example that has each value, but just in case there isn't
            #I don't want to crash the program
            if examples_for_child_target_col.empty:
                print("error: we shouldn't get here")
                
            #there are no columns left to use for decisions at the child
            #so lets make a leage node based on the most common target value in those examples
            elif len(examples_for_child_predictor_cols.columns.values) == 1:  
                #create a child with the most common target value here
                leaf_child = DLeaf( self.__most_common_value_here )
                self.__child_nodes[value] = leaf_child
                
            #if all child examples have the same target value, we make a leaf node
            elif len(examples_for_child_target_col.unique()) == 1: #all child examples have same class
                leaf_child = DLeaf( examples_for_child_target_col.unique()[0] ) #make leaf with that class
                self.__child_nodes[value] = leaf_child #put the leaf in the dictionary of children nodes
                
            else: #we have a regular decision node for this attribute value
                #get rid of the column for this attribute so it can't be selected again
                examples_for_child_predictor_cols = examples_for_child_predictor_cols.drop(self.__attribute,1) 
                
                new_child = DNode(examples_for_child_predictor_cols,examples_for_child_target_col)
                new_child.train() #generate the rest of the subtree for this child
                self.__child_nodes[value] = new_child #put the new child node in the dictionary of children nodes
            

    #print out the tree - not the prettiest, but you can see it.
    def print_node(self,num_indents = 0):
        for i in range(num_indents): 
            print(" ",end=''), #print with no newline
        print(self.__attribute)
        for attr in self.__child_nodes.keys():
            for i in range(num_indents): 
                print("|", end='')
            print(":"+attr)
            self.__child_nodes[attr].print_node(num_indents+1)
            
    #make a prediction for a single new example
    #this only makes sense to call after the tree has been build (with train())
    def predict(self,new_example):
        #look up the right branch in our dictionary of children
        if new_example[self.__attribute] in self.__child_nodes:
            node_on_corresponding_branch = self.__child_nodes[new_example[self.__attribute]]
            return node_on_corresponding_branch.predict(new_example) #recursively call predict on the child node
        else:
            return self.__most_common_value_here #there was no child, so we predict the most common class of the examples at this node
        
#class for representing a leaf node in the tree
class DLeaf:
    
    #when we create the node, all we need to know is what we're going to predict if we get here
    def __init__(self,val_in_target_col):
        self.__target_value = val_in_target_col
    
    #just returns the prediction for a new example, 
    #this was probably called from predict() of a regular node one level up in the tree
    def predict(self,new_example):
        return self.__target_value
    
    #for displaying the tree
    def print_node(self,num_indents = 0):
        for i in range(num_indents): 
            print(" ",end='')
        print("LEAF:",self.__target_value)
        
    
#simply compares two Pandas series and returns the proportion that match
#this can be used to compute the accuracy of the prediction list against
#the actual target column
def accuracy(series1, series2):
    correct = 0.0
    for index, value in series1.iteritems():
        if value == series2.loc[index]:
            correct += 1
    return (correct/len(series1))

##Reference from: https://rosettacode.org/wiki/Entropy

def Entropy(self):
    #Email Reply from Prof.Manley.
    #Expected entropy is something you should pass an attribute column to,
    #but the expected_entropy function should also call your entropy function,
    #so you’d need to pass both a regular column and the target column to expected_entropy
    entropy=0  
    p, lens = Counter(self),float(len(self))        
    #p.values() contains dict_values of Counter. 
    #Example: counter results = [a:3,b:41,c:500], p.values() will print out [3,41,500]
    return -sum( count/lens * math.log(count/lens, 2)for count in p.values())
    ## -sum(3/(3+41+500)+ln2(3/(3+41+500))
    
def subset1(self):
    train1=train_data[train_data.Creditability==1]
    
    return train1[self]

def subset2(self):
    train2=train_data[train_data.Creditability==2]
    return train2[self]
#'''

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#####Question2:
###(15 points) Enhance the decision tree code so that the best attribute is selected based on the attribute with the highest information gain (rather than randomly as it is in the starter code).
###Discuss what you did and record the performance in your write-up.#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


import math
credit_data= pandas.read_csv('german_credit.csv')

train_data, test_data = cross_validation.train_test_split(credit_data,test_size = 0.1)
    
attributes_to_use=['Status of existing checking account','Credit history',\
                   'Purpose','Savings account/bonds','Present employment since',\
                    'Personal status and sex','Other debtors / guarantors',\
                    'Property','Other installment plans','Housing',\
                    'Job','Telephone','foreign worker',\
                    ]
                    

my_tree = DTree() 
my_tree.fit(train_data,train_data[attributes_to_use],train_data['Creditability'])
#my_tree.print_tree()
predictions = my_tree.predict(test_data[attributes_to_use])
print(accuracy(test_data['Creditability'],predictions))

#print(subset1('Job'))

#print(subset2(train_data['Job']))





Status of existing checking account
Counter({'A14': 318, 'A12': 142, 'A11': 125, 'A13': 45})
Counter({'A11': 120, 'A12': 95, 'A14': 42, 'A13': 13})
125
120
0.999699542856517
0.2721404311109407
@@@@@@@@@@@@@@@@@@@@@@@@@@
318
42
0.5197027865043055
0.4800215457126629
@@@@@@@@@@@@@@@@@@@@@@@@@@
45
13
0.7676515870125797
0.5294924257645848
@@@@@@@@@@@@@@@@@@@@@@@@@@
142
95
0.9714420937881119
0.7853055104621209
@@@@@@@@@@@@@@@@@@@@@@@@@@
0.7853055104621209
######################
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Credit history
Counter({'A32': 328, 'A34': 214, 'A33': 53, 'A31': 20, 'A30': 15})
Counter({'A32': 151, 'A34': 44, 'A33': 26, 'A31': 26, 'A30': 23})
53
26
0.9140185106642176
0.0802305137138591
@@@@@@@@@@@@@@@@@@@@@@@@@@
20
26
0.9876925088958034
0.1307125752796446
@@@@@@@@@@@@@@@@@@@@@@@@@@
15
23
0.9677884628267679
0.17157475482121926
@@@@@@@@@@@@@@@@@@@@@@@@@@
328
151
0.8991299184860375
0.6501116781043437
@@@@@@@@@@@@@@@@@@@@@@@@@@
214
44
0.65

KeyError: ''

In [114]:
import pandas
import random
import math
from sklearn import cross_validation
from collections import Counter

#the class with functions to be used as entry points when
#either training (fit) or predicting (predict) with the
#decision tree algorithm
class DTree:
    def fit(self,Data,predictor_columns_data,target_column_data):
        self.__root_node = DNode(predictor_columns_data,target_column_data)
        self.__root_node.train()
        
    def predict(self,df_of_new_examples):
        #apply the predict function to the whole series, one at a time, this returns the series with the return vals
        predictions = df_of_new_examples.apply(self.__root_node.predict,axis=1)
        return predictions
        
    def print_tree(self):
        self.__root_node.print_node()
        

#A class for representing non-leaf nodes in the decision tree
class DNode:
    
    #when we create this node, we pass it training examples to be used at this point
    #the predictor columns of these training examples is in predictor_columns_data
    #the corresponding target values to those predictor columns are in target_column_data
    def __init__(self,predictor_columns_data,target_column_data):
        
        self.__attribute = ''  #the attribute used to sort examples at this node
        self.__predictor_columns = predictor_columns_data #the training examples that have been sorted to this node
        self.__target_column = target_column_data #the corresponding target values for the training examples
        self.__child_nodes = {} #dictionary of the child nodes of this node, indexed by the value they have for self.__attribute
        self.__most_common_value_here = '' #for keeping track of which target value is most common among the examples at this node. This is used to make a decision when there's no appropriate child node to follow
        
    #this should use the training data to determine the best attribute to use
    #as is, it just chooses one at random, but you will fix it to use information gain
    

    #Oct09-12 2016
    #My previous choose_attribute
    
   
    def choose_attribute(self):
            smallestExpEnt=100
            expEntropy=0
        #for i in a:
        #self.__attribute = random.choice(self.__predictor_columns.columns.values) #what a terrible way to choose the attribute!
            for i in self.__predictor_columns:
                exp_count=Counter(self.__predictor_columns[i]) #count the frequencies in the column data
        
                exp_values=exp_count.values() #Get the freq values in the column data
                print(exp_values)
                length_numerator=len(exp_values) #Length of [freq values] eg: {3,4,5}, length = 3
        
                expEntropy+=(length_numerator/len(self.__target_column))*(Entropy(self.__target_column))#calling entropy function
                print(expEntropy)
            if expEntropy < smallestExpEnt:
                smallestExpEnt=expEntropy
                self.__attribute=i
                print(self.__attribute)
                print("@@@@@@@@@@@@@")
                
    #Comment: It works fine, but I didn't subset the attributes_to_use columns based on "Creditability"==1 or "Creditability==2",
    #the concept of expected entropy is Wrong here.
              
                                        
    def train(self):
        self.choose_attribute() #'best' attribute at this node
        
        #in case we need to make a decision here because we don't have any children with a particular attribute value    
        self.__most_common_value_here = self.__target_column.value_counts().idxmax()
        
        #gets all the values that these examples have in our chosen column
        attribute_values_here = self.__predictor_columns[self.__attribute].unique()

        #going through all possible values this attribute can have
        #and creating the appropriate child node
        for value in attribute_values_here: 
             
            #the subset of examples with the given value
            examples_for_child_predictor_cols = self.__predictor_columns[self.__predictor_columns[self.__attribute] == value] 
            examples_for_child_target_col = self.__target_column[self.__predictor_columns[self.__attribute] == value] #target values corresponding to the subset of examples with the given value
            
            #we grabbed the values from the examples themselves, so there should
            #be at least one example that has each value, but just in case there isn't
            #I don't want to crash the program
            if examples_for_child_target_col.empty:
                print("error: we shouldn't get here")
                
            #there are no columns left to use for decisions at the child
            #so lets make a leage node based on the most common target value in those examples
            elif len(examples_for_child_predictor_cols.columns.values) == 1:  
                #create a child with the most common target value here
                leaf_child = DLeaf( self.__most_common_value_here )
                self.__child_nodes[value] = leaf_child
                
            #if all child examples have the same target value, we make a leaf node
            elif len(examples_for_child_target_col.unique()) == 1: #all child examples have same class
                leaf_child = DLeaf( examples_for_child_target_col.unique()[0] ) #make leaf with that class
                self.__child_nodes[value] = leaf_child #put the leaf in the dictionary of children nodes
                
            else: #we have a regular decision node for this attribute value
                #get rid of the column for this attribute so it can't be selected again
                examples_for_child_predictor_cols = examples_for_child_predictor_cols.drop(self.__attribute,1) 
                
                new_child = DNode(examples_for_child_predictor_cols,examples_for_child_target_col)
                new_child.train() #generate the rest of the subtree for this child
                self.__child_nodes[value] = new_child #put the new child node in the dictionary of children nodes
            

    #print out the tree - not the prettiest, but you can see it.
    def print_node(self,num_indents = 0):
        for i in range(num_indents): 
            print(" ",end=''), #print with no newline
        print(self.__attribute)
        for attr in self.__child_nodes.keys():
            for i in range(num_indents): 
                print("|", end='')
            print(":"+attr)
            self.__child_nodes[attr].print_node(num_indents+1)
            
    #make a prediction for a single new example
    #this only makes sense to call after the tree has been build (with train())
    def predict(self,new_example):
        #look up the right branch in our dictionary of children
        if new_example[self.__attribute] in self.__child_nodes:
            node_on_corresponding_branch = self.__child_nodes[new_example[self.__attribute]]
            return node_on_corresponding_branch.predict(new_example) #recursively call predict on the child node
        else:
            return self.__most_common_value_here #there was no child, so we predict the most common class of the examples at this node
        
#class for representing a leaf node in the tree
class DLeaf:
    
    #when we create the node, all we need to know is what we're going to predict if we get here
    def __init__(self,val_in_target_col):
        self.__target_value = val_in_target_col
    
    #just returns the prediction for a new example, 
    #this was probably called from predict() of a regular node one level up in the tree
    def predict(self,new_example):
        return self.__target_value
    
    #for displaying the tree
    def print_node(self,num_indents = 0):
        for i in range(num_indents): 
            print(" ",end='')
        print("LEAF:",self.__target_value)
        
    
#simply compares two Pandas series and returns the proportion that match
#this can be used to compute the accuracy of the prediction list against
#the actual target column
def accuracy(series1, series2):
    correct = 0.0
    for index, value in series1.iteritems():
        if value == series2.loc[index]:
            correct += 1
    return (correct/len(series1))

##Reference from: https://rosettacode.org/wiki/Entropy

def Entropy(self):
    #Email Reply from Prof.Manley.
    #Expected entropy is something you should pass an attribute column to,
    #but the expected_entropy function should also call your entropy function,
    #so you’d need to pass both a regular column and the target column to expected_entropy
    entropy=0  
    p, lens = Counter(self),float(len(self))        
    #p.values() contains dict_values of Counter. 
    #Example: counter results = [a:3,b:41,c:500], p.values() will print out [3,41,500]
    return -sum( count/lens * math.log(count/lens, 2)for count in p.values())
    ## -sum(3/(3+41+500)+ln2(3/(3+41+500))
    
def subset1(self):
    train1=train_data[train_data.Creditability==1]
    
    return train1[self]

def subset2(self):
    train2=train_data[train_data.Creditability==2]
    return train2[self]
#'''

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#####Question3:
#####Bin up the numerical columns and use them for training and prediction
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


import math
credit_data= pandas.read_csv('german_credit.csv')

train_data, test_data = cross_validation.train_test_split(credit_data,test_size = 0.1)
    
attributes_to_use=['Status of existing checking account','Duration in month','Credit history','Purpose','Credit amount','Savings account/bonds','Present employment since','Installment rate in percentage of disposable income','Personal status and sex','Other debtors / guarantors','Present residence since','Property','Age in years','Other installment plans','Housing','Number of existing credits at this bank','Job','Number of people being liable to provide maintenance for','Telephone','foreign worker']
   

for i in credit_data['Age in years']:
    if i>=65:
        i = '65 on up'
    elif i>=55:
        i = '55-64'
    elif i>=45:
        i = '45-54'
    elif i>=35:
        i = '35-44'
    elif i>=25:
        i = '25-34'
    else:
        i = '24 or younger'
        #print(i)

# print credit_data['Duration in month'].max()
# print credit_data['Duration in month'].min()
for i in credit_data['Duration in month']:
    if i>=65:
        i = '65 on up'
    elif i>=55:
        i = '55-64'
    elif i>=45:
        i = '45-54'
    elif i>=35:
        i = '35-44'
    elif i>=25:
        i = '25-34'
    elif i>=15:
        i = '15-24'
    elif i>5:
        i='6-14'
    else:
        i='5 or smaller'
        #print(i)
    # print credit_data['Credit amount'].max()
    # print credit_data['Credit amount'].min()
for i in credit_data['Credit amount']:
    if i >= 0.9 * credit_data['Credit amount'].max():
        i = (str(0.9 * credit_data['Credit amount'].max())) +" - "+str(credit_data['Credit amount'].max())
    elif i >= 0.8 * credit_data['Credit amount'].max():
        i = str(0.8 * credit_data['Credit amount'].max()) + " - " + str(0.9 * credit_data['Credit amount'].max())
    elif i >= 0.7 * credit_data['Credit amount'].max():
        i = str(0.7 * credit_data['Credit amount'].max()) + " - " + str(0.8 * credit_data['Credit amount'].max())
    elif i >= 0.6 * credit_data['Credit amount'].max():
        i = str(0.6 * credit_data['Credit amount'].max()) + " - " + str(0.7 * credit_data['Credit amount'].max())
    elif i >= 0.5 * credit_data['Credit amount'].max():
        i = str(0.5 * credit_data['Credit amount'].max()) + " - " + str(0.6 * credit_data['Credit amount'].max())
    elif i >= 0.4 * credit_data['Credit amount'].max():
        i = str(0.4 * credit_data['Credit amount'].max()) + " - " + str(0.5 * credit_data['Credit amount'].max())
    elif i >= 0.3 * credit_data['Credit amount'].max():
        i = str(0.3 * credit_data['Credit amount'].max()) + " - " + str(0.4 * credit_data['Credit amount'].max())
    elif i >= 0.2 * credit_data['Credit amount'].max():
        i = str(0.2 * credit_data['Credit amount'].max()) + " - " + str(0.3 * credit_data['Credit amount'].max())
    elif i >= 0.1 * credit_data['Credit amount'].max():
        i = str(0.1 * credit_data['Credit amount'].max()) + " - " + str(0.2 * credit_data['Credit amount'].max())
    else:
        i = str(0.0 * credit_data['Credit amount'].max()) + " - " + str(0.1 * credit_data['Credit amount'].max())
    print(i)
    # print credit_data['Installment rate in percentage of disposable income'].max()
    # print credit_data['Installment rate in percentage of disposable income'].min()
for i in credit_data['Installment rate in percentage of disposable income']:
    if i == 4:
        i = '4'
    elif i == 3:
        i = '3'
    elif i == 2:
        i = '2'
    else:
        i = '1'
    #print(i)
# print credit_data['Present residence since'].max()
# print credit_data['Present residence since'].min()
for i in credit_data['Present residence since']:
    if i == 4:
        i = '4'
    elif i == 3:
        i = '3'
    elif i == 2:
        i = '2'
    else:
        i = '1'
    #print(i)
# print credit_data['Number of existing credits at this bank'].max()
# print credit_data['Number of existing credits at this bank'].min()
for i in credit_data['Number of existing credits at this bank']:
    if i == 4:
        i = '4'
    elif i == 3:
        i = '3'
    elif i == 2:
        i = '2'
    else:
        i = '1'
    #print(i)
for i in credit_data['Number of people being liable to provide maintenance for']:
    if i==2:
        i='2'
    elif i==1:
        i='1'
for i in credit_data['Creditability']:
    if i==2:
        i='2'
    elif i==1:
        i='1'    
    #print(i)
#my_tree = DTree() 
#my_tree.fit(train_data,train_data[attributes_to_use],train_data['Creditability'])
#my_tree.print_tree()
print credit_data("Credit Amount")
predictions = my_tree.predict(test_data[attributes_to_use])
print(accuracy(test_data['Creditability'],predictions))
#print(subset1('Job'))

#print(subset2(train_data['Job']))





0.0 - 1842.4
5527.2 - 7369.6
1842.4 - 3684.8
7369.6 - 9212.0
3684.8 - 5527.2
7369.6 - 9212.0
1842.4 - 3684.8
5527.2 - 7369.6
1842.4 - 3684.8
3684.8 - 5527.2
0.0 - 1842.4
3684.8 - 5527.2
0.0 - 1842.4
0.0 - 1842.4
0.0 - 1842.4
0.0 - 1842.4
1842.4 - 3684.8
7369.6 - 9212.0
11054.4 - 12896.8
1842.4 - 3684.8
1842.4 - 3684.8
1842.4 - 3684.8
1842.4 - 3684.8
0.0 - 1842.4
1842.4 - 3684.8
0.0 - 1842.4
0.0 - 1842.4
0.0 - 1842.4
1842.4 - 3684.8
5527.2 - 7369.6
1842.4 - 3684.8
3684.8 - 5527.2
5527.2 - 7369.6
0.0 - 1842.4
0.0 - 1842.4
3684.8 - 5527.2
5527.2 - 7369.6
1842.4 - 3684.8
0.0 - 1842.4
0.0 - 1842.4
1842.4 - 3684.8
0.0 - 1842.4
5527.2 - 7369.6
5527.2 - 7369.6
5527.2 - 7369.6
0.0 - 1842.4
1842.4 - 3684.8
0.0 - 1842.4
5527.2 - 7369.6
1842.4 - 3684.8
1842.4 - 3684.8
5527.2 - 7369.6
0.0 - 1842.4
1842.4 - 3684.8
1842.4 - 3684.8
0.0 - 1842.4
5527.2 - 7369.6
9212.0 - 11054.4
1842.4 - 3684.8
5527.2 - 7369.6
0.0 - 1842.4
0.0 - 1842.4
1842.4 - 3684.8
12896.8 - 14739.2
1842.4 - 3684.8
3684.8 - 5527.2
18