In [118]:
# Toy training set. Each row is [color, diameter, label]; the last column is
# the class label that the tree learns to predict.
# Three rows share the features ('Red', 1) but carry different labels, so no
# question can separate them and they end up together in a single leaf.
# NOTE(review): 'Redish' looks like a typo for 'Radish' -- confirm before changing,
# since the label text appears verbatim in the printed predictions.
training_data=[
    ['Green', 3, 'Mango'],
    ['Yellow', 3, 'Mango'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Cherry'],
    ['Red', 1, 'Redish'],
    ['Yellow', 1, 'Lemon'],
    ['Yellow', 5, 'Banana'],
    ['Red', 3, 'Apple'],
]

In [119]:
# Column labels, indexed by column position. Only used when rendering a
# Question as text (Question.__repr__ looks up header[column]).
header = ['color', 'diameter', 'label']

In [120]:
# Defining a function to get the unique values found in one column
def unique_vals(rows, col):
    """Collect the distinct values that appear in column `col` of `rows`.

    Args:
        rows: iterable of indexable records.
        col: index of the column to inspect.

    Returns:
        A set of the values seen in that column (duplicates dropped,
        ordering not preserved).
    """
    distinct = set()
    for record in rows:
        distinct.add(record[col])
    return distinct

##############################
#Demo:
#unique_vals(training_data, 0)
##############################

In [121]:
# Defining a function to get class count
def class_counts(rows):
    """Count how many rows carry each label.

    The label is taken from the last element of every row.

    Args:
        rows: iterable of indexable records whose last element is the label.

    Returns:
        A plain dict mapping label -> number of rows with that label, with
        keys in order of first appearance. Empty input yields an empty dict.
    """
    counts = {}
    for row in rows:
        # In our sample dataset, last column is our label
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts

##################################
#Demo:
#class_counts(training_data)
##################################

In [122]:
# Defining a function to check numerical value
def is_numeric(value):
    """Tell whether `value` is an int or a float.

    Note that bool is a subclass of int in Python, so True/False are also
    reported as numeric.
    """
    return isinstance(value, (int, float))

##############
#Demo:
#is_numeric(3)
##############

In [123]:
class Question:
    """A yes/no test on one column, used to split a dataset in two.

    A question stores a column index (e.g. 0, 1) and a reference value
    (e.g. 'Green', 3). `match` checks a single example against it, and
    `__repr__` renders it as readable text using the module-level `header`.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Return whether `example` satisfies this question.

        Numeric feature values are tested with `>=`, everything else with `==`.
        """
        candidate = example[self.column]
        # The comparison kind is chosen from the example's value,
        # not from the value stored in the question.
        if not is_numeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        # Here the operator shown is chosen from the stored value.
        operator = ">=" if is_numeric(self.value) else "=="
        return "Is %s %s %s?" % (header[self.column], operator, str(self.value))

In [124]:
### Defining Partition function
def partition(rows, question):
    """Split `rows` into those that satisfy `question` and those that do not.

    Args:
        rows: iterable of records.
        question: object exposing `match(row)`.

    Returns:
        A `(true_rows, false_rows)` tuple of lists; each list keeps the
        original relative order of its rows.
    """
    buckets = {True: [], False: []}
    for record in rows:
        # match() is called exactly once per row
        buckets[bool(question.match(record))].append(record)
    return buckets[True], buckets[False]

In [125]:
### Defining Gini Impurity Function
def gini(rows):
    """Compute the Gini impurity of a list of rows.

    Starts from 1 and subtracts the squared share of each label, where the
    label tallies come from `class_counts`. With no rows there are no labels
    to subtract, so the result is 1.
    """
    total = float(len(rows))
    remaining = 1
    for tally in class_counts(rows).values():
        share = tally / total
        remaining = remaining - share ** 2
    return remaining

In [126]:
### Defining Information Gain Function
def information_gain(left, right, current_uncertainity):
    """Impurity of the parent node minus the weighted impurity of its children.

    Args:
        left: rows in the first child.
        right: rows in the second child.
        current_uncertainity: impurity of the node before the split.

    Returns:
        The reduction in Gini impurity achieved by the split, with each
        child weighted by its share of the rows.
    """
    n_left = float(len(left))
    n_right = float(len(right))
    p = n_left / (n_left + n_right)
    left_term = p * gini(left)
    right_term = (1 - p) * gini(right)
    return current_uncertainity - left_term - right_term

In [127]:
### Defining the Best-Split Search Function
def find_best_split(rows):
    """Find the question that yields the highest information gain.

    Every (column, value) pair present in `rows` is tried as a Question;
    the last column is treated as the label and is never split on.

    Args:
        rows: list of records whose last element is the label.

    Returns:
        A `(best_gain, best_question)` tuple. When `rows` is empty, or no
        question both separates the rows and improves on the current
        impurity, the result is `(0, None)`.
    """
    # An empty dataset has no columns to inspect (rows[0] would raise), so
    # report "no gain"; build_tree turns that into a leaf.
    if len(rows) == 0:
        return 0, None

    best_gain = 0
    best_question = None
    current_uncertainity = gini(rows)
    n_features = len(rows[0]) - 1  # exclude the trailing label column

    for col in range(n_features):
        for val in unique_vals(rows, col):
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)

            # A question that leaves one side empty does not split anything.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = information_gain(true_rows, false_rows, current_uncertainity)

            # Strict comparison: on ties the first question found is kept.
            if gain > best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question
            
                
            

In [128]:
class Leaf:
    """Terminal node of the tree.

    Holds `predictions`: the label tallies (as returned by `class_counts`)
    of the training rows that reached this node.
    """

    def __init__(self, rows):
        label_tally = class_counts(rows)
        self.predictions = label_tally

In [129]:
class Decision_Node:
    """Internal node of the tree.

    Stores the question asked at this node together with the subtree to
    follow when the question matches (`true_branch`) and the subtree to
    follow when it does not (`false_branch`).
    """

    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch, self.false_branch = true_branch, false_branch

In [130]:
def build_tree(rows):
    """Recursively grow a decision tree from `rows`.

    Asks `find_best_split` for the best question; when it reports zero gain
    the rows become a Leaf, otherwise the rows are partitioned and a subtree
    is built for each side (matching side first).
    """
    gain, question = find_best_split(rows)
    if gain != 0:
        matched, unmatched = partition(rows, question)
        # Arguments are evaluated left to right: true branch, then false branch.
        return Decision_Node(question, build_tree(matched), build_tree(unmatched))
    return Leaf(rows)

In [131]:
def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by five spaces.

    Leaves are printed as "Predict" followed by their label tallies; decision
    nodes print their question, then the true branch, then the false branch.
    """
    if not isinstance(node, Leaf):
        print(spacing + str(node.question))
        deeper = spacing + "     "
        branches = (
            ('-->True:', node.true_branch),
            ('-->False:', node.false_branch),
        )
        for heading, child in branches:
            print(spacing + heading)
            print_tree(child, deeper)
        return
    print(spacing + "Predict" + str(node.predictions))

In [132]:
def classify(row, node):
    """Walk the tree from `node` down to a leaf for `row`.

    At each decision node the question is evaluated against `row` to pick
    the true or false branch. Returns the `predictions` of the leaf reached.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

In [133]:
def print_leaf(counts):
    """Convert a leaf's label tallies into percentage strings.

    Despite the name, nothing is printed: the function returns a new dict
    mapping each label to its share of the total, formatted like '50.0 %'.
    """
    total = sum(counts.values()) * 1.0
    return {
        label: str((int(tally) / total) * 100) + ' %'
        for label, tally in counts.items()
    }

In [134]:
# Demo: train a tree on training_data, print it, then classify a few rows.
if __name__ == "__main__":    
    
    # Build the tree once; the name my_tree is reused by a later cell.
    my_tree = build_tree(training_data)
    
    print_tree(my_tree)
    
    # Rows use the same [color, diameter, label] layout as training_data.
    # The label is only echoed as "Actual"; it is never compared with the
    # prediction, so no accuracy is computed here.
    # NOTE(review): 'cherry' is lower-case here but 'Cherry' in training_data,
    # and ['Yellow', 5, 'Mango'] has the same features as the training row
    # labelled 'Banana' -- confirm whether these are intentional.
    testing_data = [
        ['Green', 3, 'Mango'],
        ['Yellow', 5, 'Mango'],
        ['Red', 1, 'cherry'],
        ['Red', 1, 'Grape'],
        ['Yellow', 2, 'Lemon']
    ]
    
    for row in testing_data:
        # print_leaf returns the leaf's tallies as percentage strings.
        print("Actual: %s. Predicted: %s " %(row[-1], print_leaf(classify(row, my_tree))))

Is color == Red?
-->True:
     Is diameter >= 3?
     -->True:
          Predict{'Apple': 1}
     -->False:
          Predict{'Grape': 1, 'Cherry': 1, 'Redish': 1}
-->False:
     Is diameter >= 3?
     -->True:
          Is diameter >= 5?
          -->True:
               Predict{'Banana': 1}
          -->False:
               Predict{'Mango': 2}
     -->False:
          Predict{'Lemon': 1}
Actual: Mango. Predicted: {'Mango': '100.0 %'} 
Actual: Mango. Predicted: {'Banana': '100.0 %'} 
Actual: cherry. Predicted: {'Grape': '33.33333333333333 %', 'Cherry': '33.33333333333333 %', 'Redish': '33.33333333333333 %'} 
Actual: Grape. Predicted: {'Grape': '33.33333333333333 %', 'Cherry': '33.33333333333333 %', 'Redish': '33.33333333333333 %'} 
Actual: Lemon. Predicted: {'Lemon': '100.0 %'} 


In [113]:
# Rebuilds the tree from training_data and prints it again.
# NOTE(review): this cell's execution count (113) is lower than the cells above
# (118-134), and its saved output shows {'Grape': 2} although training_data now
# has a single 'Grape' row -- the output appears to predate the current data.
# Re-run after "Restart & Run All", or remove this duplicate cell.
my_tree = build_tree(training_data)    
print_tree(my_tree)

Is color == Red?
-->True:
     Is diameter >= 3?
     -->True:
          Predict{'Apple': 1}
     -->False:
          Predict{'Grape': 2}
-->False:
     Is diameter >= 3?
     -->True:
          Is diameter >= 5?
          -->True:
               Predict{'Banana': 1}
          -->False:
               Predict{'Mango': 2}
     -->False:
          Predict{'Lemon': 1}
