In [6]:
training_data = [
    ['green', 3, 'mango'],
    ['yellow', 2, 'banana'],
    ['green', 3, 'apple'],
    ['red', 1, 'strawberry']
]

data_format = ['color', 'radius', 'name']

def unique_values(rows, column):
    return set([row[column] for row in rows])

def class_counts(rows):
    columns = {}
    for row in rows:
        label = row[-1]
        if label not in columns:
            columns[label] = 0
        else:
            columns[label] +=1
    return columns

def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

class Question():
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s" % (
            data_format[self.column], condition, str(self.value)
        )

def partition(rows, question):
    true, false = [], []
    for row in rows:
        if question.match(row):
            true.append(row)
        else:
            false.append(row)
    return true, false

def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob = counts[lbl]
        impurity -= prob**2
    return impurity

def info_gained(left, right, current):
    p = float(len(left))/ (len(left) + len(right))
    return current - p * gini(left) - (1-p)*gini(right)

def find_best(rows):
    best_gain = 0
    best_question = None
    current = gini(rows)
    n_features = len(rows[0])-1

    for col in range(n_features):
        values - set([row[col] for row in rows])
        for val in values:
            question = Question(col, val)
            true , false = partition(rows, question)
            if len(true) == 0 or len(false) == 0:
                continue
            gain = info_gained(true, false, current)

            if gain >= best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question

class Leaf:
    def __init__(self, rows):
        self.predictions = class_counts(rows)

class Decision_Node:
    def __init__(self, question, true, false):
        self.question = question
        self.true_branch = true
        self.false_branch = false

def build_tree(rows):
    gain, question = find_best(rows)
    if gain == 0:
        return Leaf(rows)
    true, false = partition(rows, question)
    true_branch = build_tree(true)
    false_branch = build_tree(false)
    return Decision_Node(question, true_branch, false_branch)

def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return

    print(spacing + str(node.question))
    print(spacing+"--> True:")
    print_tree(node.true, spacing + "  ")
    print(spacing+"--> False:")
    print_tree(node.false, spacing + "  ")

def classify(row, node):
    if isinstance(node, Leaf)
        return node.prediction

    if node.question.match(row):
        return classify(row, node.true)
    else:
        return classify(row, node.false)

def print_leaf(rows):
    total = sum(rows.values()) * 1.0
    probs = {}
    for l in rows.keys():
        probs[l] = str(int(rows[l]/total*100)) + "%"
    return probs

# this exmaple should contain some test data



SyntaxError: ignored