REQUIRED LIBRARIES

In [123]:
import pandas as pd
import numpy as np
import math

DATASET

In [124]:
data = pd.read_csv("ML Lab 4 Data - Sheet1.csv")
data = data.drop(['Day'], axis=1)
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [125]:
def unique_vals(dataset, col):
    return set([row[col] for row in dataset])

In [126]:
header = data.columns
header

Index(['Outlook', 'Temperature', 'Humidity', 'Wind', 'Decision'], dtype='object')

In [127]:
data = data.values.tolist()

In [128]:
unique_vals(data,0)

{'Overcast', 'Rain', 'Sunny'}

In [129]:
def class_counts(dataset):
    counts = {}
    for row in dataset:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [130]:
class_counts(data)

{'No': 5, 'Yes': 9}

In [131]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

In [132]:
class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [133]:
Question(2, 'Normal')

Is Humidity == Normal?

In [134]:
q = Question(0, 'Sunny')

In [135]:
eg = data[0]
q.match(eg)

True

In [136]:
def partition(datset, question):
    true_rows, false_rows = [], []
    for row in datset:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [137]:
true_rows, false_rows = partition(data, Question(2, 'Normal'))
true_rows

[['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes']]

In [138]:
false_rows

[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]

In [139]:
def gini(dataset):
    counts = class_counts(dataset)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(dataset))
        impurity -= prob_of_lbl**2
    return impurity

In [140]:
def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

In [141]:
current_uncertainity = gini(data)
current_uncertainity

0.4591836734693877

In [142]:
current_uncertainity = gini(data)
true_rows, false_rows = partition(data, Question(0, 'Rain'))
info_gain(true_rows, false_rows, current_uncertainity)

0.0020408163265306367

In [143]:
true_rows, false_rows = partition(data, Question(0,'Sunny'))
info_gain(true_rows, false_rows, current_uncertainity)

0.0655328798185941

In [144]:
def find_best_split(dataset):
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep train of the feature / value that produced it
    current_uncertainty = gini(dataset)
    n_features = len(dataset[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature

        values = set([row[col] for row in dataset])  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(dataset, question)

            # Skip this split if it doesn't divide the
            # dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [145]:
best_gain, best_question = find_best_split(data)

In [146]:
print(best_gain)
print(best_question)

0.10204081632653056
Is Outlook == Overcast?


In [147]:
class Leaf:
    def __init__(self, dataset):
        self.predictions = class_counts(dataset)

In [148]:
class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [149]:
def build_tree(dataset):
    gain, question = find_best_split(dataset)
    if gain == 0:
        return Leaf(dataset)
    true_rows, false_rows = partition(dataset, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows)
    return Decision_Node(question, true_branch, false_branch)

In [150]:
def print_tree(node, spacing=""):
    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return

    # Print the question at this node
    print (spacing + str(node.question))

    # Call this function recursively on the true branch
    print (spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print (spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

In [151]:
my_tree = build_tree(data)

In [152]:
print_tree(my_tree)

Is Outlook == Overcast?
--> True:
  Predict {'Yes': 4}
--> False:
  Is Humidity == Normal?
  --> True:
    Is Wind == Strong?
    --> True:
      Is Temperature == Mild?
      --> True:
        Predict {'Yes': 1}
      --> False:
        Predict {'No': 1}
    --> False:
      Predict {'Yes': 3}
  --> False:
    Is Outlook == Sunny?
    --> True:
      Predict {'No': 3}
    --> False:
      Is Wind == Strong?
      --> True:
        Predict {'No': 1}
      --> False:
        Predict {'Yes': 1}


In [153]:
def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [154]:
test_data = ['Sunny', 'Cool', 'High', 'Weak']
classify(test_data, my_tree)

{'No': 3}

In [155]:
test_data = ['Sunny', 'Cool', 'Normal', 'Weak']
classify(test_data, my_tree)

{'Yes': 3}