In [57]:
import pandas as pd
import numpy as np
import math

In [58]:
data = pd.read_csv("ML Lab 4 Data - Sheet1.csv")
data = data.drop(['Day'], axis=1)
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [59]:
def unique_vals(dataset, col):
    return set([row[col] for row in dataset])
header = data.columns
header

Index(['Outlook', 'Temperature', 'Humidity', 'Wind', 'Decision'], dtype='object')

In [60]:
data = data.values.tolist()

In [61]:
data

[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]

In [62]:
def class_counts(dataset):
    counts = {}
    for row in dataset:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [63]:
class_counts(data)

{'No': 5, 'Yes': 9}

In [64]:
x = class_counts(data)
print(x['Yes'])

9


In [65]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

In [66]:
class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [67]:
def partition(datset, question):
    true_rows, false_rows = [], []
    for row in datset:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [70]:
def split_dataset(dataset, question):
    new_data = []
    for row in dataset:
        if question.match(row):
            new_data.append(row)
    new_data = pd.DataFrame(new_data)
    new_data.drop(new_data.columns[[question.column]], axis=1, inplace=True)
    new_data = new_data.values.tolist()
    return new_data

In [71]:
split_dataset(data, Question(0, 'Rain'))

[['Mild', 'High', 'Weak', 'Yes'],
 ['Cool', 'Normal', 'Weak', 'Yes'],
 ['Cool', 'Normal', 'Strong', 'No'],
 ['Mild', 'Normal', 'Weak', 'Yes'],
 ['Mild', 'High', 'Strong', 'No']]

In [72]:
data

[['Sunny', 'Hot', 'High', 'Weak', 'No'],
 ['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]

In [73]:
len(split_dataset(data, Question(0, 'Rain')))

5

In [74]:
z = split_dataset(data, Question(0, 'Rain'))
class_counts(z)

{'Yes': 3, 'No': 2}

In [75]:
def entropy(dataset):
    counts = class_counts(dataset)
    entrop = 0
    for lbl in counts:
        entrop -= counts[lbl]/float(len(dataset))*math.log2(counts[lbl]/float(len(dataset)))
    return entrop

In [76]:
current_entropy = entropy(data)
current_entropy

0.9402859586706311

In [77]:
def info_gain(dataset, att_index):
    current_entropy = entropy(dataset)
    information_gain = current_entropy
    values = unique_vals(dataset, att_index)
    for val in values:
        new_data = split_dataset(dataset, Question(att_index, val))
        en_nd = entropy(new_data)
        information_gain -= ((len(new_data)/float(len(dataset)))*en_nd)
    return information_gain
            

In [78]:
info_gain(data, 0)

0.24674981977443933

In [79]:
info_gain(data,1)

0.02922256565895487

In [80]:
info_gain(data,2)

0.15183550136234159

In [81]:
def find_best_split(dataset):
    best_gain = 0
    best_index = None
    n_features = len(dataset)-1
    for col in range(n_features):
        ig = info_gain(dataset, col)
        if ig > best_gain:
            best_gain = ig
            best_index = col
    return best_gain, best_index

In [82]:
best_gain, best_index = find_best_split(data)
print(best_gain)
print(best_index)

IndexError: list index out of range