In [39]:
import pandas as pd
import numpy as np

In [40]:
data = """Outlook Temp. Humidity Wind Decision 
Sunny Hot High Weak No
Sunny Hot High Strong No
Overcast Hot High Weak Yes
Rain Mild High Weak Yes
Rain Cool Normal Weak Yes
Rain Cool Normal Strong No
Overcast Cool Normal Strong Yes
Sunny Mild High Weak No
Sunny Cool Normal Weak Yes
Rain Mild Normal Weak Yes
Sunny Mild Normal Strong Yes
Overcast Mild High Strong Yes
Overcast Hot Normal Weak Yes
Rain Mild High Strong No"""

In [41]:
characters = data.split("\n")

In [42]:
title = characters.pop(0).split(" ")
del characters[0]


In [43]:
chars = [character.split(" ") for character in characters]

In [44]:
chars

[['Sunny', 'Hot', 'High', 'Strong', 'No'],
 ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
 ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
 ['Sunny', 'Mild', 'High', 'Weak', 'No'],
 ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
 ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
 ['Rain', 'Mild', 'High', 'Strong', 'No']]

In [45]:
title

['Outlook', 'Temp.', 'Humidity', 'Wind', 'Decision', '']

In [46]:
data = pd.DataFrame(chars, columns = title[:-1])
data.head()
unique_items = dict()
for col in data.columns:
    unique_items[col] = np.array(data[col].unique())
# for col in data.columns:
#     data[f"_{col}"] = data[col].apply(lambda x: list(data[col].unique()).index(x))

In [47]:
data

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
0,Sunny,Hot,High,Strong,No
1,Overcast,Hot,High,Weak,Yes
2,Rain,Mild,High,Weak,Yes
3,Rain,Cool,Normal,Weak,Yes
4,Rain,Cool,Normal,Strong,No
5,Overcast,Cool,Normal,Strong,Yes
6,Sunny,Mild,High,Weak,No
7,Sunny,Cool,Normal,Weak,Yes
8,Rain,Mild,Normal,Weak,Yes
9,Sunny,Mild,Normal,Strong,Yes


In [48]:
data_series = data["Outlook"]
explored_set = list()

In [49]:
def word_count(data):
    items = data.unique()
    count = dict()
    #iterating through dict and initializing the count value of each unique item in column
    for item in items:
        count[item] = 0

    # iterating though the column and updating the each word count
    for i in data:
        for item in count:
            if i == item:
                count[item] += 1
    return count
    

In [50]:
col_word_frequency = dict()
for col in data.columns:
    col_word_frequency[col] = word_count(data[col])
    

In [51]:
col_word_frequency

{'Outlook': {'Sunny': 4, 'Overcast': 4, 'Rain': 5},
 'Temp.': {'Hot': 3, 'Mild': 6, 'Cool': 4},
 'Humidity': {'High': 6, 'Normal': 7},
 'Wind': {'Strong': 6, 'Weak': 7},
 'Decision': {'No': 4, 'Yes': 9}}

In [52]:
def check_positive_count(col, item = None):
    items = dict()
    if item:
        total_item = len(data[data[col] == item])
        positive_item = len(data[(data[col] == item) & (data["Decision"] == "Yes")])
        items = {"total" : total_item, "positive" : positive_item}
    else:
        total_item = len(data[col])
        positive_item = len((data[data["Decision"] == "Yes"]))
        items = {"positive" : positive_item, "total" : total_item}
    return items
    

In [53]:
check_positive_count("Outlook")

{'positive': 9, 'total': 13}

In [54]:
check_positive_count("Outlook", "Rain")

{'total': 5, 'positive': 3}

In [55]:
def get_column_entropy(col):
    unique_items = data[col].unique()
    entropy = 0
    for item in unique_items:
        positive_count = check_positive_count(col, item)
        positive_probablity = positive_count.get('positive')/positive_count.get('total')
        negative_probablity = (positive_count.get('total')-positive_count.get('positive'))/positive_count.get('total')
        new_entropy = -(positive_probablity)*np.log(positive_probablity)-(negative_probablity)*np.log(negative_probablity)
        if new_entropy > entropy:
            entropy = new_entropy
    return entropy

In [56]:
def get_root_count(data):
    columns = data["Decision"]
    count = dict()
    positive_no = len(data[data["Decision"] == "Yes"])
    count["positive"] = positive_no
    count["total"] = len(data["Decision"])
    return count

In [57]:
get_root_count(data)

{'positive': 9, 'total': 13}

In [58]:
def calculate_entropy(positive, total):
    if total== 0:
        return total
    positive_probablity = positive/total
    negative_probablity = (total-positive)/total
    entropy = -positive_probablity*np.log(positive_probablity)-negative_probablity*np.log(negative_probablity)
    return entropy

In [59]:
def calculate_gain(root_entropy, child_node_count, total_count, column):
    node_sum = 0
    for key, value in child_node_count.items():
        node_entropy = get_entropy(column, key)
        node_value = child_node_count[key]/total_count * node_entropy
        node_sum+= node_value      
    return root_entropy - node_sum
    

In [60]:
def get_column_details(column):
    column_count = check_positive_count(column)
    column_entropy = calculate_entropy(**column_count)
    child_node_count = word_count(data[column])

    column_details = {"Column" : column, "Count":column_count, 
                      "Column_entropy": column_entropy, "Node Count":child_node_count}

    node_entropy_list = []
    column_gain = column_entropy
    for key, value in child_node_count.items():
        node_count = check_positive_count(column, key)
        node_entropy = calculate_entropy(**node_count) if not np.isnan(calculate_entropy(**node_count)) else 0
        node_entropy_dict={"name" : key, "count" : node_count, "entropy":node_entropy}
        node_entropy_list.append(node_entropy_dict)
        column_gain -= (node_count['total']/column_count['total']) * node_entropy
    column_details["node_entropy"] = node_entropy_list
    column_details["gain"] = column_gain
    return column_details
    


In [61]:
data.columns

Index(['Outlook', 'Temp.', 'Humidity', 'Wind', 'Decision'], dtype='object')

In [62]:
get_column_details("Wind")

{'Column': 'Wind',
 'Count': {'positive': 9, 'total': 13},
 'Column_entropy': 0.6172417697303416,
 'Node Count': {'Strong': 6, 'Weak': 7},
 'node_entropy': [{'name': 'Strong',
   'count': {'total': 6, 'positive': 3},
   'entropy': 0.6931471805599453},
  {'name': 'Weak',
   'count': {'total': 7, 'positive': 6},
   'entropy': 0.410116318288409}],
 'gain': 0.07649582270122357}

In [63]:
def get_highest_gain():
    gain = 0
    high_gain_col = ""
    for col in data.columns[:-1]:
        if (col not in explored_set):
            new_gain = get_column_details(col)["gain"]
            if(new_gain > gain):
                gain = new_gain
                high_gain_col = col
    print("column : ", high_gain_col, "highest gain : ", gain)

In [64]:
get_highest_gain()

column :  Outlook highest gain :  0.145115073016029


  
  


In [65]:
column = "Outlook"
for node in data[column].unique():
    print(node)

Sunny
Overcast
Rain


In [66]:
for col in data.columns:
    print(col)
    for node in data[col].unique():
        print("    ",node)
    print()

Outlook
     Sunny
     Overcast
     Rain

Temp.
     Hot
     Mild
     Cool

Humidity
     High
     Normal

Wind
     Strong
     Weak

Decision
     No
     Yes



In [67]:
def get_child_count(parent_column,parent_node, current_column, current_node):
    total_count = len(data[(data[parent_column] == parent_node) & (data[current_column] == current_node)])
    positive_count = len(data[(data[parent_column] == parent_node) & 
                              (data[current_column] == current_node) & (data["Decision"] == "Yes")])
#     print("Current Node: ", f"{parent_node}-{current_node}","positive: ", positive_count, "total: ", total_count)
    return ({ "positive": positive_count, "total": total_count})

In [68]:
def get_child_nodes_entropy(column, node, child_column):
    child_node_list = []
    for node_2 in data[child_column].unique():
        child_column_count = get_child_count(column, node, child_column, node_2)
#         print("node", node,"node2 : ",node_2, "column_count", child_column_count)
        child_entropy = calculate_entropy(**child_column_count) if not np.isnan(calculate_entropy(**child_column_count)) else 0

        count_name = f"{node}-{node_2}"
        node2_dict = dict()
        node2_dict[count_name] = {"total": child_column_count['total'], "entropy": child_entropy}
        child_node_list.append(node2_dict)
    return child_node_list

In [69]:
def calculate_child_gain(column, node, child_column):
    explored_set.append(column)
    parent_count = check_positive_count(column, node)
    parent_entropy = calculate_entropy(**parent_count)
    
#     print("Parent count", parent_count)
#     print(parent_count)
#     print("column:", column)
#     print("Parent entropy:", parent_entropy)
    
    get_child_nodes = get_child_nodes_entropy(column, node, child_column)
    
    gain = parent_entropy 
#     print("Child Node Entropy: \n", get_child_nodes)



    for i in get_child_nodes:
        for key, value in i.items():
            total = i[key]["total"]
            entropy = i[key]['entropy']
            foo = (total/parent_count['total']) * entropy
            gain -= foo
#     print("child_gain :", gain )
    return gain

In [70]:
def get_remaining_column(explored_column_list):
    column_list = set(data.columns[:-1])
    new_column_list = set()
    if(len(explored_column_list) == 0):
        return column_list
    
    for i in column_list:
        if i not in explored_column_list:
            new_column_list.add(i);
    return new_column_list


In [71]:
data

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
0,Sunny,Hot,High,Strong,No
1,Overcast,Hot,High,Weak,Yes
2,Rain,Mild,High,Weak,Yes
3,Rain,Cool,Normal,Weak,Yes
4,Rain,Cool,Normal,Strong,No
5,Overcast,Cool,Normal,Strong,Yes
6,Sunny,Mild,High,Weak,No
7,Sunny,Cool,Normal,Weak,Yes
8,Rain,Mild,Normal,Weak,Yes
9,Sunny,Mild,Normal,Strong,Yes


In [72]:
def get_child_highest_gain(column, node):
    gain = 0;
    selected_combo = dict()
    for i, col in enumerate(data.columns[:-1]):
        if col != column and col not in explored_set:
#             print("column: ", column, "node: ", node, "col: ", col)
            new_gain = calculate_child_gain(column, node, col)
            if new_gain > gain:
                gain = new_gain
                selected_combo["column"] = col
                selected_combo["gain"] = gain
    return selected_combo

In [73]:
"""
For accurate result must restart and run all in kernel!!!
For accurate result must restart and run all in kernel!!!
For accurate result must restart and run all in kernel!!!
"""

column = "Outlook"
node = "Rain"
highest_gain = get_child_highest_gain(column, node)

explored_set.append(highest_gain['column'])
print(highest_gain)

{'column': 'Wind', 'gain': 0.6730116670092565}


  
  


In [82]:
col1 = "Outlook"
node1 = "Sunny"
col2 = "Humidity"
node2 = "Normal"
def get_select_list(col1, node1, col2, node2):
    return data[(data[col1] == node1) & (data[col2] == node2)]
get_select_list(col1, node1, col2, node2)

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
7,Sunny,Cool,Normal,Weak,Yes
9,Sunny,Mild,Normal,Strong,Yes


In [79]:
data[data["Outlook"] == "Overcast"]

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
1,Overcast,Hot,High,Weak,Yes
5,Overcast,Cool,Normal,Strong,Yes
10,Overcast,Mild,High,Strong,Yes
11,Overcast,Hot,Normal,Weak,Yes
