In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored in Drive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Read the play-tennis CSV from the mounted Drive folder into a DataFrame.
# The bare `train_data` on the last line makes the notebook display the frame.
train_data = pd.read_csv('/content/drive/My Drive/play_tennis(1).csv')
train_data

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,day,outlook,temp,humidity,wind,play
1,D1,Sunny,Hot,High,Weak,No
2,D2,Sunny,Hot,High,Strong,No
3,D3,Overcast,Hot,High,Weak,Yes
4,D4,Rain,Mild,High,Weak,Yes
5,D5,Rain,Cool,Normal,Weak,Yes
6,D6,Rain,Cool,Normal,Strong,No
7,D7,Overcast,Cool,Normal,Strong,Yes
8,D8,Sunny,Mild,High,Weak,No
9,D9,Sunny,Cool,Normal,Weak,Yes


In [4]:
# Collect the distinct values of the 'play' column (the class labels).
class_list = {label for label in train_data['play']}
class_list

{'No', 'Yes'}

In [14]:
def calc_total_entropy(train_data, names, class_list):
    """Return the base-2 Shannon entropy of the label column of ``train_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows to measure.
    names : str
        Name of the label column (compared against each entry of ``class_list``).
    class_list : iterable
        Class labels to account for. Labels that do not occur in
        ``train_data`` contribute nothing to the result.

    Returns
    -------
    float
        Sum over classes of ``-p * log2(p)``; ``0.0`` for an empty frame.
    """
    total_row = train_data.shape[0]
    if total_row == 0:
        # No rows: avoid dividing by zero and report zero entropy.
        return 0.0

    total_entr = 0
    for c in class_list:
        total_class_count = train_data[train_data[names] == c].shape[0]
        if total_class_count == 0:
            # 0 * log2(0) evaluates to NaN in floating point; an absent class
            # must contribute 0, matching what calc_entropy does.
            continue
        class_probability = total_class_count / total_row
        total_entr += -class_probability * np.log2(class_probability)

    return total_entr

In [21]:
def calc_entropy(feature_value_data, names, class_list):
    """Return the base-2 entropy of the label column within one subset of rows.

    ``feature_value_data`` is the subset, ``names`` is the label column and
    ``class_list`` the class labels to tally. Classes with no rows in the
    subset are skipped, so they add nothing to the sum.
    """
    n_rows = feature_value_data.shape[0]
    labels = feature_value_data[names]

    result = 0
    for cls in class_list:
        hits = len(feature_value_data[labels == cls])
        if hits == 0:
            continue
        share = hits / n_rows
        result += -share * np.log2(share)
    return result

In [15]:
def calc_info_gain(feature_name, train_data, names, class_list):
    """Return the information gain obtained by splitting on ``feature_name``.

    The gain is the entropy of the label column ``names`` over all of
    ``train_data`` minus the size-weighted entropy of each subset produced by
    the distinct values of ``feature_name``.
    """
    n_rows = train_data.shape[0]
    weighted_entropy = 0.0

    for value in train_data[feature_name].unique():
        subset = train_data[train_data[feature_name] == value]
        # Weight each subset's entropy by the fraction of rows it holds.
        weight = subset.shape[0] / n_rows
        weighted_entropy += weight * calc_entropy(subset, names, class_list)

    baseline = calc_total_entropy(train_data, names, class_list)
    return baseline - weighted_entropy

In [20]:
def find_most_informative_feature(train_data, names, class_list):
    """Return the column of ``train_data`` with the highest information gain.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows to evaluate; every column except ``names`` is a candidate feature.
    names : str
        Name of the label column, excluded from the candidates.
    class_list : iterable
        Class labels forwarded to ``calc_info_gain``.

    Returns
    -------
    The winning column name, or ``None`` when there are no candidate columns.
    On ties the first column in frame order wins (strict ``<`` comparison).
    """
    # The original dropped an undefined global `label`; the label column is
    # the `names` parameter.
    feature_list = train_data.columns.drop(names)

    max_info_gain = -1
    max_info_feature = None

    for feature in feature_list:
        feature_info_gain = calc_info_gain(feature, train_data, names, class_list)
        if max_info_gain < feature_info_gain:
            max_info_gain = feature_info_gain
            max_info_feature = feature

    return max_info_feature

In [19]:
def generate_sub_tree(feature_name, train_data, names, class_list):
    """Build one level of the tree for ``feature_name``.

    Parameters
    ----------
    feature_name : str
        Column to branch on.
    train_data : pandas.DataFrame
        Rows available at this node.
    names : str
        Name of the label column.
    class_list : iterable
        Class labels to test each branch against.

    Returns
    -------
    (dict, pandas.DataFrame)
        The dict maps each value of ``feature_name`` to a class label when all
        rows with that value share the label, or to ``"?"`` when the branch is
        still mixed. The frame is ``train_data`` with the rows of the resolved
        (pure) branches removed.
    """
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
    tree = {}

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for feature_value, count in feature_value_count_dict.items():
        feature_value_data = train_data[train_data[feature_name] == feature_value]

        assigned_to_node = False
        for c in class_list:
            class_count = feature_value_data[feature_value_data[names] == c].shape[0]

            if class_count == count:
                # Every row with this value belongs to class c: make it a leaf
                # and drop those rows so deeper levels do not revisit them.
                tree[feature_value] = c
                train_data = train_data[train_data[feature_name] != feature_value]
                assigned_to_node = True
        if not assigned_to_node:
            # Mixed classes: mark the branch for further expansion.
            tree[feature_value] = "?"

    return tree, train_data

In [18]:
def make_tree(root, prev_feature_value, train_data, names, class_list):
    """Recursively grow the decision tree inside ``root`` (modified in place).

    Parameters
    ----------
    root : dict
        Dict to attach the new sub-tree to.
    prev_feature_value
        Key in ``root`` under which the sub-tree is stored, or ``None`` for
        the top-level call, where the sub-tree is stored under the chosen
        feature name directly.
    train_data : pandas.DataFrame
        Rows that reach this node; nothing happens when it is empty.
    names : str
        Name of the label column.
    class_list : iterable
        Class labels.

    Returns
    -------
    None
    """
    if train_data.shape[0] == 0:
        return

    incoming_rows = train_data.shape[0]
    max_info_feature = find_most_informative_feature(train_data, names, class_list)
    tree, train_data = generate_sub_tree(max_info_feature, train_data, names, class_list)

    if prev_feature_value is not None:
        root[prev_feature_value] = dict()
        root[prev_feature_value][max_info_feature] = tree
        next_root = root[prev_feature_value][max_info_feature]
    else:
        root[max_info_feature] = tree
        next_root = root[max_info_feature]

    # Iterate over a snapshot because recursive calls rewrite entries of next_root.
    for node, branch in list(next_root.items()):
        if branch == "?":
            feature_value_data = train_data[train_data[max_info_feature] == node]
            if feature_value_data.shape[0] == incoming_rows:
                # The best split kept every row in one mixed branch, so a
                # recursive call would receive the same frame and never
                # terminate. Fall back to a majority-class leaf.
                next_root[node] = feature_value_data[names].mode().iloc[0]
                continue
            make_tree(next_root, node, feature_value_data, names, class_list)

In [17]:
def id3(train_data_m, names):
    """Build an ID3 decision tree for ``train_data_m`` and return it as nested dicts.

    Parameters
    ----------
    train_data_m : pandas.DataFrame
        Training rows; every column other than ``names`` is treated as a feature.
    names : str
        Name of the label column.

    Returns
    -------
    dict
        The tree populated by ``make_tree``.
    """
    # Work on a copy so the caller's frame is never touched by tree building.
    train_data = train_data_m.copy()
    tree = {}
    class_list = train_data[names].unique()
    # Pass the copy; the original code made it but then passed train_data_m.
    make_tree(tree, None, train_data, names, class_list)
    return tree