In [1]:
import numpy as np
import pandas as pd

train = {'outlook': ['sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'rainy', 'overcast', 
    'sunny', 'sunny', 'rainy', 'sunny', 'overcast', 'overcast', 'rainy',],
    'temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'humidity': [85, 90, 86, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 91],
    'windy': [False, True, False, False, False, True, True, False, False, False, True, True, False, True],
    'play': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

train_df = pd.DataFrame(train)
train_df

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes
5,rainy,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rainy,75,80,False,yes


In [2]:
# divide the temperature into 3 intervals: ...< 70, 70 <= ... < 80, 80 <= ... 
# and represent these intervals with 1, 2, 3.
train_df.loc[train_df['temperature'] < 70, 'temperature'] = 1
train_df.loc[(train_df['temperature'] >= 70) & (train_df['temperature'] < 80), 'temperature'] = 2
train_df.loc[train_df['temperature'] >= 80, 'temperature'] = 3

# divide the humidity into 4 intervals: ... < 70, 70  <= ... < 80, 80 <= ... < 90, 90 <= ...
# and represent these intervals with 1, 2, 3, 4

train_df.loc[train_df['humidity'] < 70, 'humidity'] = 1
train_df.loc[(train_df['humidity'] >= 70) & (train_df['humidity'] < 80), 'humidity'] = 2
train_df.loc[(train_df['humidity'] >= 80) & (train_df['humidity'] < 90), 'humidity'] = 3
train_df.loc[train_df['humidity'] >= 90, 'humidity'] = 4

train_df

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,3,3,False,no
1,sunny,3,4,True,no
2,overcast,3,3,False,yes
3,rainy,2,4,False,yes
4,rainy,1,3,False,yes
5,rainy,1,2,True,no
6,overcast,1,1,True,yes
7,sunny,2,4,False,no
8,sunny,1,2,False,yes
9,rainy,2,3,False,yes


In [3]:
test = {'outlook': ['overcast'], 'temperature': [60], 'humidity': [62], 'windy': [False]}
test_df = pd.DataFrame(test)
test_df

Unnamed: 0,outlook,temperature,humidity,windy
0,overcast,60,62,False


In [4]:
# map the temperature and humidity to the coresponding intervals.
test_df.loc[test_df['temperature'] < 70, 'temperature'] = 1
test_df.loc[(test_df['temperature'] >= 70) & (test_df['temperature'] < 80), 'temperature'] = 2
test_df.loc[test_df['temperature'] >= 80, 'temperature'] = 3

test_df.loc[test_df['humidity'] < 70, 'humidity'] = 1
test_df.loc[(test_df['humidity'] >= 70) & (test_df['humidity'] < 80), 'humidity'] = 2
test_df.loc[(test_df['humidity'] >= 80) & (test_df['humidity'] < 90), 'humidity'] = 3
test_df.loc[test_df['humidity'] >= 90, 'humidity'] = 4

test_df

Unnamed: 0,outlook,temperature,humidity,windy
0,overcast,1,1,False


In [5]:
class DecisionTree:
    def __init__(self, data, criterion='information gain'):
        '''
        criterion: information gain, gain ratio, Gini index
        '''
        self.data = data
        self.criterion = criterion
        self.tree = self.createTree(data)
    
    def compute_criterion(self, data):
        res = 0.0
        if self.criterion == "Gini index":
            res = 1.0
        for cls in data[data.columns[-1]].unique().tolist():
            prob = data[data[data.columns[-1]] == cls].shape[0] * 1.0 / data.shape[0]
            if self.criterion == "Gini index":
                res -= prob * prob
            else:
                res += prob * np.log2(prob)
        return res
    
    def information_gain(self, data):
        entropy, conditional_entropy, gini_index = [], [], []
        for i in range(data.shape[1] - 1):
            info_entropy, cond_entropy, gini = 0.0, 0.0, 0.0
            for feat in data[data.columns[i]].unique().tolist():
                split_data = data[data[data.columns[i]] == feat]
                prob = split_data.shape[0] * 1.0 / data.shape[0]
                info_entropy -= prob * np.log2(prob)
                cond_entropy -= prob * self.compute_criterion(split_data)
                gini += prob * self.compute_criterion(split_data)
            entropy.append(info_entropy)
            conditional_entropy.append(cond_entropy)
            gini_index.append(gini)
        return entropy, conditional_entropy, gini_index
    
    def select_feat(self, data):
        entropy, conditional_entropy, gini_index = self.information_gain(data)
        origin_entropy = - self.compute_criterion(data)
        info_gain = origin_entropy - np.array(conditional_entropy)
        if self.criterion == "information gain":
            return np.argmax(info_gain)
        if self.criterion == "gain ratio":
            best_feat, max_ratio = 0, 0.0
            for i in range(len(entropy)):
                gain_ratio = info_gain[i] / (entropy[i] + 1e-8)
                if gain_ratio > max_ratio:
                    best_feat, max_ratio = i, gain_ratio
            return best_feat
        return np.argmax(gini_index)
    
    def createTree(self, data):
        if len(data) == 0:
            return None
        if len(set(data[data.columns[-1]].values)) == 1:
            return data.iloc[0, -1]
        best_feat = self.select_feat(data)
        feat_name = data.columns[best_feat]
        tree = {feat_name:{}}
        for f in data[feat_name].unique().tolist():
            split_data = data[data[feat_name] == f].copy()
            split_data.drop(feat_name, axis=1, inplace=True)
            tree[feat_name][f] = self.createTree(split_data)
        return tree
    
    def classify(self, data):
        def predict(tree, cur_data):
            if type(tree) == type('str'):
                return tree
            rt = list(tree.keys())[0]
            for key, value in tree[rt].items():
                if key == cur_data[rt]:
                    return predict(tree[rt][key], cur_data)
        label = []
        for i in range(data.shape[0]):
            label.append(predict(self.tree, data.iloc[i,]))
        return label

In [6]:
# information gain
DT_info_gain = DecisionTree(train_df)
print(DT_info_gain.tree)
DT_info_gain.classify(test_df)

{'outlook': {'sunny': {'humidity': {3: 'no', 4: 'no', 2: 'yes'}}, 'overcast': 'yes', 'rainy': {'windy': {False: 'yes', True: 'no'}}}}


['yes']

In [7]:
# gain ratio
DT_gain_ratio = DecisionTree(train_df, 'gain ratio')
print(DT_gain_ratio.tree)
DT_gain_ratio.classify(test_df)

{'outlook': {'sunny': {'humidity': {3: 'no', 4: 'no', 2: 'yes'}}, 'overcast': 'yes', 'rainy': {'windy': {False: 'yes', True: 'no'}}}}


['yes']

In [8]:
# Gini index
DT_gini_index = DecisionTree(train_df, 'Gini index')
print(DT_gini_index.tree)
DT_gini_index.classify(test_df)

{'temperature': {3: {'windy': {False: {'humidity': {3: {'outlook': {'sunny': 'no', 'overcast': 'yes'}}, 2: 'yes'}}, True: 'no'}}, 2: {'windy': {False: {'humidity': {4: {'outlook': {'rainy': 'yes', 'sunny': 'no'}}, 3: 'yes'}}, True: {'humidity': {2: 'yes', 4: {'outlook': {'overcast': 'yes', 'rainy': 'no'}}}}}}, 1: {'outlook': {'rainy': {'humidity': {3: 'yes', 2: 'no'}}, 'overcast': 'yes', 'sunny': 'yes'}}}}


['yes']