In [4]:
import numpy as np

def entropy(p):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    p : iterable of float
        Probabilities of the outcomes.

    Returns
    -------
    numpy.float64
        ``-sum(p_i * log2(p_i))`` taken over the non-zero entries of ``p``.
    """
    probs = np.asarray(list(p), dtype=float)
    # An outcome with probability 0 contributes nothing (p*log2(p) -> 0 as
    # p -> 0); evaluating it directly would produce 0 * -inf = nan.
    probs = probs[probs != 0]
    H = -(probs * np.log2(probs)).sum()
    return H
    
# Print the entropy of an even 50/50 split, then of a skewed 90/10 split.
for p in ([.5, .5], [.9, .1]):
    print(entropy(p))

1.0
0.4689955935892812


In [10]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.tree import export_graphviz
import pydotplus

# Read the tennis data. It is a tab-separated text file without a header row,
# so the delimiter and the column names have to be supplied explicitly.
data = pd.read_csv('Notebooks/Datasets/tennis.txt', delimiter="\t", header=None, names=['Outlook', 'Temp', 'Humidity', 'Wind', 'Play'])
print(data)

# Encode the data so it can be used with the decision tree: every column is
# label-encoded independently, turning each category into an integer code.
data_encoded = data.apply(preprocessing.LabelEncoder().fit_transform)
print(data_encoded)

# Single source of truth for the feature columns, shared by training and by
# the exported graph so the node labels always match the training columns.
feature_cols = ['Outlook', 'Temp', 'Humidity', 'Wind']

# Create the decision tree classifier with the entropy criterion.
# random_state is fixed because scikit-learn permutes the features at every
# split; without it, tied splits can yield a different tree on each run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

# Provide the feature array and the (single-column) target array,
# and train the model.
clf.fit(data_encoded[feature_cols], data_encoded['Play'])

# Export the fitted tree as DOT source that can be plotted.
dot_data = export_graphviz(clf, out_file=None, feature_names=feature_cols)

# Build the graph object from the DOT source.
graph = pydotplus.graph_from_dot_data(dot_data)

     Outlook  Temp Humidity    Wind Play
1      Sunny   Hot     High    Weak   No
2      Sunny   Hot     High  Strong   No
3   Overcast   Hot     High    Weak  Yes
4       Rain  Mild     High    Weak  Yes
5       Rain  Cool   Normal    Weak  Yes
6       Rain  Cool   Normal  Strong   No
7   Overcast  Cool   Normal  Strong  Yes
8      Sunny  Mild     High    Weak   No
9      Sunny  Cool   Normal    Weak  Yes
10      Rain  Mild   Normal    Weak  Yes
11     Sunny  Mild   Normal  Strong  Yes
12  Overcast  Mild     High  Strong  Yes
13  Overcast   Hot   Normal    Weak  Yes
14      Rain  Mild     High  Strong   No
    Outlook  Temp  Humidity  Wind  Play
1         2     1         0     1     0
2         2     1         0     0     0
3         0     1         0     1     1
4         1     2         0     1     1
5         1     0         1     1     1
6         1     0         1     0     0
7         0     0         1     0     1
8         2     2         0     1     0
9         2     0        

In [12]:
import pandas as pd
# Load the tab-separated tennis table; the file has no header row, so the
# column names are passed in explicitly.
data = pd.read_csv(
    'notebooks/datasets/tennis.txt',
    sep="\t",
    header=None,
    names=['Outlook', 'Temp', 'Humidity', 'Wind', 'Play'],
)
print(data)

     Outlook  Temp Humidity    Wind Play
1      Sunny   Hot     High    Weak   No
2      Sunny   Hot     High  Strong   No
3   Overcast   Hot     High    Weak  Yes
4       Rain  Mild     High    Weak  Yes
5       Rain  Cool   Normal    Weak  Yes
6       Rain  Cool   Normal  Strong   No
7   Overcast  Cool   Normal  Strong  Yes
8      Sunny  Mild     High    Weak   No
9      Sunny  Cool   Normal    Weak  Yes
10      Rain  Mild   Normal    Weak  Yes
11     Sunny  Mild   Normal  Strong  Yes
12  Overcast  Mild     High  Strong  Yes
13  Overcast   Hot   Normal    Weak  Yes
14      Rain  Mild     High  Strong   No


In [7]:
def info_gain(df, feature, decision):
    """Compute the information gain between a feature and the decision.

    The gain is ``H(decision) - sum_v P(feature = v) * H(decision | feature = v)``.
    The value is printed (as before) and also returned so callers can use it.

    Relies on the notebook helpers ``entropy`` and ``conditional_prob``, which
    must be defined before this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset.
    feature : str
        Name of the feature column in ``df``.
    decision : str
        Name of the target column in ``df``.

    Returns
    -------
    float
        Information gain of ``feature`` with respect to ``decision``.
    """
    # Entropy of the decision column on its own.
    prob_decision = df[decision].value_counts(normalize=True).to_numpy()
    entropy_decision = entropy(prob_decision)

    # Probability of each feature value,
    # e.g. for Wind: the probabilities of Strong and Weak.
    dict_prob_feature = df[feature].value_counts(normalize=True).to_dict()

    # Weighted sum of the decision's entropy under each value of the feature.
    S = 0
    for condition in df[feature].unique():
        # Distribution of the decision restricted to rows with this value.
        prob_condition = list(conditional_prob(df, feature, decision, condition).values())
        S = S + dict_prob_feature[condition] * entropy(prob_condition)

    gain = entropy_decision - S
    print(gain)
    return gain

In [13]:
# hint: helper function for entropy
import numpy as np

def entropy(p):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    p : iterable of float
        Probabilities of the outcomes.

    Returns
    -------
    numpy.float64
        ``-sum(p_i * log2(p_i))`` taken over the non-zero entries of ``p``.
    """
    probs = np.asarray(list(p), dtype=float)
    # An outcome with probability 0 contributes nothing (p*log2(p) -> 0 as
    # p -> 0); evaluating it directly would produce 0 * -inf = nan.
    probs = probs[probs != 0]
    H = -(probs * np.log2(probs)).sum()
    return H

In [14]:
# hint: helper function that takes a dataset (df), one of its features (c1),
# the decision (c2), and a condition of the feature (condition) as input, and
# outputs the conditional probabilities of the decision given that condition
def conditional_prob(df, c1, c2, condition):
    """Return the distribution of column ``c2`` among rows where ``c1 == condition``.

    The result is a dict mapping each value of ``c2`` seen in those rows (in
    order of first appearance) to its relative frequency within them. When no
    row matches ``condition`` the dict is empty.
    """
    # Values of c2 restricted to the rows that satisfy the condition.
    outcomes = df.loc[df[c1] == condition, c2]
    total = len(outcomes)

    return {
        value: len(outcomes[outcomes == value]) / total
        for value in outcomes.unique()
    }

# What are the probabilities of Play given each Wind level?
# Weak is printed first, then Strong.
for wind_level in ('Weak', 'Strong'):
    print(conditional_prob(data, 'Wind', 'Play', wind_level))

{'No': 0.25, 'Yes': 0.75}
{'No': 0.5, 'Yes': 0.5}


In [16]:
# Information gain of each candidate feature with respect to Play,
# in the order Wind, Humidity, Temp, Outlook.
for feature in ('Wind', 'Humidity', 'Temp', 'Outlook'):
    info_gain(data, feature, 'Play')

0.04812703040826949
0.15183550136234159
0.02922256565895487
0.24674981977443933


In [17]:
# For every column, print its entropy and then the information gain of the
# column with itself, so the two numbers can be compared side by side.
for i in ['Outlook', 'Temp', 'Humidity', 'Wind', 'Play']:
    counts = list(data[i].value_counts().to_dict().values())
    total = sum(counts)
    p = [m / total for m in counts]
    print(entropy(p))
    info_gain(data, i, i)

1.5774062828523454
1.5774062828523454
1.5566567074628228
1.5566567074628228
1.0
1.0
0.9852281360342515
0.9852281360342515
0.9402859586706311
0.9402859586706311


In [19]:
# Number of rows for each Temp value, as a plain dict.
data['Temp'].value_counts().to_dict()

{'Mild': 6, 'Hot': 4, 'Cool': 4}

In [20]:
# Just the per-value row counts for Temp, as a list.
list(data['Temp'].value_counts().to_dict().values())

[6, 4, 4]

In [21]:
# Sum of the per-value counts for Temp (the normalising constant used below).
sum(data['Temp'].value_counts().to_dict().values())

14

In [22]:
# Turn the Temp value counts into probabilities and print their entropy.
temp_counts = list(data['Temp'].value_counts().to_dict().values())
temp_total = sum(temp_counts)
p = [m / temp_total for m in temp_counts]

print(entropy(p))

1.5566567074628228
