In [1]:
import math
import pandas

In [2]:
def calculate_entropy(feature):
    """Return the base-2 Shannon entropy of the values in a pandas Series.

    Args:
        feature: pandas Series of category labels (for example a target
            column, or the target column restricted to a subset of rows).

    Returns:
        The entropy in bits as a non-negative float. An empty Series or a
        Series holding a single distinct value yields 0.0.
    """
    total_count = feature.shape[0]
    entropy = 0.0
    for category in feature.unique():
        cat_count = feature[feature == category].count()
        # Categories that match no row contribute nothing (and would make
        # log2 fail on a zero probability).
        if cat_count > 0:
            probability = cat_count / total_count
            # Subtract each term instead of negating the final sum: negating
            # a zero sum produced -0.0 for pure columns.
            entropy -= probability * math.log2(probability)
    return entropy

In [3]:
def calculate_entropy_per_feature(df, feature_name, target_class):
    """Compute the entropy of the target column within each value of a feature.

    Args:
        df: pandas DataFrame holding both the feature and the target column.
        feature_name: name of the column whose distinct values define the groups.
        target_class: name of the column whose entropy is measured per group.

    Returns:
        A dict mapping each distinct value of ``df[feature_name]`` to the
        entropy (via ``calculate_entropy``) of ``target_class`` over the rows
        carrying that value.
    """
    column = df[feature_name]
    return {
        value: calculate_entropy(df[column == value][target_class])
        for value in column.unique()
    }

In [40]:
def print_feature_entropy(entropies, feature_name=None):
    """Print one ``Entropy(<feature> = <value>)`` line per feature value.

    Args:
        entropies: mapping from feature value to its entropy, such as the
            dict returned by ``calculate_entropy_per_feature``.
        feature_name: label of the feature shown in each line. When omitted,
            the notebook-level variable ``feature`` is used, which is what
            this function always relied on implicitly.

    Raises:
        NameError: if ``feature_name`` is omitted and no global ``feature``
            exists (for example on a fresh kernel outside the feature loop).
    """
    if feature_name is None:
        # Backward-compatible fallback: existing callers pass only the dict
        # and depend on the enclosing loop variable named `feature`.
        feature_name = feature
    for value, value_entropy in entropies.items():
        print("Entropy({} = {})\t=\t{}".format(feature_name, value, round(value_entropy, 4)))

In [29]:
def calculate_information_gain(overall_entropy, feature, entropy_per_feature):
    """Return the information gain obtained by splitting on one feature.

    Args:
        overall_entropy: entropy of the target over the whole dataset.
        feature: pandas Series holding the feature's value for every row.
        entropy_per_feature: mapping from feature value to the target entropy
            among the rows carrying that value.

    Returns:
        ``overall_entropy`` minus the sum of the per-value entropies, each
        weighted by the fraction of rows in ``feature`` equal to that value.
    """
    weighted_entropy = 0
    for value, value_entropy in entropy_per_feature.items():
        rows_with_value = feature[feature == value]
        weight = rows_with_value.shape[0] / feature.shape[0]
        weighted_entropy += weight * value_entropy
    return overall_entropy - weighted_entropy

# Question 1

In [6]:
# Load the Question 1 table from a space-separated text file; the path is
# relative, so it depends on the directory the notebook is run from.
df1 = pandas.read_csv('Tutorials/tuto2_table1.txt', sep=" ")

# Print the loaded table.
print(df1)

    Name    Hair   Height    Build Lotion     Result
1  Sarah  blonde  average    light     no  sunburned
2   Dana  blonde     tall  average    yes       none
3   Alex   brown    short  average    yes       none
4  Annie  blonde    short  average     no  sunburned
5  Emily     red  average    heavy     no  sunburned
6   Pete   brown     tall    heavy     no       none
7   John   brown  average    heavy     no       none
8  Katie   brown    short    light    yes       none


## A/ Dataset entropy

In [7]:
# Entropy of the target column over the full dataset; reused later as the
# baseline when computing information gain.
q1_overall_entropy = calculate_entropy(df1['Result'])

# Report the value rounded to 4 decimals.
print("Question:\t What is the entropy of this dataset with respect to the target classlabel Result?")
print("Answer\t\t", round(q1_overall_entropy, 4))

Question:	 What is the entropy of this dataset with respect to the target classlabel Result?
Answer		 0.9544


## B/ Decision tree & feature selection

### Step 1: Calculate overall dataset entropy (cf Q1/A/)

### Step 2: Calculate entropy for each feature

In [44]:
# Candidate split features: every column except "Name" and the target
# column "Result".
list_features = list(df1.columns.values)
list_features.remove("Name")
list_features.remove("Result")

target_class = 'Result'

# Maps feature name -> {feature value -> entropy of the target for that value}.
entropy_per_feature = {}

# NOTE: the loop variable must stay named `feature`; print_feature_entropy
# reads the notebook-level name `feature` to label its output lines.
for feature in list_features:
    curr_entropy = calculate_entropy_per_feature(df1, feature, target_class)
    entropy_per_feature[feature] = curr_entropy
    print("Feature:", feature)
    print()
    print_feature_entropy(curr_entropy)
    print()
    print()

Feature: Hair

Entropy(Hair = blonde)	=	0.9183
Entropy(Hair = red)	=	-0.0
Entropy(Hair = brown)	=	-0.0


Feature: Height

Entropy(Height = average)	=	0.9183
Entropy(Height = tall)	=	-0.0
Entropy(Height = short)	=	0.9183


Feature: Build

Entropy(Build = heavy)	=	0.9183
Entropy(Build = average)	=	0.9183
Entropy(Build = light)	=	1.0


Feature: Lotion

Entropy(Lotion = no)	=	0.971
Entropy(Lotion = yes)	=	-0.0




### Step 3: Calculate Information Gain for each feature

In [50]:
# Maps feature name -> information gain of splitting the dataset on it.
information_gain_per_feature = {}

# Uses the dataset entropy (q1_overall_entropy) and the per-value entropies
# (entropy_per_feature) computed in earlier cells.
for feature in list_features:
    curr_information_gain = calculate_information_gain(q1_overall_entropy, 
                                                       df1[feature],
                                                       entropy_per_feature[feature])
    information_gain_per_feature[feature] = curr_information_gain
    print("Feature:\t\t", feature)
    print("Information gain:\t", round(curr_information_gain, 4))
    print()

Feature:		 Hair
Information gain:	 0.6101

Feature:		 Height
Information gain:	 0.2657

Feature:		 Build
Information gain:	 0.0157

Feature:		 Lotion
Information gain:	 0.3476



In [54]:
# Print the Q1/B question text followed by the written answer; the answer
# refers to the information-gain values printed by the previous cell.
print("Question:\nConstruct the decision tree that would be built with Information Gain for this dataset. Show your work for selection of the root feature in your tree.")

print()

print("Answer:\n\"Hair\" will be selected as it is the feature with the highest IG value. It perfectly classifies the data for Hair=brown & Hair=red. It will be used to split the root node of the tree. The case for Hair=blonde contains (2 sunburned, 1 none). We Can split these into pure child nodes using feature \"Lotion\".")

Question:
Construct the decision tree that would be built with Information Gain for this dataset. Show your work for selection of the root feature in your tree.

Answer:
"Hair" will be selected as it is the feature with the highest IG value. It perfectly classifies the data for Hair=brown & Hair=red. It will be used to split the root node of the tree. The case for Hair=blonde contains (2 sunburned, 1 none). We Can split these into pure child nodes using feature "Lotion".


## C/ New element classification

In [56]:
# hard-coded tree
def decision_tree(hair_color, applied_lotion):
    """Classify a person as sunburned or not with the hand-built tree.

    The tree splits on hair colour first, then on lotion for blonde hair:
    blonde without lotion -> sunburned, blonde with lotion -> not sunburned,
    red -> sunburned, anything else (e.g. brown) -> not sunburned.

    Args:
        hair_color: hair colour label; compared case-insensitively, and both
            "blonde" and "blond" are accepted.
        applied_lotion: whether lotion was used, either as a boolean or as a
            "yes"/"no" style string like the Lotion column of the table.

    Returns:
        True if the person is predicted to be sunburned, False otherwise.
    """
    hair = str(hair_color).strip().lower()

    if isinstance(applied_lotion, str):
        # A plain truthiness test would treat the string "no" as True.
        used_lotion = applied_lotion.strip().lower() in ("yes", "y", "true")
    else:
        used_lotion = bool(applied_lotion)

    # The previous exact match on "Blonde" matched neither the dataset's
    # "blonde" nor the example's "Blond", so this branch was unreachable.
    if hair in ("blonde", "blond"):
        return not used_lotion
    # The only red-haired row in the table is sunburned, so red is a
    # "sunburned" leaf.
    if hair == "red":
        return True
    return False

# Example X to classify. Note that "Hair" is spelled "Blond" here while the
# table uses "blonde", and "Lotion" is a boolean rather than "yes"/"no".
new_person = {"Name": "Dana", 
              "Hair": "Blond", 
              "Height": "tall", 
              "Build":"average", 
              "Lotion":True}

# Echo every attribute of the example.
for data in new_person:
    print("{}: {}".format(data, new_person[data]))

Height: tall
Hair: Blond
Build: average
Name: Dana
Lotion: True


In [60]:
# Classify the example with the hard-coded tree; only the Hair and Lotion
# attributes are passed to it.
print("Question:\tUsing your decision tree from (b), how would you classify the following example X?")
sunburned = decision_tree(new_person['Hair'], new_person['Lotion'])
# decision_tree returns a truthy value when the prediction is "sunburned".
if sunburned:   
    print("Answer:\t\t{}".format("sunburned"))
else:
    print("Answer:\t\t{}".format("not sunburned"))

Question:	Using your decision tree from (b), how would you classify the following example X?
Answer:		not sunburned


# Question 2