# Example: Calculating Entropy, Gini Impurity, and Information Gain

In [53]:
# Import Libraries
import pandas as pd
import math


In [54]:
# Create an example dataset, written as one (fruit, sweetness, color) record per row
columns = ['Fruit', 'Sweetness', 'Color']
records = [
    ('Apple', 'High', 'Red'),
    ('Apple', 'Low', 'Red'),
    ('Orange', 'Low', 'Orange'),
    ('Orange', 'High', 'Orange'),
    ('Apple', 'High', 'Red'),
    ('Orange', 'Low', 'Orange'),
    ('Apple', 'High', 'Red'),
    ('Orange', 'Low', 'Orange'),
    ('Apple', 'High', 'Red'),
    ('Orange', 'Low', 'Orange'),
    ('Apple', 'High', 'Red'),
    ('Apple', 'Low', 'Red'),
    ('Orange', 'High', 'Orange'),
    ('Orange', 'Low', 'Orange'),
    ('Apple', 'High', 'Red'),
]

# Transpose the records into a column-name -> list-of-values mapping
data = {column: list(values) for column, values in zip(columns, zip(*records))}

df = pd.DataFrame(data)
print(df)

     Fruit Sweetness   Color
0    Apple      High     Red
1    Apple       Low     Red
2   Orange       Low  Orange
3   Orange      High  Orange
4    Apple      High     Red
5   Orange       Low  Orange
6    Apple      High     Red
7   Orange       Low  Orange
8    Apple      High     Red
9   Orange       Low  Orange
10   Apple      High     Red
11   Apple       Low     Red
12  Orange      High  Orange
13  Orange       Low  Orange
14   Apple      High     Red


### We want to split the dataset based on the feature "Sweetness."
### First, we count the instances of each class in the Fruit column.

In [55]:
# Count how many times each class occurs in the Fruit column
df.value_counts(subset='Fruit')

Fruit
Apple     8
Orange    7
Name: count, dtype: int64

In [56]:
# Tally the Fruit column once, then read the apple and orange totals from that tally
fruit_counts = df['Fruit'].value_counts()
apple_count = fruit_counts['Apple']
orange_count = fruit_counts['Orange']
total_count = apple_count + orange_count

# Report each total with its label
for label, value in (
    ("Apple count: ", apple_count),
    ("Orange count: ", orange_count),
    ("Total count: ", total_count),
):
    print(label, value)

Apple count:  8
Orange count:  7
Total count:  15


In [57]:
# Break each fruit class down further by the Sweetness column (High / Low)
is_apple = df['Fruit'] == 'Apple'
is_orange = df['Fruit'] == 'Orange'
is_high = df['Sweetness'] == 'High'
is_low = df['Sweetness'] == 'Low'

# Summing a combined boolean mask counts the rows where both conditions hold
apple_high = (is_apple & is_high).sum()
apple_low = (is_apple & is_low).sum()
orange_high = (is_orange & is_high).sum()
orange_low = (is_orange & is_low).sum()

print("Apple high: ", apple_high)
print("Apple low: ", apple_low)
print("Orange high: ", orange_high)
print("Orange low: ", orange_low)

Apple high:  6
Apple low:  2
Orange high:  2
Orange low:  5


In [68]:
# Same result through the groupby function:
# one size() per (Fruit, Sweetness) pair, indexed by that pair
counts = df.groupby(['Fruit', 'Sweetness']).size()

apple_high, apple_low = counts.loc[('Apple', 'High')], counts.loc[('Apple', 'Low')]
orange_high, orange_low = counts.loc[('Orange', 'High')], counts.loc[('Orange', 'Low')]

print("Apple high:", apple_high)
print("Apple low:", apple_low)
print("Orange high:", orange_high)
print("Orange low:", orange_low)
print('------------------------')
print(counts)

Apple high: 6
Apple low: 2
Orange high: 2
Orange low: 5
------------------------
Fruit   Sweetness
Apple   High         6
        Low          2
Orange  High         2
        Low          5
dtype: int64


In [59]:
# Class proportions: each class count expressed as a fraction of the total
p_apple, p_orange = (count / total_count for count in (apple_count, orange_count))

# Print the proportions
print("Proportion of A: ", p_apple)
print("Proportion of B: ", p_orange)

Proportion of A:  0.5333333333333333
Proportion of B:  0.4666666666666667


In [60]:
# Entropy calculation
# Entropy is a measure of uncertainty: H = -sum(p * log2(p)) over the classes
apple_term = p_apple * math.log2(p_apple)
orange_term = p_orange * math.log2(p_orange)
entropy = -(apple_term + orange_term)
print("Entropy: ", entropy)

Entropy:  0.9967916319816366


In [61]:
# Gini impurity
# Gini impurity is a measure of misclassification: G = 1 - sum(p ** 2) over the classes
apple_squared = p_apple ** 2
orange_squared = p_orange ** 2
# Subtract in the same left-to-right order so the floating-point result is unchanged
gini = 1 - apple_squared - orange_squared
print("Gini Impurity: ", gini)

Gini Impurity:  0.4977777777777777


In [64]:
# Information Gain
# Splitting on "Sweetness" partitions the rows into a High subset and a Low subset.
# Information gain is the entropy of the Fruit label before the split minus the
# size-weighted entropy of the Fruit label inside each subset:
#   Subset 1 (Sweetness == High): apple_high apples, orange_high oranges
#   Subset 2 (Sweetness == Low):  apple_low apples,  orange_low oranges
# NOTE: the subsets must be grouped by the split feature (Sweetness), not by the
# class label (Fruit); grouping by Fruit would measure the entropy of Sweetness
# within each fruit, which is a different quantity in general.

def subset_entropy(class_counts):
    """Return the Shannon entropy, in bits, of a subset given its per-class counts.

    Parameters:
        class_counts: iterable of non-negative counts, one per class.

    Returns:
        The entropy -sum(p * log2(p)) over classes with a non-zero count.
        Classes with a zero count contribute nothing, and an empty subset
        (all counts zero) has entropy 0.0, so no log2(0) or division by zero occurs.
    """
    class_counts = list(class_counts)
    subset_size = sum(class_counts)
    if subset_size == 0:
        return 0.0
    proportions = [count / subset_size for count in class_counts if count > 0]
    return sum(-p * math.log2(p) for p in proportions)

# Size of each subset produced by the Sweetness split
high_total = apple_high + orange_high
low_total = apple_low + orange_low

# Entropy of the Fruit label within each Sweetness subset
entropy_1 = subset_entropy([apple_high, orange_high])  # Sweetness == High
entropy_2 = subset_entropy([apple_low, orange_low])    # Sweetness == Low

# Calculating information gain: parent entropy minus the weighted child entropy
weighted_entropy = (high_total / total_count * entropy_1
                    + low_total / total_count * entropy_2)
info_gain = entropy - weighted_entropy
print("Information Gain: ", info_gain)

Information Gain:  0.16132036693900464


1. **Entropy:** The entropy is approximately 0.9967916319816366, very close to 1, the maximum possible value for two classes. This means the dataset is highly impure, or uncertain, in terms of its class distribution: the classes ("Apple" and "Orange") are almost evenly distributed within the dataset (8 vs. 7 instances).

2. **Gini Impurity:** The Gini impurity is approximately 0.4977777777777777. It measures the level of impurity in the class distribution: the probability that a randomly chosen instance would be misclassified if it were labeled at random according to that distribution. For two classes the maximum is 0.5, so a value of approximately 0.4977777777777777 indicates a high level of impurity, consistent with the nearly even class split.

3. **Information Gain:** The information gain is approximately 0.16132036693900464. It represents the amount of information gained, i.e. the reduction in entropy, obtained by splitting the dataset on a particular feature. Here the chosen feature is "Sweetness": splitting on it lowers the weighted entropy from about 0.997 to about 0.835. This is a modest reduction in impurity: the split helps, but each subset still contains a mix of both fruits.