In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Load the dataset into a DataFrame; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("Why are employees leaving.csv")
# Preview the first rows (rendered as this cell's output).
df.head()

Unnamed: 0,ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Division,salary
0,1,0.38,0.53,2,157,3,0,1,0,sales,low
1,2,0.8,0.86,5,262,6,0,1,0,sales,medium
2,3,0.11,0.88,7,272,4,0,1,0,sales,medium
3,4,0.72,0.87,5,223,5,0,1,0,sales,low
4,5,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
# Calculate entropy based on the 'left' column
import math

def calculate_entropy(data, target_column):
    """
    Shannon entropy, in bits, of the class distribution in one column.

    H = -sum(p_c * log2(p_c)) over every class c, where p_c is the number of
    rows in class c divided by the total number of rows in `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to measure.
    target_column : str
        Name of the column whose distribution is measured.

    Returns
    -------
    int or float
        The entropy; the integer 0 when `data` has no rows.
    """
    class_sizes = data[target_column].value_counts().tolist()
    n_rows = len(data)

    bits = 0
    for class_size in class_sizes:
        share = class_size / n_rows
        if share <= 0:
            # A class with no rows contributes nothing, and log2(0) is undefined.
            continue
        bits -= share * math.log2(share)

    return bits

# Baseline entropy of the 'left' target column over the whole DataFrame,
# i.e. before any split on a feature.
entropy_left = calculate_entropy(df, 'left')
# print(f"\n Baseline Entropy of the dataset based on 'left' column: {entropy_left:.4f}")



In [90]:
# INFORMATION GAIN CALCULATION FOR DISCRETE FEATURES (Example for a discrete feature)

def calculate_information_gain_discrete(data, feature_column, target_column):
    """
    Information gain from partitioning `data` on a discrete feature.

    The rows are grouped by each distinct value of `feature_column`; the gain
    is the entropy of `target_column` over all rows minus the size-weighted
    average entropy of `target_column` within the groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    feature_column : str
        Column whose distinct values define the partitions.
    target_column : str
        Column whose entropy is measured.

    Returns
    -------
    int or float
        Parent entropy minus the weighted entropy of the partitions.
    """
    parent_entropy = calculate_entropy(data, target_column)

    feature = data[feature_column]
    n_rows = len(data)

    # Accumulate each partition's entropy scaled by its share of the rows.
    children_entropy = 0
    for level in feature.unique():
        part = data[feature == level]
        share = len(part) / n_rows
        children_entropy += share * calculate_entropy(part, target_column)

    return parent_entropy - children_entropy

In [100]:
# Information Gain for Continuous Features

def calculate_information_gain_continuous(data, feature_column, target_column):
    """
    Find the binary threshold on a continuous feature with the highest
    information gain.

    Candidate thresholds are the midpoints between consecutive sorted distinct
    values of `feature_column`. For each threshold `t`, the rows of `data` are
    partitioned into `feature >= t` and `feature < t`, and the gain is the
    entropy of `target_column` over `data` minus the size-weighted entropy of
    the two partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. All partitions are taken from this frame, so the
        function can be applied to any subset of the dataset.
    feature_column : str
        Numeric column to threshold.
    target_column : str
        Column whose entropy is measured.

    Returns
    -------
    list
        `[selected_split, max_information_gain]`. When several thresholds tie,
        the lowest one is kept. When no threshold yields a gain above zero the
        first candidate is returned with gain 0. When the feature has fewer
        than two distinct non-missing values no split exists and `[None, 0]`
        is returned.
    """
    original_entropy = calculate_entropy(data, target_column)
    total_samples = len(data)

    feature = data[feature_column]

    # Missing values are excluded from the candidates: a NaN threshold makes
    # both comparisons below False for every row, which would empty both
    # partitions and report the full parent entropy as a spurious gain.
    sorted_values = np.sort(feature.dropna().unique())

    # With fewer than two distinct values there is nothing to split on.
    if len(sorted_values) < 2:
        return [None, 0]

    # Midpoints between each pair of neighbouring distinct values.
    feature_splits = (sorted_values[:-1] + sorted_values[1:]) / 2

    selected_split = feature_splits[0]
    max_information_gain = 0

    for split in feature_splits:
        # Partition the rows of `data` (not the notebook-level DataFrame) so
        # that sizes and entropies are consistent with `total_samples` and
        # `original_entropy` computed above.
        subset1 = data[feature >= split]
        subset2 = data[feature < split]

        weight1 = len(subset1) / total_samples
        weight2 = len(subset2) / total_samples

        weighted_entropy = (
            weight1 * calculate_entropy(subset1, target_column)
            + weight2 * calculate_entropy(subset2, target_column)
        )
        information_gain = original_entropy - weighted_entropy

        # Strict comparison keeps the first (lowest) threshold on ties.
        if information_gain > max_information_gain:
            max_information_gain = information_gain
            selected_split = split

    return [selected_split, max_information_gain]

# calculate_information_gain_continuous(df, 'satisfaction_level', 'left')


In [101]:
# df.describe()
def calculate_max_information_gain(df):
    """
    Compute the information gain of every feature against 'left' and report
    the largest one.

    Continuous features are scored by their best threshold split, discrete
    features by a split on every distinct value. The position printed refers
    to the order of `feature_plan` below.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate; must contain the listed feature columns and 'left'.

    Returns
    -------
    int or float
        The largest information gain among the features.
    """
    target = 'left'

    # (column name, scored as continuous?) in reporting order.
    feature_plan = [
        ('satisfaction_level', True),
        ('last_evaluation', True),
        ('salary', False),
        ('Division', False),
        ('average_montly_hours', True),
        ('time_spend_company', True),
        ('number_project', True),
        ('Work_accident', False),
        ('promotion_last_5years', False),
    ]

    information_gains = []
    for column, is_continuous in feature_plan:
        if is_continuous:
            # The continuous scorer returns [threshold, gain]; keep the gain.
            gain = calculate_information_gain_continuous(df, column, target)[1]
        else:
            gain = calculate_information_gain_discrete(df, column, target)
        information_gains.append(gain)

    best_gain = max(information_gains)
    best_position = information_gains.index(best_gain)
    print(f"\nMax information gain is {best_gain:.4f} and feature is at index {best_position} in the list")
    return best_gain

In [None]:
# STEP 1 => ROOT NODE. The data set is the whole data frame.
# Prints the largest information gain and the list position of its feature.
calculate_max_information_gain(df)

# Print the selected threshold for 'satisfaction_level' (element [0] of the
# [split, gain] pair); this recomputes the split search for that column.
print(f"{calculate_information_gain_continuous(df, 'satisfaction_level', 'left')[0]}")


Max information gain is 0.1926 and feature is at index 0 in the list
0.46499999999999997
