In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
df = pd.read_csv("Why are employees leaving.csv")  # Load the CSV (path is relative to the notebook's working directory) into a DataFrame
df.head()  # Last expression of the cell: renders a preview of the first rows

Unnamed: 0,ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Division,salary
0,1,0.38,0.53,2,157,3,0,1,0,sales,low
1,2,0.8,0.86,5,262,6,0,1,0,sales,medium
2,3,0.11,0.88,7,272,4,0,1,0,sales,medium
3,4,0.72,0.87,5,223,5,0,1,0,sales,low
4,5,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
# Calculate entropy based on the 'left' column
import math

def calculate_entropy(data, target_column):
    """
    Calculate the Shannon entropy (in bits) of a target column.

    Entropy = -Σ(p * log2(p)), where p is the probability of each class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `target_column`.
    target_column : str
        Name of the column whose class distribution is measured.

    Returns
    -------
    float
        Entropy of the observed class distribution. Returns 0.0 when the
        column has no non-missing values (including an empty frame).

    Notes
    -----
    Missing values (NaN) in the target are ignored. Probabilities are
    computed over the non-missing rows only, so they always sum to 1.
    """
    # value_counts() drops NaN by default, so the denominator must be the sum
    # of these counts rather than len(data); otherwise rows with a missing
    # target would shrink every probability and distort the entropy.
    class_counts = data[target_column].value_counts()
    total_samples = class_counts.sum()

    entropy = 0.0
    for count in class_counts:
        # Categorical columns can report unused categories with a count of 0;
        # skip them to avoid log2(0).
        if count > 0:
            probability = count / total_samples
            entropy -= probability * math.log2(probability)

    return entropy

# Entropy of the 'left' column over the whole DataFrame, i.e. before any split.
# The value is stored in `entropy_left`; nothing is displayed because the print below is disabled.
entropy_left = calculate_entropy(df, 'left')
# print(f"\n Baseline Entropy of the dataset based on 'left' column: {entropy_left:.4f}")



In [5]:
# INFORMATION GAIN CALCULATION FOR DIVISION COLUMN
# =================================================
# Information Gain measures how much information a feature provides about the target variable.
# It quantifies the reduction in entropy achieved by splitting the dataset on a particular feature.
# Formula: Information Gain = Entropy(Parent) - Weighted Average of Entropy(Children)

def calculate_information_gain(data, feature_column, target_column, verbose=True):
    """
    Calculate Information Gain for a given feature column.

    Information Gain = Entropy(parent) - weighted average of Entropy(children),
    where each child is the subset of `data` sharing one value of
    `feature_column`.

    Steps:
    1. Calculate the original entropy of the target variable (baseline)
    2. Split the data based on unique values in the feature column
    3. For each subset, calculate entropy of the target variable
    4. Calculate weighted average entropy of all subsets
    5. Information Gain = Original Entropy - Weighted Average Entropy

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both `feature_column` and `target_column`.
    feature_column : str
        Column to split on.
    target_column : str
        Column whose entropy is measured.
    verbose : bool, default True
        When True (the default), print the step-by-step report.
        Pass False to compute the value silently.

    Returns
    -------
    float
        The information gain of `feature_column` with respect to
        `target_column`.
    """
    # Route all reporting through one callable so it can be switched off.
    report = print if verbose else (lambda *args, **kwargs: None)

    # Step 1: Calculate original entropy (baseline entropy of target variable)
    report(f"=== INFORMATION GAIN ANALYSIS FOR '{feature_column.upper()}' COLUMN ===\n")

    original_entropy = calculate_entropy(data, target_column)
    report(f"\n1. BASELINE ENTROPY of '{target_column}' column: {original_entropy:.4f}")

    # Step 2: Get unique values in the feature column to split on
    unique_values = data[feature_column].unique()
    report(f"\n2. UNIQUE VALUES in '{feature_column}' column: {unique_values}")

    # Step 3: Calculate entropy for each subset created by splitting on feature column
    report("\n3. CALCULATING ENTROPY FOR EACH SUBSET:")
    report("-" * 50)

    total_samples = len(data)
    weighted_entropy = 0

    for value in unique_values:
        # Create subset of data for this feature value.
        # NOTE: a NaN feature value never compares equal to itself, so rows
        # with a missing feature form an empty subset and contribute nothing.
        subset = data[data[feature_column] == value]
        subset_size = len(subset)

        # Calculate entropy for this subset
        subset_entropy = calculate_entropy(subset, target_column)

        # Weight = proportion of the total data this subset represents
        weight = subset_size / total_samples
        contribution = weight * subset_entropy

        # Step 4 (accumulated): weighted average entropy of the children
        weighted_entropy += contribution

        # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
        report(f"\n  Subset: {feature_column}='{value}'")
        report(f"  - Size: {subset_size} samples ({weight:.4f} of total)")
        report(f"  - Entropy: {subset_entropy:.4f}")
        report(f"  - Weighted contribution: {weight:.4f} × {subset_entropy:.4f} = {contribution:.4f}")

    # Step 5: Information Gain = parent entropy - weighted child entropy
    information_gain = original_entropy - weighted_entropy

    report(f"\n4. WEIGHTED AVERAGE ENTROPY: {weighted_entropy:.4f}")
    report("\n5. INFORMATION GAIN CALCULATION:")
    report("   Information Gain = Original Entropy - Weighted Average Entropy")
    report(f"   Information Gain = {original_entropy:.4f} - {weighted_entropy:.4f}")
    report(f"   Information Gain = {information_gain:.4f}")

    report(f"   Information Gain is {information_gain:.4f} for {feature_column} column")
    report("   The higher the Information Gain, the better the feature is for decision tree splitting.")
    return information_gain

# Calculate Information Gain for Division column.
# "\n" is a real newline, giving a blank line before each 80-character rule;
# the earlier "\\n" printed a literal backslash-n instead.
print("\n" + "="*80)
division_info_gain = calculate_information_gain(df, 'Division', 'left')
print("\n" + "="*80)


=== INFORMATION GAIN ANALYSIS FOR 'DIVISION' COLUMN ===


1. BASELINE ENTROPY of 'left' column: 0.7918

2. UNIQUE VALUES in 'Division' column: ['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT'
 'product_mng' 'marketing' 'RandD']

3. CALCULATING ENTROPY FOR EACH SUBSET:
--------------------------------------------------
\n  Subset: Division='sales'
  - Size: 4140 samples (0.2760 of total)
  - Entropy: 0.8031
  - Weighted contribution: 0.2760 × 0.8031 = 0.2217
\n  Subset: Division='accounting'
  - Size: 767 samples (0.0511 of total)
  - Entropy: 0.8356
  - Weighted contribution: 0.0511 × 0.8356 = 0.0427
\n  Subset: Division='hr'
  - Size: 739 samples (0.0493 of total)
  - Entropy: 0.8699
  - Weighted contribution: 0.0493 × 0.8699 = 0.0429
\n  Subset: Division='technical'
  - Size: 2720 samples (0.1813 of total)
  - Entropy: 0.8210
  - Weighted contribution: 0.1813 × 0.8210 = 0.1489
\n  Subset: Division='support'
  - Size: 2229 samples (0.1486 of total)
  - Entropy: 0.809