Practical Problem

Question 1

In [13]:
from collections import Counter
from math import log2

import numpy as np
import pandas as pd

# Step 1: Load data (Extract Relevant Data)
data = pd.read_csv('training_dataset.csv')

# Drop rows missing the class label (RiskLevel) or the split feature
# (CreditScore). A NaN CreditScore is neither `<=` nor `>` the split threshold,
# so such a row would fall into neither branch of the split while still being
# counted in the total entropy and in n -- the branch weights would then sum
# to less than 1 and the information gain would be wrong.
data = data.dropna(subset=['RiskLevel', 'CreditScore'])

# Function to compute entropy (used in Step 2, 3, 4)
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Parameters
    ----------
    labels : iterable of hashable
        One class label per sample (list, tuple, pandas Series, generator, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. ``0.0`` for an empty or single-class collection.
    """
    # Counter tallies every label in a single pass; calling
    # labels.count(label) once per distinct label rescans the list each time.
    counts = Counter(labels)
    total = sum(counts.values())
    if total == 0:
        return 0.0
    result = -sum((c / total) * log2(c / total) for c in counts.values())
    # For a pure group the sum is 0.0 and negating it yields -0.0, which
    # formats as "-0.0000"; normalise it to a plain positive zero.
    return result if result > 0 else 0.0

# Step 2: Entropy of the label column over all rows, H(S)
total_entropy = entropy(data.loc[:, 'RiskLevel'].tolist())


# Step 3: Partition the rows around the CreditScore threshold
split_value = 650
lower_group = data.loc[data['CreditScore'].le(split_value)]  # S1: CreditScore <= threshold
upper_group = data.loc[data['CreditScore'].gt(split_value)]  # S2: CreditScore > threshold

# Step 4: Entropy inside each partition, H(S1) and H(S2)
entropy_lower, entropy_upper = (
    entropy(part['RiskLevel'].tolist()) for part in (lower_group, upper_group)
)

# Step 4 (continued): Average the two entropies, weighting each by its share of the rows
n = len(data)
weighted_entropy = len(lower_group) / n * entropy_lower + len(upper_group) / n * entropy_upper

# Step 5: Information gain = entropy before the split minus weighted entropy after it
info_gain = total_entropy - weighted_entropy

# Step 6: Report every intermediate quantity, one per line
print("\n".join([
    f"Total Entropy before split: {total_entropy:.4f}",              # Step 2
    f"Entropy (CreditScore <= {split_value}): {entropy_lower:.4f}",  # Step 4
    f"Entropy (CreditScore > {split_value}): {entropy_upper:.4f}",   # Step 4
    f"Weighted Entropy after split: {weighted_entropy:.4f}",         # Step 4
    f"Information Gain for split at {split_value}: {info_gain:.4f}", # Step 5
]))


Total Entropy before split: 1.0000
Entropy (CreditScore <= 650): -0.0000
Entropy (CreditScore > 650): -0.0000
Weighted Entropy after split: -0.0000
Information Gain for split at 650: 1.0000


Question 2

In [14]:
import pandas as pd

# Load training data
df = pd.read_csv('training_dataset.csv')

# Step 1: Drop rows missing CreditScore (target) or Age (split feature).
# A NaN Age is neither `<=` nor `>` the split value, so such a row would land
# in neither group below while still counting toward the total variance and
# toward n -- the group weights would then sum to less than 1 and the
# variance reduction would be overstated.
df = df.dropna(subset=['CreditScore', 'Age'])

# Step 2: Split data at Age = 35
split_value = 35
left_group = df[df['Age'] <= split_value]
right_group = df[df['Age'] > split_value]

# Step 3: Define variance function
def variance(values):
    """Return the population variance (ddof=0) of ``values``.

    ``values`` must support ``len()`` and provide a ``var(ddof=...)`` method,
    as the DataFrame columns passed in by this notebook do. An empty input
    yields 0 without calling ``var``.
    """
    has_values = len(values) != 0
    return values.var(ddof=0) if has_values else 0

# Step 4: Variance of the target over every row, before splitting
total_variance = variance(df['CreditScore'])

# Step 5: Variance inside each branch, then their size-weighted average
left_var, right_var = (
    variance(part['CreditScore']) for part in (left_group, right_group)
)

n = len(df)
weighted_var = len(left_group) / n * left_var + len(right_group) / n * right_var

# Step 6: Variance reduction = variance before the split minus weighted variance after it
var_reduction = total_variance - weighted_var

# Step 7: Report every intermediate quantity, one per line
print("\n".join([
    f"Total Variance before split: {total_variance:.2f}",
    f"Variance (Age <= {split_value}): {left_var:.2f}",
    f"Variance (Age > {split_value}): {right_var:.2f}",
    f"Weighted Variance after split: {weighted_var:.2f}",
    f"Variance Reduction: {var_reduction:.2f}",
]))


Total Variance before split: 3575.00
Variance (Age <= 35): 1576.00
Variance (Age > 35): 822.22
Weighted Variance after split: 1293.33
Variance Reduction: 2281.67
