In [6]:
import numpy as np

def KL(P,Q):
     epsilon = 0.00001

     # You may want to instead make copies to avoid changing the np arrays.
     P = P+epsilon
     Q = Q+epsilon

     divergence = np.sum(P*np.log(P/Q))
     return divergence

# Should be normalized though
values1 = np.asarray([0.9,0,1])
values2 = np.asarray([1,0,1])

# Note slight difference in the final result compared to Dawny33
print(KL(values1, values2)) # 0.775278939433

-0.09482451769664475


In [4]:
def calculate_kl_divergence_binary(teacher_probs, y_true):
    """
    Calculate KL divergence between teacher predictions and ground truth for binary classification.
    
    Parameters:
        teacher_probs: 2D array with shape (n_samples, 2) - [prob_class_0, prob_class_1]
        y_true: 1D array with true labels (0 or 1)
    """
    eps = 1e-8
    
    # Create one-hot encoded ground truth
    y_true_onehot = np.zeros((len(y_true), 2))
    y_true_onehot[np.arange(len(y_true)), y_true] = 1
    
    # Clip teacher probabilities to avoid log(0)
    teacher_probs_safe = np.clip(teacher_probs, eps, 1 - eps)
    
    # Calculate KL divergence for each sample
    kl_divs = []
    for i in range(len(y_true)):
        # KL(true || predicted) = sum(true * log(true / predicted))
        kl = np.sum(y_true_onehot[i] * np.log(y_true_onehot[i] + eps) - 
                    y_true_onehot[i] * np.log(teacher_probs_safe[i]))
        kl_divs.append(kl)
    
    return np.mean(kl_divs)

# Test with binary classification data
teacher_probs = np.array([[0.3, 0.7], [0.8, 0.2], [0.1, 0.9]])  # Teacher predictions
y_true = np.array([1, 0, 1])  # Ground truth labels

kl_div = calculate_kl_divergence_binary(teacher_probs, y_true)
print(f"KL Divergence: {kl_div:.4f}")

KL Divergence: 0.2284


In [7]:
def calculate_kl_divergence_binary_single_prob(teacher_probs_pos, y_true):
    """
    Calculate KL divergence between teacher predictions and ground truth for binary classification.
    
    Parameters:
        teacher_probs_pos: 1D array with probabilities for positive class (shape: n_samples,)
        y_true: 1D array with true labels (0 or 1)
    """
    eps = 1e-8
    
    # Clip probabilities to avoid log(0)
    p_pos = np.clip(teacher_probs_pos, eps, 1 - eps)
    p_neg = 1 - p_pos
    
    # Calculate KL divergence for each sample
    kl_divs = []
    for i in range(len(y_true)):
        if y_true[i] == 1:  # True label is positive class
            # KL(true || predicted) = 1 * log(1 / p_pos) = -log(p_pos)
            kl = -np.log(p_pos[i])
        else:  # True label is negative class
            # KL(true || predicted) = 1 * log(1 / p_neg) = -log(p_neg)
            kl = -np.log(p_neg[i])
        kl_divs.append(kl)
    
    return np.mean(kl_divs)

# Test with your data format
teacher_probs_pos = np.array([0.7, 0.2, 0.9])  # Just positive class probabilities
y_true = np.array([1, 0, 1])  # Ground truth labels

kl_div = calculate_kl_divergence_binary_single_prob(teacher_probs_pos, y_true)
print(f"KL Divergence: {kl_div:.4f}")

# This is equivalent to negative log-likelihood!
# You can also use this even simpler version:
def simple_kl_binary(probs_pos, y_true):
    eps = 1e-8
    probs_pos = np.clip(probs_pos, eps, 1 - eps)
    
    # For binary classification with one-hot ground truth, KL divergence = negative log-likelihood
    log_likelihood = y_true * np.log(probs_pos) + (1 - y_true) * np.log(1 - probs_pos)
    return -np.mean(log_likelihood)

kl_simple = simple_kl_binary(teacher_probs_pos, y_true)
print(f"KL Divergence (simple): {kl_simple:.4f}")

KL Divergence: 0.2284
KL Divergence (simple): 0.2284
