In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

def get_bin_edges(expected, bins, method="equal_width"):
    """
    Get bin edges based on the selected binning method.
    
    :param expected: Reference (expected) distribution (numpy array or list)
    :param bins: Number of bins (int) or specific bin edges (list)
    :param method: Binning method - "equal_width", "equal_freq", "adaptive", "kmeans", "domain"
    :return: Array of bin edges
    """
    if method == "equal_width":
        return np.linspace(min(expected), max(expected), bins + 1)
    
    elif method == "equal_freq":
        return np.percentile(expected, np.linspace(0, 100, bins + 1))
    
    elif method == "adaptive":
        X = np.array(expected).reshape(-1, 1)
        y = np.digitize(expected, bins=np.percentile(expected, np.linspace(0, 100, bins)))
        tree = DecisionTreeClassifier(max_leaf_nodes=bins)
        tree.fit(X, y)
        return np.sort(tree.tree_.threshold[tree.tree_.threshold > 0])
    
    elif method == "kmeans":
        data = np.array(expected).reshape(-1, 1)
        kmeans = KMeans(n_clusters=bins, random_state=42).fit(data)
        return np.sort(kmeans.cluster_centers_.flatten())

    elif method == "domain":
        return bins  # User-defined domain-specific bins (must be passed as a list)
    
    else:
        raise ValueError("Invalid binning method! Choose from ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']")

def psi(expected, actual, bins=10, method="equal_width"):
    """
    Compute the Population Stability Index (PSI) for given datasets with different binning strategies.

    :param expected: Reference (expected) distribution (numpy array or list)
    :param actual: New (actual) distribution (numpy array or list)
    :param bins: Number of bins or specific bin edges (if method='domain')
    :param method: Binning method - "equal_width", "equal_freq", "adaptive", "kmeans", "domain"
    :return: PSI value
    """
    # Get bin edges based on the selected method
    bin_edges = get_bin_edges(expected, bins, method)
    
    # Ensure unique bin edges to avoid issues
    bin_edges = np.unique(bin_edges)

    # Compute histogram counts
    expected_counts, _ = np.histogram(expected, bins=bin_edges)
    actual_counts, _ = np.histogram(actual, bins=bin_edges)

    # Convert counts to proportions
    expected_perc = expected_counts / np.sum(expected_counts)
    actual_perc = actual_counts / np.sum(actual_counts)

    # Replace zero values to avoid division errors
    expected_perc = np.where(expected_perc == 0, 0.0001, expected_perc)
    actual_perc = np.where(actual_perc == 0, 0.0001, actual_perc)

    # Compute PSI
    psi_values = (expected_perc - actual_perc) * np.log(expected_perc / actual_perc)
    
    return psi_values, np.sum(psi_values)



In [4]:
# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(50, 12, 1000)    # New distribution

# PSI Calculation using different binning methods
print(f"PSI (Equal Width Binning): {psi(expected_data, actual_data, bins=10, method='equal_width')}")
print(f"PSI (Equal Frequency Binning): {psi(expected_data, actual_data, bins=10, method='equal_freq')}")
print(f"PSI (Adaptive Binning): {psi(expected_data, actual_data, bins=10, method='adaptive')}")
print(f"PSI (K-Means Binning): {psi(expected_data, actual_data, bins=10, method='kmeans')}")
print(f"PSI (Domain-Specific Binning): {psi(expected_data, actual_data, bins=[30, 40, 50, 60, 70, 80], method='domain')}")


PSI (Equal Width Binning): (array([0.00891706, 0.01101519, 0.00013235, 0.01516027, 0.00251449,
       0.00220601, 0.01006259, 0.00795976, 0.01659779, 0.00222926]), 0.07679477316334239)
PSI (Equal Frequency Binning): (array([5.90194926e-03, 9.39455702e-04, 5.21960397e-03, 1.95209714e-03,
       1.62264126e-06, 5.75429165e-03, 3.78927007e-03, 2.00413458e-04,
       7.17927943e-04, 2.48506831e-02]), 0.04932731489765286)
PSI (Adaptive Binning): (array([3.79627450e-05, 2.73328709e-02, 2.55978315e-03, 1.69947346e-03,
       1.07829637e-02, 4.96809298e-05, 1.38587435e-05, 4.19946510e-02]), 0.08447124461224062)
PSI (K-Means Binning): (array([0.00526592, 0.00038487, 0.00351466, 0.00011868, 0.00380697,
       0.00241526, 0.0008164 , 0.01389471, 0.01194549]), 0.0421629553564869)
PSI (Domain-Specific Binning): (array([0.00091488, 0.00365869, 0.00750551, 0.01355882, 0.02260765]), 0.048245536629576466)


  super()._check_params_vs_input(X, default_n_init=10)


In [7]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

def get_bin_edges(expected, bins, method="equal_width"):
    """
    Get bin edges based on the selected binning method.
    
    :param expected: Reference (expected) distribution (numpy array or list)
    :param bins: Number of bins (int) or specific bin edges (list)
    :param method: Binning method - "equal_width", "equal_freq", "adaptive", "kmeans", "domain"
    :return: Array of bin edges
    """
    if method == "equal_width":
        return np.linspace(min(expected), max(expected), bins + 1)
    
    elif method == "equal_freq":
        return np.percentile(expected, np.linspace(0, 100, bins + 1))
    
    elif method == "adaptive":
        X = np.array(expected).reshape(-1, 1)
        y = np.digitize(expected, bins=np.percentile(expected, np.linspace(0, 100, bins)))
        tree = DecisionTreeClassifier(max_leaf_nodes=bins)
        tree.fit(X, y)
        return np.sort(tree.tree_.threshold[tree.tree_.threshold > 0])
    
    elif method == "kmeans":
        data = np.array(expected).reshape(-1, 1)
        kmeans = KMeans(n_clusters=bins, random_state=42).fit(data)
        return np.sort(kmeans.cluster_centers_.flatten())

    elif method == "domain":
        return bins  # User-defined domain-specific bins (must be passed as a list)
    
    else:
        raise ValueError("Invalid binning method! Choose from ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']")

def psi(expected, actual, bins=10, method="equal_width"):
    """
    Compute the Population Stability Index (PSI) and return details in a dictionary.

    :param expected: Reference (expected) distribution (numpy array or list)
    :param actual: New (actual) distribution (numpy array or list)
    :param bins: Number of bins or specific bin edges (if method='domain')
    :param method: Binning method - "equal_width", "equal_freq", "adaptive", "kmeans", "domain"
    :return: Dictionary with binning strategy, PSI DataFrame, and final PSI value
    """
    # Get bin edges based on the selected method
    bin_edges = get_bin_edges(expected, bins, method)
    
    # Ensure unique bin edges to avoid issues
    bin_edges = np.unique(bin_edges)

    # Compute histogram counts
    expected_counts, _ = np.histogram(expected, bins=bin_edges)
    actual_counts, _ = np.histogram(actual, bins=bin_edges)

    # Convert counts to proportions
    expected_perc = expected_counts / np.sum(expected_counts)
    actual_perc = actual_counts / np.sum(actual_counts)

    # Replace zero values to avoid division errors
    expected_perc = np.where(expected_perc == 0, 0.0001, expected_perc)
    actual_perc = np.where(actual_perc == 0, 0.0001, actual_perc)

    # Compute PSI for each bin
    psi_values = (expected_perc - actual_perc) * np.log(expected_perc / actual_perc)
    
    # Create DataFrame with bin details
    psi_df = pd.DataFrame({
        "Min Bin": bin_edges[:-1],
        "Max Bin": bin_edges[1:],
        "Expected Count": expected_counts,
        "Actual Count": actual_counts,
        "Expected %": expected_perc,
        "Actual %": actual_perc,
        "PSI Value": psi_values
    })

    # Final PSI value
    total_psi = np.sum(psi_values)

    # Return results as dictionary
    return {
        "Binning Strategy": method,
        "PSI DataFrame": psi_df,
        "Final PSI": total_psi
    }



In [8]:
# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(55, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="equal_freq")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


Binning Strategy: equal_freq
Final PSI: 0.2964673811401862
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  17.587327  37.552369             100            65         0.1  0.065130   
1  37.552369  41.965185             100            55         0.1  0.055110   
2  41.965185  44.766419             100            51         0.1  0.051102   
3  44.766419  47.593103             100            73         0.1  0.073146   
4  47.593103  50.253006             100            64         0.1  0.064128   
5  50.253006  52.486858             100            71         0.1  0.071142   
6  52.486858  55.139818             100           101         0.1  0.101202   
7  55.139818  58.135112             100            96         0.1  0.096192   
8  58.135112  63.056452             100           157         0.1  0.157315   
9  63.056452  88.527315             100           265         0.1  0.265531   

   PSI Value  
0   0.014951  
1   0.026747  
2   0.03282

In [9]:
# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(50, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="equal_freq")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


Binning Strategy: equal_freq
Final PSI: 0.04932731489765286
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  17.587327  37.552369             100           125         0.1  0.125755   
1  37.552369  41.965185             100            90         0.1  0.090543   
2  41.965185  44.766419             100            78         0.1  0.078471   
3  44.766419  47.593103             100            86         0.1  0.086519   
4  47.593103  50.253006             100            99         0.1  0.099598   
5  50.253006  52.486858             100            77         0.1  0.077465   
6  52.486858  55.139818             100            81         0.1  0.081489   
7  55.139818  58.135112             100            95         0.1  0.095573   
8  58.135112  63.056452             100           108         0.1  0.108652   
9  63.056452  88.527315             100           155         0.1  0.155936   

   PSI Value  
0   0.005902  
1   0.000939  
2   0.0052

In [11]:
# ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']

# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(55, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="equal_width")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


Binning Strategy: equal_width
Final PSI: 0.34182742190886634
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  17.587327  24.681325               4             9       0.004  0.009018   
1  24.681325  31.775324              22            14       0.022  0.014028   
2  31.775324  38.869323              96            57       0.096  0.057114   
3  38.869323  45.963322             228           117       0.228  0.117234   
4  45.963322  53.057321             272           208       0.272  0.208417   
5  53.057321  60.151320             226           238       0.226  0.238477   
6  60.151320  67.245318             104           184       0.104  0.184369   
7  67.245318  74.339317              38           106       0.038  0.106212   
8  74.339317  81.433316               9            52       0.009  0.052104   
9  81.433316  88.527315               1            13       0.001  0.013026   

   PSI Value  
0   0.004079  
1   0.003587  
2   0.020

In [None]:
# ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']

# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(50, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="equal_width")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


In [12]:
# ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']

# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(50, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="adaptive")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


Binning Strategy: adaptive
Final PSI: 0.08447124461224062
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  38.062449  42.712482             111           109       0.125  0.127188   
1  42.712482  45.488140             111            63       0.125  0.073512   
2  45.488140  48.921047             111           123       0.125  0.143524   
3  48.921047  51.686586             111            95       0.125  0.110852   
4  51.686586  54.066172             111            78       0.125  0.091015   
5  54.066172  57.237194             111           105       0.125  0.122520   
6  57.237194  61.920315             111           106       0.125  0.123687   
7  61.920315  84.658062             111           178       0.125  0.207701   

   PSI Value  
0   0.000038  
1   0.027333  
2   0.002560  
3   0.001699  
4   0.010783  
5   0.000050  
6   0.000014  
7   0.041995  


In [14]:
# ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']

# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(55, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="adaptive")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


Binning Strategy: adaptive
Final PSI: 0.30096346218498715
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  38.062449  42.712482             111            63       0.125  0.068330   
1  42.712482  45.488140             111            54       0.125  0.058568   
2  45.488140  48.921047             111            90       0.125  0.097614   
3  48.921047  51.686586             111            77       0.125  0.083514   
4  51.686586  54.066172             111            89       0.125  0.096529   
5  54.066172  57.237194             111           107       0.125  0.116052   
6  57.237194  61.920315             111           153       0.125  0.165944   
7  61.920315  84.658062             111           289       0.125  0.313449   

   PSI Value  
0   0.034227  
1   0.050363  
2   0.006772  
3   0.016731  
4   0.007359  
5   0.000665  
6   0.011601  
7   0.173246  


In [15]:
# ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']

# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(55, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="kmeans")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


  super()._check_params_vs_input(X, default_n_init=10)


Binning Strategy: kmeans
Final PSI: 0.2416422188253427
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  28.403008  35.681555              55            33    0.056065  0.035948   
1  35.681555  40.701484              99            56    0.100917  0.061002   
2  40.701484  44.613352             130            70    0.132518  0.076253   
3  44.613352  48.693490             143           106    0.145770  0.115468   
4  48.693490  52.353780             157            99    0.160041  0.107843   
5  52.353780  56.375785             151           144    0.153925  0.156863   
6  56.375785  60.587783             116           136    0.118247  0.148148   
7  60.587783  66.147951              84           145    0.085627  0.157952   
8  66.147951  73.969856              46           129    0.046891  0.140523   

   PSI Value  
0   0.008941  
1   0.020093  
2   0.031096  
3   0.007061  
4   0.020605  
5   0.000056  
6   0.006741  
7   0.044284  
8   

In [16]:
# ['equal_width', 'equal_freq', 'adaptive', 'kmeans', 'domain']

# Example Data
np.random.seed(42)
expected_data = np.random.normal(50, 10, 1000)  # Baseline distribution
actual_data = np.random.normal(50, 12, 1000)    # New distribution

# Run PSI calculation with Equal Frequency binning
psi_result = psi(expected_data, actual_data, bins=10, method="kmeans")

# Display results
print(f"Binning Strategy: {psi_result['Binning Strategy']}")
print(f"Final PSI: {psi_result['Final PSI']}")
print("PSI DataFrame:")
print(psi_result["PSI DataFrame"])


Binning Strategy: kmeans
Final PSI: 0.0421629553564869
PSI DataFrame:
     Min Bin    Max Bin  Expected Count  Actual Count  Expected %  Actual %  \
0  28.403008  35.681555              55            70    0.056065  0.074547   
1  35.681555  40.701484              99            89    0.100917  0.094782   
2  40.701484  44.613352             130           105    0.132518  0.111821   
3  44.613352  48.693490             143           133    0.145770  0.141640   
4  48.693490  52.353780             157           128    0.160041  0.136315   
5  52.353780  56.375785             151           127    0.153925  0.135250   
6  56.375785  60.587783             116           102    0.118247  0.108626   
7  60.587783  66.147951              84           116    0.085627  0.123536   
8  66.147951  73.969856              46            69    0.046891  0.073482   

   PSI Value  
0   0.005266  
1   0.000385  
2   0.003515  
3   0.000119  
4   0.003807  
5   0.002415  
6   0.000816  
7   0.013895  
8   

  super()._check_params_vs_input(X, default_n_init=10)
