In [1]:
import numpy as np
import pandas as pd


def PSI(expect, actual, bin_nums=10, return_dict=True, unique_threshold=10, dropna=True):
    """Compute the Population Stability Index (PSI) between two samples.

    PSI is ``sum((a - e) * ln(a / e))`` over bins, where ``e`` and ``a`` are
    the per-bin shares of the expected and actual sample.

    Two binning strategies are used:

    * categorical -- when ``expect`` has fewer than ``unique_threshold``
      distinct values, every distinct value is its own bin;
    * continuous -- otherwise ``bin_nums`` nearest-rank percentile edges are
      taken from ``expect`` (outer edges widened to +/-inf) and both samples
      are cut on those edges.

    Parameters
    ----------
    expect, actual : pandas.Series
        Reference sample and sample to compare against it.
    bin_nums : int, default 10
        Number of percentile bins requested for the continuous strategy
        (duplicate edges are merged, so fewer bins may result).
    return_dict : bool, default True
        If True return ``{'data': DataFrame, 'statistic': psi}``; otherwise
        return only the statistic.
    unique_threshold : int, default 10
        Distinct-value count of ``expect`` below which the categorical
        strategy is used.
    dropna : bool, default True
        Continuous strategy: drop NaN observations (True) or replace them
        with ``min(expect, actual) - 1`` so they fall into the lowest bin
        (False).  Categorical strategy: categories present in only one sample
        give a NaN share; with True they stay NaN and are skipped by the
        NaN-skipping sum, with False they are replaced by a tenth of the
        smallest observed share.

    Returns
    -------
    dict or float
        See ``return_dict``.  The statistic is ``1`` (after printing a
        warning) when any actual-share is exactly zero.

    Notes
    -----
    The inputs are never modified.
    """
    def _psi(a, e):
        # A zero actual share makes log(a / e) diverge; warn and return the
        # sentinel value 1 instead.
        if np.sum(a == 0) > 0:
            print('actual data contains zero !! return 1')
            return 1
        return np.sum((a - e) * np.log(a / e))

    def _fillna_cat(t, dropna):
        # NaN here means "category absent from one of the two samples".
        t0 = pd.isnull(t[0]).sum()
        t1 = pd.isnull(t[1]).sum()
        if t0 > 0 or t1 > 0:
            print('Nan statistics: \n {}'.format(t0 + t1))
            if dropna:
                print('    Drop Nan !!')
            else:
                print('Replace Nan with min/10.0 !!')
                t = t.fillna(t.min().min() / 10.0)
        return t

    def _fillna_cont(t1, t2, dropna):
        # Work on new objects so the caller's Series are left untouched.
        if t1.isna().any() or t2.isna().any():
            if dropna:
                t1 = t1.dropna()
                t2 = t2.dropna()
            else:
                # np.fmin takes the smaller of the two minima and ignores a
                # NaN minimum (all-NaN sample) when the other one is valid.
                fillvalue = np.fmin(t1.min(), t2.min()) - 1
                t1 = t1.fillna(fillvalue)
                t2 = t2.fillna(fillvalue)
        return t1, t2

    def _bin_format(b):
        # Cast to float first: nearest-rank percentiles of integer data may be
        # integer-typed, and +/-inf cannot be stored in an integer array.
        b = np.unique(b).astype(float)
        if b.size < 2:
            # Degenerate case (all edges equal): use one all-covering bin.
            return np.array([-np.inf, np.inf])
        b[0] = -np.inf
        b[-1] = +np.inf
        return b

    def _nearest_percentiles(values, q):
        # `interpolation=` is the deprecated spelling of `method=`; fall back
        # to it only on NumPy versions that do not know `method`.
        try:
            return np.percentile(values, q, method='nearest')
        except TypeError:
            return np.percentile(values, q, interpolation='nearest')

    if len(np.unique(expect)) < unique_threshold:
        # Categorical strategy: one bin per distinct value.
        e_pct = (expect.value_counts() / len(expect)).sort_index()
        a_pct = (actual.value_counts() / len(actual)).sort_index()

        # Align both share vectors on the union of categories.
        t = pd.concat([e_pct, a_pct], axis=1)
        t.columns = [0, 1]
        t = _fillna_cat(t, dropna)
        e_pct, a_pct = t[0], t[1]
    else:
        # Continuous strategy: percentile bins derived from the expected sample.
        expect, actual = _fillna_cont(expect, actual, dropna)
        quantiles = [(100.0 / bin_nums) * i for i in range(bin_nums + 1)]
        bins = _bin_format(_nearest_percentiles(expect, quantiles))
        e_pct = (pd.cut(expect, bins=bins, include_lowest=True).value_counts()) / len(expect)
        a_pct = (pd.cut(actual, bins=bins, include_lowest=True).value_counts()) / len(actual)
        a_pct = a_pct.sort_index()
        e_pct = e_pct.sort_index()

    p = _psi(a_pct, e_pct)
    if return_dict:
        results = pd.DataFrame({
            'expect_pct': e_pct.values,
            'actual_pct': a_pct.values
        }, index=e_pct.index)
        return {'data': results, 'statistic': p}
    return p

In [2]:
# Demo: PSI between two independent standard-normal samples of size 100.
# The seeded RandomState makes the draws reproducible; `e` is drawn before `a`.
rdm = np.random.RandomState(seed=42)
e = rdm.normal(loc=0.0, scale=1.0, size=100)
a = rdm.normal(loc=0.0, scale=1.0, size=100)
PSI(expect=pd.Series(e), actual=pd.Series(a))

{'data':                   expect_pct  actual_pct
 (-inf, -1.328]          0.11        0.06
 (-1.328, -0.72]         0.10        0.22
 (-0.72, -0.502]         0.10        0.04
 (-0.502, -0.301]        0.10        0.04
 (-0.301, -0.116]        0.10        0.05
 (-0.116, 0.111]         0.09        0.10
 (0.111, 0.331]          0.10        0.14
 (0.331, 0.648]          0.10        0.12
 (0.648, 1.004]          0.10        0.11
 (1.004, inf]            0.10        0.12,
 'statistic': 0.2922923789663523}

In [3]:
def PSI_DF(df_exp, df_act, bin_nums=10, unique_threshold=10, dropna=True):
    """Compute the PSI of every column of ``df_exp`` against ``df_act``.

    Parameters
    ----------
    df_exp, df_act : pandas.DataFrame
        Expected and actual frames.  Every column of ``df_exp`` is looked up
        by name in ``df_act``.
    bin_nums : int, default 10
        Forwarded to ``PSI`` as ``bin_nums``.
    unique_threshold : int, default 10
        Forwarded to ``PSI`` as ``unique_threshold``.
    dropna : bool, default True
        Forwarded to ``PSI`` as ``dropna``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df_exp`` with the fields ``column`` and
        ``psi``, in the column order of ``df_exp``.
    """
    cols = df_exp.columns.tolist()
    psis = [
        PSI(
            df_exp[cc],
            df_act[cc],
            bin_nums=bin_nums,
            return_dict=False,
            unique_threshold=unique_threshold,
            dropna=dropna,
        )
        for cc in cols
    ]
    return pd.DataFrame({'column': cols, 'psi': psis})

In [4]:
# Demo: column-wise PSI for two-column frames.
# The four samples continue the stream of the `rdm` RandomState created in an
# earlier cell and are drawn in the order e1, a1, e2, a2.
e1, a1, e2, a2 = (rdm.normal(size=100) for _ in range(4))
e = pd.DataFrame(dict(col1=e1, col2=e2))
a = pd.DataFrame(dict(col1=a1, col2=a2))
PSI_DF(e, a)

Unnamed: 0,column,psi
0,col1,0.184779
1,col2,0.381491
