## 前端监控-用户分布报告
在前端监控阶段，用户尚未产生 label（表现结果），因此本报告主要统计用户的分数分布，用于监控放人策略、市场活动带来的客群变化，以及发现模型计算中的 bug。

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path

In [14]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation warnings, so API removals
# in newer numpy / pandas releases will surface as hard errors without notice.
warnings.filterwarnings("ignore")

### 载入数据
一共3个月的数据（202002，202003，202004），每个月10000人，分数都是0-100之间

In [4]:
# Location of the score dataset (absolute path on the training machine).
fp_data = Path('/data/credit_model', 'lec6', 'test_dataset.csv')
# Load the whole CSV into memory; columns are narrowed down in the next cell.
pd_data = pd.read_csv(fp_data)

In [5]:
# Keep only the columns the report functions read: user id, term and score.
pd_data = pd_data[['userid', 'term', 'score']]

In [6]:
pd_data

Unnamed: 0,userid,term,score
0,39668738,202004,74.779712
1,59561190,202004,70.099734
2,160277597,202004,50.432712
3,7123385,202004,77.274795
4,95589182,202004,74.609002
...,...,...,...
29995,56894279,202002,66.514859
29996,172568266,202002,22.917540
29997,9087607,202002,76.696414
29998,148158746,202002,59.577740


### 做report用的函数

In [7]:
# Binning helpers
class Bin:
    """Equal-width binning helpers built on top of ``pandas.cut``."""

    @classmethod
    def bin_width(cls, xs, nbins, **kwargs):
        """Return ``nbins + 1`` equal-width bin edges that cover ``xs``.

        Args:
            xs: Sequence of numeric values; converted to a float array.
                NaN entries are ignored when locating the range.
            nbins: Number of bins to create.
            **kwargs: Optional ``vmin`` / ``vmax``. When given, the covered
                range is widened (never narrowed) to include them.

        Returns:
            list: Bin edges in ascending order. The two outer edges are pushed
            outwards so that, with the left-closed / right-open intervals used
            by :meth:`cut`, both the minimum and the maximum fall inside a bin.

        Raises:
            ValueError: If ``xs`` is empty.
        """
        vmin = kwargs.get('vmin', None)
        vmax = kwargs.get('vmax', None)

        # ``np.float`` was removed from NumPy; the builtin ``float`` is the
        # same dtype (float64).
        zs = np.array(xs, dtype=float)

        lo = np.nanmin(zs)
        hi = np.nanmax(zs)
        if vmin is not None:
            lo = min(lo, vmin)
        if vmax is not None:
            hi = max(hi, vmax)

        step = (hi - lo) / nbins

        ws = [lo + k * step for k in range(nbins + 1)]

        # Guard against floating point drift in the generated edges.
        ws[0] = min(ws[0], lo)
        ws[-1] = max(ws[-1], hi)

        # Widen the outer edges by 1% of their magnitude. ``abs`` keeps the
        # shift pointing outwards for negative edges as well.
        ws[0] -= 0.01 * abs(ws[0])

        upper_margin = 0.01 * abs(ws[-1])
        if upper_margin == 0:
            # An upper edge of exactly 0 would otherwise stay put and, because
            # the intervals are right-open, exclude the maximum value.
            upper_margin = 0.01 * step if step > 0 else 0.01
        ws[-1] += upper_margin
        return ws

    @classmethod
    def cut(cls, xs, ws, **kwargs):
        """Assign each value of ``xs`` to a left-closed interval ``[a, b)``.

        Args:
            xs: Sequence of numeric values; converted to a float array.
            ws: Bin edges, e.g. as returned by :meth:`bin_width`.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The result of ``pandas.cut(..., right=False)``. Values outside the
            edges are reported as missing by ``pandas.cut``.
        """
        zs = np.array(xs, dtype=float)

        buckets = pd.cut(zs, ws, right=False)
        return buckets



In [8]:
# PSI (population stability index) helpers
class PSI:
    """Population Stability Index between an expected and an actual sample."""

    @staticmethod
    def _bucketize(values, ws):
        """Bucket a float array on ``ws``, keeping the input order.

        Finite values are cut with ``Bin.cut``; NaN entries are labelled
        ``'nan'`` and infinite entries ``'inf'`` at their original position.
        """
        invalid = pd.isnull(values) | np.isinf(values)
        if not invalid.any():
            # Keep the categorical result so that empty bins still show up
            # (with a zero count) in ``value_counts``.
            return pd.Series(Bin.cut(values, ws))

        buckets = np.empty(len(values), dtype=object)
        buckets[~invalid] = np.asarray(Bin.cut(values[~invalid], ws), dtype=object)
        buckets[invalid] = np.array(
            ['nan' if pd.isnull(x) else 'inf' for x in values[invalid]],
            dtype=object,
        )
        return pd.Series(buckets)

    @classmethod
    def bin_width(cls, expect, nbins, **kwargs):
        """Derive equal-width bin edges from ``expect`` and bucket it.

        Args:
            expect: Numeric reference sample.
            nbins: Number of equal-width bins.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            tuple: ``(sr, ws)`` where ``sr`` is a Series named ``'expect'``
            holding one bucket per input value and ``ws`` is the edge list.
        """
        expect = np.array(expect, dtype=float)

        # Edges are computed from finite values only.
        finite = expect[~(pd.isnull(expect) | np.isinf(expect))]
        ws = Bin.bin_width(finite, nbins, vmax=max(finite))

        sr = cls._bucketize(expect, ws)
        sr.name = 'expect'

        return sr, ws

    @classmethod
    def bin_cut(cls, actual, ws):
        """Bucket ``actual`` using an existing binning.

        Args:
            actual: Sample to bucket.
            ws: Either a dict mapping each (stringified) category to its
                bucket label, or a list of numeric bin edges.

        Returns:
            tuple: ``(sr, ws)`` where ``sr`` is a Series named ``'actual'``
            with one bucket per input value, in input order.
        """
        if isinstance(ws, dict):
            labels = np.array(actual, dtype=str)
            sr = pd.Series([ws[z] for z in labels])
        else:
            sr = cls._bucketize(np.array(actual, dtype=float), ws)

        sr.name = 'actual'
        return sr, ws

    @classmethod
    def get_psi(cls, expect, actual, nbins, **kwargs):
        """Compute the PSI of ``actual`` against ``expect``.

        Args:
            expect: Reference sample.
            actual: Sample to compare.
            nbins: Number of bins when ``bin_method='width'``.
            **kwargs:
                bin_method: ``'width'`` derives equal-width bins from
                    ``expect``; any other value (default ``'freq'``) requires
                    ``ws`` to be supplied.
                ws: Pre-built binning (dict or edge list), see :meth:`bin_cut`.
                detail: When True return the per-bucket table instead of the
                    scalar PSI.

        Returns:
            The total PSI, or the detail DataFrame when ``detail`` is truthy.

        Raises:
            ValueError: If ``bin_method`` is not ``'width'`` and ``ws`` is
                missing.
        """
        detail = kwargs.get('detail', False)
        bin_method = kwargs.get('bin_method', 'freq')

        if bin_method == 'width':
            _, ws = cls.bin_width(expect, nbins, **kwargs)
        else:
            ws = kwargs.get('ws', None)
            if ws is None:
                raise ValueError('err, psi for category feature need to pass ws as dict')

        df = cls.combine_df(expect, actual, ws)

        stat = cls.get_psi_detail(df)

        # The last row of the detail table is the 'total' summary row.
        iv = stat['psi'].values[-1]

        if detail is False:
            return iv
        else:
            return stat

    @classmethod
    def get_psi_detail(cls, df, **kwargs):
        """Build the per-bucket PSI table from a combined frame.

        Args:
            df: Frame with a ``'bucket'`` column and a ``'ys'`` column whose
                values are ``'expect'`` or ``'actual'`` (see
                :meth:`combine_df`).
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            DataFrame indexed by bucket with counts, shares, log-ratio
            (``'woe'``) and PSI contribution per bucket, followed by a
            ``'total'`` row carrying the overall counts and PSI.
        """
        eps = np.finfo(float).eps

        stat_expect = df.loc[df['ys'] == 'expect', 'bucket'].value_counts()
        stat_expect.name = 'expect'
        stat_actual = df.loc[df['ys'] == 'actual', 'bucket'].value_counts()
        stat_actual.name = 'actual'
        stat = pd.concat([stat_expect, stat_actual], axis=1, sort=True)

        # A bucket seen in only one sample comes out of the outer join as NaN;
        # it is really a zero count and must contribute to the PSI.
        stat = stat.fillna(0)

        # Share of each bucket within its own sample.
        expect = stat['expect'].sum()
        actual = stat['actual'].sum()
        stat['expect_attr'] = stat['expect'] / expect
        stat['actual_attr'] = stat['actual'] / actual

        # Log ratio of the two distributions; eps avoids log(0) and x / 0.
        stat['woe'] = np.log((stat['actual_attr'] + eps) / (stat['expect_attr'] + eps))

        # Per-bucket PSI contribution.
        stat['psi'] = (stat['actual_attr'] - stat['expect_attr']) * stat['woe']

        psi = stat['psi'].sum()
        stat['bucket'] = stat.index.astype(object)

        # ``DataFrame.append`` no longer exists in current pandas; build the
        # summary row as a one-row frame so numeric columns keep their dtype.
        summary = pd.DataFrame(
            [{'expect': expect, 'actual': actual, 'psi': psi, 'bucket': 'total'}]
        )
        stat = pd.concat([stat, summary], ignore_index=True)
        stat.index = stat['bucket']

        return stat

    @classmethod
    def combine_df(cls, expect, actual, ws):
        """Stack both samples with their buckets into one long frame.

        Args:
            expect: Reference sample.
            actual: Sample to compare.
            ws: Binning passed to :meth:`bin_cut`.

        Returns:
            DataFrame with columns ``'xs'`` (raw value), ``'bucket'`` and
            ``'ys'`` (``'expect'`` / ``'actual'``), one row per input value.
        """
        sr_expect, ws = cls.bin_cut(expect, ws)
        sr_actual, ws = cls.bin_cut(actual, ws)

        # Use plain arrays for the raw values: a caller-supplied Series keeps
        # its own index, and index alignment against the freshly indexed
        # bucket Series would otherwise produce extra, half-empty rows.
        df_expect = pd.DataFrame({'xs': np.asarray(expect), 'bucket': sr_expect})
        df_expect['ys'] = 'expect'

        df_actual = pd.DataFrame({'xs': np.asarray(actual), 'bucket': sr_actual})
        df_actual['ys'] = 'actual'

        df = pd.concat([df_expect, df_actual], axis=0, ignore_index=True)

        return df



In [9]:
# Cumulative user counts and shares per score bin
def get_cumulative_user(df, col_name):
    """Accumulate user counts from the highest bin down to the lowest.

    Example with three bins (index label -> users)::

        0-10: 5, 10-20: 10, 20-30: 10      (total 25)

    The bins are walked in descending index order (20-30 first), a running
    total is built, and the results are flipped back so that position ``i``
    describes the ``i``-th bin in ascending index order::

        bin      cumulative users   cumulative share
        0-10           25               25 / 25
        10-20          20               20 / 25
        20-30          10               10 / 25

    Args:
        df: Frame indexed by bin, holding the per-bin user count.
        col_name: Name of the count column.

    Returns:
        tuple: ``(cumulative_shares, shares, cumulative_users)``, three lists
        with one entry per bin.
    """
    total = df[col_name].sum()

    # Per-bin counts, highest bin first.
    counts_desc = df.sort_index(ascending=False)[col_name].to_numpy()

    # Share of each bin, then running sums in the same (descending) order.
    shares_desc = counts_desc / total
    running_share = np.cumsum(shares_desc)
    running_users = np.cumsum(counts_desc)

    # Flip everything back to ascending bin order.
    return (
        list(running_share[::-1]),
        list(shares_desc[::-1]),
        list(running_users[::-1]),
    )

In [10]:
def get_distribution_table(df, tag_name, lst_bin):
    """Build the per-bin user distribution table for one group of users.

    Args:
        df: Frame with a numeric ``'score'`` column. It is copied, so the
            caller's frame is left untouched.
        tag_name: Label used in the output column names.
        lst_bin: Bin edges passed to ``pandas.cut`` (right-closed intervals).

    Returns:
        DataFrame indexed by score bin with the columns
        ``users_<tag>``, ``<tag>_cumulative_users``,
        ``<tag>_users_percentage`` and ``<tag>_cumulative_percentage``.
        Every bin defined by ``lst_bin`` is present, including empty ones.
    """
    df_tmp = df.copy()

    # Score calibration can be buggy and yield out-of-range scores; pull them
    # just inside the 0-100 range so they still land in the outermost bins.
    df_tmp.loc[df_tmp['score'] <= 0, 'score'] = 0.1
    df_tmp.loc[df_tmp['score'] >= 100, 'score'] = 99.9

    # Bucket the scores and count the users in each bucket.
    df_tmp['bin'] = pd.cut(df_tmp['score'], lst_bin)
    # observed=False is spelled out on purpose: empty bins must stay in the
    # table so that tables of different groups line up row by row, and the
    # implicit default for categorical groupers is changing in pandas.
    counts = df_tmp.groupby('bin', observed=False)['score'].count()
    df_tmp = counts.to_frame(name=f'users_{tag_name}')

    # Derive per-bin share, cumulative users and cumulative share.
    lst_cumul_perc, lst_perc, lst_cumul_user = get_cumulative_user(df_tmp, f'users_{tag_name}')
    df_tmp[f'{tag_name}_cumulative_users'] = lst_cumul_user
    df_tmp[f'{tag_name}_users_percentage'] = lst_perc
    df_tmp[f'{tag_name}_cumulative_percentage'] = lst_cumul_perc

    return df_tmp

In [11]:
# Main entry point that assembles the front-end monitoring report
def generate_cumulative_distribution_report(df_pd, term_col_name, lst_term, score_cut, lst_bin):
    """Assemble the cumulative score distribution report, one column per term.

    The first term in ``lst_term`` is the baseline for the PSI and KS rows.

    Args:
        df_pd: Frame with ``'userid'`` and ``'score'`` columns plus the term
            column.
        term_col_name: Name of the column that holds the term.
        lst_term: Terms to report, in column order.
        score_cut: Threshold for the "above" / "below" rows (both comparisons
            are strict).
        lst_bin: Score bin edges forwarded to ``get_distribution_table``.

    Returns:
        DataFrame whose first column holds the row label and whose remaining
        columns are the terms. Rows: one per score bin (cumulative share,
        formatted as a percentage), then ``above``/``below`` shares (also
        formatted), then total users, mean score, PSI and KS as raw numbers.
        The KS row is the largest absolute gap between the binned cumulative
        shares of the baseline and the term.
    """
    report = pd.DataFrame()

    baseline_term = lst_term[0]
    baseline_scores = df_pd[df_pd[term_col_name] == baseline_term]['score']

    # Extra rows appended below the score bins; each starts with its label.
    row_above = [f'above {score_cut}']
    row_below = [f'below {score_cut}']
    row_total = ['total user #']
    row_mean = ['mean score']
    row_psi = ['PSI']
    row_ks = ['KS']

    for term in lst_term:
        term_rows = df_pd[df_pd[term_col_name] == term].copy()
        term_scores = term_rows['score']

        # Cumulative share of users per score bin.
        distribution = get_distribution_table(term_rows, term, lst_bin)
        report[term] = distribution[f'{term}_cumulative_percentage']

        # Head count for the term.
        n_users = term_rows['userid'].count()
        row_total.append(n_users)

        # Share of users strictly above / strictly below the cut-off.
        n_above = term_rows[term_scores > score_cut]['userid'].count()
        row_above.append(n_above / n_users)
        n_below = term_rows[term_scores < score_cut]['userid'].count()
        row_below.append(n_below / n_users)

        row_mean.append(term_scores.mean())

        # PSI against the baseline term, on 20 equal-width bins.
        row_psi.append(PSI.get_psi(baseline_scores, term_scores, 20, bin_method='width'))

        # Largest gap between the binned cumulative shares.
        row_ks.append(max(abs(report[baseline_term] - report[term])))

    # Move the bins into a regular column; otherwise the rows appended below
    # could not carry their own labels.
    report = report.reset_index()

    for extra_row in (row_above, row_below):
        report.loc[len(report)] = extra_row

    # Everything collected so far is a share: render it as a percentage.
    as_percent = '{:.2%}'.format
    for term in lst_term:
        report[term] = report[term].apply(as_percent)

    # The remaining rows stay as raw numbers.
    for extra_row in (row_total, row_mean, row_psi, row_ks):
        report.loc[len(report)] = extra_row

    return report

### 生成用户分布报告

In [12]:
# Column of pd_data that identifies the reporting period.
term_col_name = 'term'
# Periods to report, in column order; the first one is the PSI / KS baseline.
lst_term = [202002, 202003, 202004]
# Score threshold used for the "above" / "below" share rows.
score_cut = 60
# Score bin edges handed to pandas.cut for the distribution rows.
lst_bin = [0, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 100]

In [15]:
# Build and display the report; 202002 (first entry of lst_term) is the baseline.
generate_cumulative_distribution_report(pd_data, term_col_name, lst_term, score_cut, lst_bin)

Unnamed: 0,index,202002,202003,202004
0,"(0, 30]",100.00%,100.00%,100.00%
1,"(30, 35]",99.53%,99.47%,99.44%
2,"(35, 40]",98.78%,98.77%,98.76%
3,"(40, 45]",97.70%,97.87%,97.69%
4,"(45, 50]",96.58%,96.75%,96.70%
5,"(50, 55]",94.83%,95.04%,94.88%
6,"(55, 60]",91.33%,91.53%,91.69%
7,"(60, 65]",85.57%,85.61%,86.35%
8,"(65, 70]",76.05%,76.24%,76.93%
9,"(70, 75]",61.37%,61.74%,61.58%
