In [None]:
# ! pip3 install bytedtqs 

In [1]:
import os
import random
from datetime import date, timedelta
from itertools import combinations

import numpy as np
import pandas as pd
from IPython.display import display, HTML
from scipy.special import factorial, comb

import bytedtqs

## 分析配置

生成默认日期，注意如果要分析 Dx 转化，日期需要自己手动配置。

In [10]:
# Application whose rows are kept once the raw data has been loaded.
APP_NAME = 'eo'

# Reference days as 'YYYY-MM-DD' strings, counted back from the day this cell
# runs: 1 day (yesterday), 2 days (day-over-day) and 8 days (week-over-week).
YESTERDAY, DOD, WOW = (
    (date.today() - timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in (1, 2, 8)
)

# Default comparison windows, each given as [first day, last day].
DATE_BASE = [WOW] * 2
DATE_CURR = [YESTERDAY] * 2

## 数据源配置

**提前为分析场景配置的，一般不需要修改。**

默认一次加载过去 DAYS_BACK 天的数据，减少 DB 读取，后续都通过 Python 处理。

In [2]:
# Number of days of raw data to consider, counted back from today.
DAYS_BACK = 90

# The window runs from DAYS_BACK days ago up to and including yesterday.
data_date_start, data_date_end = (
    date.today() - timedelta(days=offset) for offset in (DAYS_BACK, 1)
)

# Same bounds as 'YYYY-MM-DD' strings.
start_date = data_date_start.isoformat()
end_date = data_date_end.isoformat()

每日分维度的基础指标的 SQL。**注意：维度如果包含 NULL 值，pandas 聚合结果就不准确。**

In [83]:
# SQL template for the raw daily metrics, one row per install_date and
# dimension combination.
#
# The install_date window is given by the {start_date} / {end_date}
# placeholders, which are filled with str.format() before the query is
# submitted. The template must therefore not contain any other curly braces.
# NULL dimension values are replaced by 'unknown' in the query itself, and the
# *_revenue columns are divided by 100 before being summed.
sql = """
set tqs.query.engine.type=sparkcli;

select
  install_date
  ,coalesce(app_name,'unknown') as app_name
  ,coalesce(os,'unknown') as os
  ,case when channel_user_name in ('toutiao_promote','toutiaodsp_new') then '内广'
    when channel_user_name in ('store_appstore','AppStore') then 'Apple Store'
    when channel_user_name in ('googleadwords_int','store_google','google','Facebook') then 'Google-FB'
    when channel_user_name in ('huawei_id','huaweipps_id','huaweiywjj_id','xiaomi_id') then '华为小米'
    when channel_user_name = 'oppo_id' then 'Oppo'
    when channel_user_name = 'vivo_id' then 'Vivo'
    when channel_user_name = 'guangdiantong' then '广点通'
    else '其他' end as channel
  ,coalesce(occupation,'unknown') as occupation
  ,coalesce(first_level_name,'unknown') as first_level_name
  ,coalesce(first_milestone_name,'unknown') as first_milestone_name
  ,coalesce(edu,'unknown') as edu
  ,coalesce(age,'unknown') as age
  ,coalesce(city_level,'unknown') as city_level
  ,coalesce(career,'unknown') as career
  ,sum(dnu_did) as dnu
  ,sum(d0_copy_wx) as d0_copy_wx
  ,sum(d3_copy_wx) as d3_copy_wx
  ,sum(d5_copy_wx) as d5_copy_wx
  ,sum(d8_copy_wx) as d8_copy_wx
  ,sum(d0_enter_camp) as d0_enter_camp
  ,sum(d3_enter_camp) as d3_enter_camp
  ,sum(d5_enter_camp) as d5_enter_camp
  ,sum(d8_enter_camp) as d8_enter_camp
  ,sum(d0_entered_wechat_group) as d0_enter_group
  ,sum(d3_entered_wechat_group) as d3_enter_group
  ,sum(d5_entered_wechat_group) as d5_enter_group
  ,sum(d8_entered_wechat_group) as d8_enter_group
  ,sum(d0_study_10_mins) as d0_study
  ,sum(d3_study_10_mins) as d3_study
  ,sum(d5_study_10_mins) as d5_study
  ,sum(d8_study_10_mins) as d8_study
  ,sum(d0_in_wechat_group_total_cnt) as d0_group_order
  ,sum(d3_in_wechat_group_total_cnt) as d3_group_order
  ,sum(d5_in_wechat_group_total_cnt) as d5_group_order
  ,sum(d8_in_wechat_group_total_cnt) as d8_group_order
  ,sum(d0_in_wechat_group_order_revenue/100) as d0_group_revenue
  ,sum(d3_in_wechat_group_order_revenue/100) as d3_group_revenue
  ,sum(d5_in_wechat_group_order_revenue/100) as d5_group_revenue
  ,sum(d8_in_wechat_group_order_revenue/100) as d8_group_revenue
  ,sum(d0_out_wechat_group_total_cnt) as d0_out_group_order
  ,sum(d3_out_wechat_group_total_cnt) as d3_out_group_order
  ,sum(d5_out_wechat_group_total_cnt) as d5_out_group_order
  ,sum(d8_out_wechat_group_total_cnt) as d8_out_group_order
  ,sum(d0_out_wechat_group_order_revenue/100) as d0_out_group_revenue
  ,sum(d3_out_wechat_group_order_revenue/100) as d3_out_group_revenue
  ,sum(d5_out_wechat_group_order_revenue/100) as d5_out_group_revenue
  ,sum(d8_out_wechat_group_order_revenue/100) as d8_out_group_revenue
from dm_ky.app_newer_roi_df
where date = date_format(date_add(current_date,-1),'yyyyMMdd')
  and install_date between '{start_date}' and '{end_date}'
group by install_date
  ,coalesce(app_name,'unknown')
  ,coalesce(os,'unknown')
  ,case when channel_user_name in ('toutiao_promote','toutiaodsp_new') then '内广'
    when channel_user_name in ('store_appstore','AppStore') then 'Apple Store'
    when channel_user_name in ('googleadwords_int','store_google','google','Facebook') then 'Google-FB'
    when channel_user_name in ('huawei_id','huaweipps_id','huaweiywjj_id','xiaomi_id') then '华为小米'
    when channel_user_name = 'oppo_id' then 'Oppo'
    when channel_user_name = 'vivo_id' then 'Vivo'
    when channel_user_name = 'guangdiantong' then '广点通'
    else '其他' end
  ,coalesce(occupation,'unknown')
  ,coalesce(first_level_name,'unknown')
  ,coalesce(first_milestone_name,'unknown')
  ,coalesce(edu,'unknown')
  ,coalesce(age,'unknown')
  ,coalesce(city_level,'unknown')
  ,coalesce(career,'unknown')
"""

后续按不同日期和维度在 Python 内聚合，提前写好聚合函数。

In [91]:
def custom_aggregate(x):
    """Collapse the rows of one group into summed counters and derived ratios.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group. It must contain the column ``dnu`` and, for
        each horizon prefix ``d0``/``d3``/``d5``/``d8``, the columns
        ``<d>_copy_wx``, ``<d>_enter_camp``, ``<d>_enter_group``,
        ``<d>_study``, ``<d>_group_order``, ``<d>_group_revenue``,
        ``<d>_out_group_order`` and ``<d>_out_group_revenue``.

    Returns
    -------
    pandas.Series
        ``dnu``, then the summed counters (grouped by counter, then horizon),
        then the derived ratios (grouped by ratio, then horizon). A ratio
        whose denominator sums to zero is reported as 0.
    """
    horizons = ('d0', 'd3', 'd5', 'd8')
    counters = (
        'copy_wx', 'enter_camp', 'enter_group', 'study',
        'group_order', 'group_revenue',
        'out_group_order', 'out_group_revenue',
    )
    # (output suffix, numerator counter, denominator). The denominator is
    # either another counter, 'dnu', or 'dnu_outside_group', meaning dnu minus
    # the enter_group count of the same horizon.
    ratios = (
        ('copy_to_dnu', 'copy_wx', 'dnu'),
        ('camp_to_dnu', 'enter_camp', 'dnu'),
        ('group_to_dnu', 'enter_group', 'dnu'),
        ('camp_to_copy', 'enter_camp', 'copy_wx'),
        ('group_to_camp', 'enter_group', 'enter_camp'),
        ('rev_per_group', 'group_revenue', 'enter_group'),
        ('rev_per_dnu', 'group_revenue', 'dnu'),
        ('order_per_group', 'group_order', 'enter_group'),
        ('order_per_dnu', 'group_order', 'dnu'),
        ('off_group_order_per_user', 'out_group_order', 'dnu_outside_group'),
        ('off_group_rev_per_user', 'out_group_revenue', 'dnu_outside_group'),
    )

    def ratio(numerator, denominator):
        # One rule for every ratio: an empty denominator means "no rate", which
        # is reported as 0 instead of letting inf/NaN leak into later sums.
        return 0 if denominator == 0 else numerator / denominator

    # Sum every input column exactly once; the ratios below reuse these totals.
    dnu = x['dnu'].sum()
    s = {'dnu': dnu}
    for counter in counters:
        for h in horizons:
            s[f'{h}_{counter}'] = x[f'{h}_{counter}'].sum()

    for suffix, numerator, denominator in ratios:
        for h in horizons:
            if denominator == 'dnu':
                den_total = dnu
            elif denominator == 'dnu_outside_group':
                den_total = dnu - s[f'{h}_enter_group']
            else:
                den_total = s[f'{h}_{denominator}']
            s[f'{h}_{suffix}'] = ratio(s[f'{h}_{numerator}'], den_total)

    # Insertion order of the dict defines the order of the result's index.
    return pd.Series(s, index=list(s))


def custom_results(df, dim_str_list, date_range_list):
    """Aggregate ``df`` by dimension within a date window and add share columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows with an ``install_date`` column plus every column that
        ``custom_aggregate`` reads.
    dim_str_list : str or list of str
        Column name(s) handed to ``DataFrame.groupby``.
    date_range_list : sequence of two values
        ``[first, last]`` bounds, unpacked into ``Series.between`` on
        ``install_date``.
        NOTE(review): the notebook passes 'YYYY-MM-DD' strings, so this
        presumably compares strings lexicographically; confirm the dtype of
        ``install_date``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the ``custom_aggregate`` columns, followed by
        ``dnu_sum`` and ``<d>_{copy,camp,group}_sum`` (totals over all groups,
        repeated on every row) and ``dnu_ratio`` and
        ``<d>_{copy,camp,group}_ratio`` (the group's share of that total, 0
        when the total is 0).
    """
    horizons = ('d0', 'd3', 'd5', 'd8')
    # (short name used in the output columns, aggregated source column)
    shared_counters = (
        ('copy', 'copy_wx'),
        ('camp', 'enter_camp'),
        ('group', 'enter_group'),
    )

    in_window = df['install_date'].between(*date_range_list)
    temp = df[in_window].groupby(dim_str_list).apply(custom_aggregate).reset_index()

    # Totals across all groups, broadcast to every row.
    temp['dnu_sum'] = temp['dnu'].sum()
    for short, column in shared_counters:
        for h in horizons:
            temp[f'{h}_{short}_sum'] = temp[f'{h}_{column}'].sum()

    def share(part, total):
        # A zero total gives share 0 rather than NaN/inf.
        return np.where(total == 0, 0, part / total)

    # dnu_ratio uses the same zero guard as every other share.
    temp['dnu_ratio'] = share(temp['dnu'], temp['dnu_sum'])
    for short, column in shared_counters:
        for h in horizons:
            temp[f'{h}_{short}_ratio'] = share(temp[f'{h}_{column}'], temp[f'{h}_{short}_sum'])
    return temp

开始查询 DB。

In [84]:
# generate sql: fill the install_date window into the query template
sql_w_param = sql.format(**{'start_date': start_date, 'end_date': end_date})

# refresh client
# Credentials are read from the environment so that no secret is stored in the
# notebook: set TQS_APP_ID and TQS_APP_KEY (and optionally TQS_USER_NAME)
# before starting the kernel.
# NOTE(review): the app key that used to be hard-coded in this cell should be
# treated as leaked and rotated.
missing_env = [name for name in ('TQS_APP_ID', 'TQS_APP_KEY') if not os.environ.get(name)]
if missing_env:
    raise RuntimeError(
        'Missing environment variable(s): ' + ', '.join(missing_env)
    )
app_id = os.environ['TQS_APP_ID']
app_key = os.environ['TQS_APP_KEY']
user_name = os.environ.get('TQS_USER_NAME', 'wufei.97')

# run sql
client = bytedtqs.TQSClient(app_id=app_id, app_key=app_key)
hive_job = client.execute_query(user_name=user_name, query=sql_w_param)

# result file url
hive_results = hive_job.get_result().result_url

[2020-11-09 19:45:41,448] - [INFO] - job submitted, job_id: 151578212
[2020-11-09 19:45:41,472] - [INFO] - job_id: 151578212, engine_type: Hive, status: Created
[2020-11-09 19:45:43,502] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Scheduled
[2020-11-09 19:45:45,527] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:47,560] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:49,587] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:51,626] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:53,664] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:55,685] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:57,709] - [INFO] - job_id: 151578212, engine_type: SparkCli, status: Processing
[2020-11-09 19:45:59,736] - [INFO] - job_id: 151578212, 

In [85]:
# Read the result file from its URL and keep only the rows of the configured app.
# NOTE(review): the original comment here ("if reading raises an error, it may
# be ...") was left unfinished; the suspected cause is not recorded.
df_raw = pd.read_csv(hive_results).loc[lambda frame: frame['app_name'] == APP_NAME]

Shapley Value 分析算法与结果展示。

In [96]:
class ShapAnalysis:
    """Attribute the change of a metric between two date windows to players.

    A player is a pair (dimension combination, variable). Starting from the
    base-window table, players are switched to their current-window value one
    at a time along randomly sampled orderings; the change of the metric caused
    by each switch is accumulated per player (a sampled Shapley value).

    Call order: ``process_data()`` -> ``analysis()`` -> ``process_results()``
    -> ``display_contribution()``.
    """

    def __init__(self, df, date_base, date_curr, dim, var, metric, weight, func):
        """Store the configuration of one analysis task.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw data handed to ``custom_results``.
        date_base, date_curr : sequence of two values
            ``[first, last]`` bounds of the base and the current window.
        dim : list of str
            Dimension columns; their combinations form the groups.
        var : list of str
            Columns of the ``custom_results`` output that ``func`` reads.
        metric : str
            Name of the metric; stored only, not used in any computation here.
        weight : str
            Column of the ``custom_results`` output shown as group weight.
        func : callable
            Maps a frame holding the ``var`` columns to the scalar metric.
        """
        self.df = df
        self.date_base = date_base
        self.date_curr = date_curr
        self.dim = dim
        self.var = var
        self.metric = metric
        self.weight = weight
        self.func = func
        # placeholders, filled by process_results()
        self.metric_base = 0
        self.metric_curr = 0
        self.metric_delta = 0

    @staticmethod
    def _hide_index(styler):
        """Hide the index of a Styler on both old and new pandas versions."""
        # Styler.hide_index() was removed in pandas 2.0 in favour of hide(axis='index').
        if hasattr(styler, 'hide'):
            return styler.hide(axis='index')
        return styler.hide_index()

    def _pad_missing_dims(self, frame):
        """Return ``frame`` with an all-zero row for every dimension value it lacks."""
        present = set(frame[self.new_dim_col])
        filler = [
            {self.new_dim_col: d, **{v: 0 for v in self.var}}
            for d in self.dim_uniq
            if d not in present
        ]
        if not filler:
            return frame
        # A single concat replaces the row-by-row DataFrame.append, which was
        # removed in pandas 2.0.
        return pd.concat([frame, pd.DataFrame(filler)], ignore_index=True)

    def process_data(self):
        """Build the base and current tables, one row per dimension combination.

        Sets ``new_dim_col``, ``dim_uniq``, ``df_base`` and ``df_curr``.
        Relies on ``custom_results`` being defined in the notebook.
        """
        # generate df
        df_base = custom_results(self.df, self.dim, self.date_base)
        df_curr = custom_results(self.df, self.dim, self.date_curr)

        # fill NA
        df_base[self.dim] = df_base[self.dim].astype(str).fillna('_')
        df_curr[self.dim] = df_curr[self.dim].astype(str).fillna('_')
        df_base[self.var] = df_base[self.var].fillna(0)
        df_curr[self.var] = df_curr[self.var].fillna(0)

        # select required columns; copy so that the assignments below write to
        # an independent frame rather than to a slice of the original
        df_base = df_base[self.dim + self.var].copy()
        df_curr = df_curr[self.dim + self.var].copy()

        # combine dimensions into a tuple
        self.new_dim_col = '_dim'
        df_base[self.new_dim_col] = df_base[self.dim].apply(tuple, axis=1)
        df_curr[self.new_dim_col] = df_curr[self.dim].apply(tuple, axis=1)

        # drop old dim cols
        df_base, df_curr = df_base.drop(self.dim, axis=1), df_curr.drop(self.dim, axis=1)

        # find the set of all dim values
        self.dim_uniq = pd.concat([df_base[self.new_dim_col], df_curr[self.new_dim_col]]).unique()

        # make sure both dataframes have records for all dim values
        self.df_base = self._pad_missing_dims(df_base)
        self.df_curr = self._pad_missing_dims(df_curr)

    def analysis(self, sample_size=2):
        """Accumulate marginal contributions over sampled player orderings.

        Parameters
        ----------
        sample_size : int
            Number of random orderings to draw, capped at ``N!`` for ``N``
            players. Orderings are drawn with replacement, so duplicates are
            possible.

        Sets ``self.phi``: ``(dim index, var index) -> summed marginal change``.
        """
        # players: dim x variable
        players = [(i, j) for i in range(len(self.dim_uniq)) for j in range(len(self.var))]

        # sample
        N = len(players)
        # scipy's factorial returns a float (inf for large N); cast back to int
        # so that range() accepts the value when N! is below sample_size.
        sample_size = int(min(sample_size, factorial(N)))
        # A private generator gives the same orderings as seeding the global
        # one with 666, without disturbing other users of the random module.
        rng = random.Random(666)
        seq_list = list()
        for _ in range(sample_size):
            seq = list(range(N))
            rng.shuffle(seq)
            seq_list.append(seq)

        # Row masks per dimension value. They do not change between orderings,
        # so they are computed once. Keys are tuples and are compared one by one.
        dim_col = self.new_dim_col

        def rows_of(frame, dim_value):
            return frame[dim_col].map(lambda key: key == dim_value).to_numpy(dtype=bool)

        base_rows = [rows_of(self.df_base, d) for d in self.dim_uniq]
        curr_rows = [rows_of(self.df_curr, d) for d in self.dim_uniq]

        self.phi = dict()

        # reuse the same set of sequences for all players
        for seq in seq_list:
            # start every ordering from a fresh copy of the base table
            df_s = self.df_base.copy()
            # current utility
            v_current = self.func(df_s)

            for i in range(N):
                # select player p and its dim / variable
                p = players[seq[i]]
                dim_idx, v = p[0], self.var[p[1]]
                # switch this player's cell from the base to the current value
                df_s.loc[base_rows[dim_idx], v] = \
                    self.df_curr.loc[curr_rows[dim_idx], v].values
                # calculate marginal utility
                v_si = self.func(df_s)
                phi_i = v_si - v_current
                # update current utility
                v_current = v_si

                # add utility for player p
                self.phi[p] = self.phi.get(p, 0) + phi_i

    def process_results(self):
        """Compute the overall metric change and rescale contributions to it.

        Sets ``metric_base``, ``metric_curr``, ``metric_delta`` and ``con``, a
        list of records with the keys '维度', '变量' and '贡献'.
        """
        # overall metrics
        self.metric_base, self.metric_curr = \
            self.func(self.df_base), self.func(self.df_curr)
        self.metric_delta = self.metric_curr - self.metric_base

        # standardize (because of sampling): rescale so that the contributions
        # sum to metric_delta; if they sum to zero, report zeros instead of
        # dividing by zero
        phi_total = sum(self.phi.values())
        if phi_total == 0:
            phi_std = {k: 0.0 for k in self.phi}
        else:
            phi_std = {k: 1.0*self.metric_delta*v/phi_total for k, v in self.phi.items()}

        # save contribution of each player
        self.con = \
        [{'维度':self.dim_uniq[k[0]],
          '变量':self.var[k[1]],
          '贡献': v,
         } for k, v in phi_std.items()]

    def display_contribution(self):
        """Render the contribution tables: per variable, then per dimension.

        Relies on ``custom_results`` and IPython's ``display`` being available
        in the notebook.
        """
        # contribution by dim combination
        con_by_dim = pd.DataFrame(self.con)

        # contribution by var
        con_by_var = con_by_dim.\
            groupby('变量')['贡献'].sum().reset_index()
        con_by_var['贡献权重'] = con_by_var['贡献']/self.metric_delta
        print("每个变量的整体贡献:")
        display(
            self._hide_index(con_by_var.style).\
            background_gradient(
                subset=pd.IndexSlice[:,['贡献权重']],
                cmap='viridis',
            ).\
            format({
                '贡献':'{:.4f}',
                '贡献权重':'{:.2%}',
            })
        )

        # split dim tuple into separate dims
        df_con_split = pd.concat(
            [
                pd.DataFrame(
                    con_by_dim['维度'].tolist(),
                    columns=self.dim
                ),
                con_by_dim
            ],
            axis=1,
        )
        print("每个变量贡献，按不同维度分解:")

        for d in self.dim:
            print(f"\n{'-'*10} 变量: {d}")
            # aggregate by d alone to get per-value base/current figures and weights
            df_base_d = custom_results(self.df, d, self.date_base)
            df_curr_d = custom_results(self.df, d, self.date_curr)

            for v in self.var:
                total_v = con_by_var[lambda x: x['变量']==v]['贡献'].values[0]
                print(f"{'-'*2} 指标: {v}. 累计贡献 {total_v :.4f}")
                df_con_grouped = \
                    df_con_split[lambda x: x['变量']==v].\
                    groupby(d)[['贡献']].sum().\
                    sort_values(by='贡献', ascending=False).reset_index().\
                    merge(df_curr_d[[d, self.weight]], on=[d]).rename(columns={self.weight: '群体权重'}).\
                    merge(df_base_d[[d, v]], on=[d]).rename(columns={v: '基期'}).\
                    merge(df_curr_d[[d, v]], on=[d]).rename(columns={v: '现期'})
                df_con_grouped['贡献权重'] = df_con_grouped['贡献']/df_con_grouped['贡献'].sum()
                df_con_grouped['群体权重'] = df_con_grouped['群体权重']/df_con_grouped['群体权重'].sum()
                df_con_grouped['重要度'] = np.abs(df_con_grouped['贡献权重'])/df_con_grouped['群体权重']
                # add threshold: groups below 5% weight get importance 0
                df_con_grouped['重要度'] = np.where(df_con_grouped['群体权重'] >= 0.05, df_con_grouped['重要度'], 0)
                display(
                    self._hide_index(
                        df_con_grouped[[d, '基期','现期','贡献','贡献权重','群体权重','重要度']].style
                    ).\
                        background_gradient(
                            subset=pd.IndexSlice[:,['贡献权重','群体权重','重要度']],
                            cmap='viridis',
                    ).\
                        format({
                        '基期':'{:.2f}',
                        '现期':'{:.2f}',
                        '贡献':'{:.4f}',
                        '贡献权重':'{:.2%}',
                        '群体权重':'{:.2%}',
                        '重要度':'{:.4f}'
                    })
                )

## 分析过程

每项配置对应一个分析任务。配置说明：

- date_base 参照数据起止日期
- date_curr 当前数据起止日期
- dim 分析维度
- var 用于计算指标的变量
- metric 指标名称，可以随便取
- weight 权重变量，通常选取用户数或用户占比
- func 从 var 计算 metric 的 Python 函数

配置可自行增改。

In [109]:
## config
# Each dict below holds the keyword arguments of one ShapAnalysis task
# (everything except the raw dataframe).


# D0: share of new users who entered the group
# NOTE(review): '2020-11-0' is not a valid date and sorts after the end bound
# '2020-10-16', so the base window presumably matches no rows; confirm the
# intended start date.
# NOTE(review): the raw-data SQL in this notebook selects no 'role' column
# (it has 'occupation'); confirm the dimension name.
d0_group_to_dnu = {
    'date_base': ['2020-11-0', '2020-10-16'],
    'date_curr': ['2020-10-17', '2020-10-19'],
    'dim': ['os', 'role', 'channel'],
    'var': ['dnu_ratio', 'd0_camp_to_dnu', 'd0_group_to_camp'],
    'metric': 'd0_group_to_dnu',
    'weight': 'dnu_ratio',
    'func': lambda x: sum(x['dnu_ratio']*x['d0_group_to_camp']*x['d0_camp_to_dnu']),
}

# In-group conversion
d3_order_per_group = {
    'date_base': ['2020-10-29', '2020-10-29'],
    'date_curr': ['2020-11-05', '2020-11-05'],
    'dim': ['os', 'occupation', 'channel', 'first_milestone_name', 'city_level', 'age'],
    'var': ['d3_group_ratio','d3_order_per_group'],
    'metric': 'd3_order_per_group',
    'weight': 'd3_group_ratio',
    'func': lambda x: sum(x['d3_group_ratio']*x['d3_order_per_group']),
}

# In-group revenue per new user
d3_group_rev_per_dnu = {
    'date_base': ['2020-10-29', '2020-10-29'],
    'date_curr': ['2020-11-05', '2020-11-05'],
    'dim': ['os', 'occupation', 'channel', 'first_milestone_name'],
    'var': ['dnu_ratio', 'd3_group_to_dnu', 'd3_rev_per_group'],
    'metric': 'd3_group_rev_per_dnu',
    'weight': 'dnu_ratio',
    'func': lambda x: sum(x['dnu_ratio']*x['d3_group_to_dnu']*x['d3_rev_per_group']),
}

# Total revenue per new user (including revenue outside the group)
# NOTE(review): custom_aggregate in this notebook only produces d0/d3/d5/d8
# columns, so the d6_* variables are not available; 'role' and 'ez_registered'
# are not selected by the raw-data SQL either. This config presumably targets
# an earlier data schema; confirm before enabling it.
d6_rev_per_dnu = {
    'date_base': ['2020-10-08', '2020-10-14'],
    'date_curr': ['2020-10-15', '2020-10-21'],
    'dim': ['os', 'role', 'channel', 'age', 'ez_registered'],
    'var': ['dnu_ratio', 'd6_group_to_dnu', 'd6_rev_per_group','d6_off_group_rev_per_user'],
    'metric': 'd6_rev_per_dnu',
    'weight': 'dnu_ratio',
    'func': lambda x: sum(x['dnu_ratio']*x['d6_group_to_dnu']*x['d6_rev_per_group'] +
                          x['dnu_ratio']*(1-x['d6_group_to_dnu'])*x['d6_off_group_rev_per_user']),
}

In [110]:
# Run the selected analyses one after another.
for conf in [
    # Add the configurations to analyse here. Names that were listed as
    # commented-out candidates:
    #   d0_camp_to_dnu, d0_group_to_dnu, d6_order_per_group,
    #   d6_group_rev_per_dnu, d3_group_rev_per_dnu
    # NOTE(review): d0_camp_to_dnu, d6_order_per_group and d6_group_rev_per_dnu
    # are not defined in the config cell of this notebook.
    d3_order_per_group,
]:
    print("-"*100)

    # Full pipeline: prepare tables, sample orderings, rescale contributions.
    s = ShapAnalysis(df_raw, **conf)
    s.process_data()
    s.analysis(sample_size=20)
    s.process_results()

    # Two-line summary: the compared windows, then the metric before/after.
    summary_lines = [
        f"从 {conf['date_base']} 到 {conf['date_curr']}",
        f"指标 {conf['metric']} {s.metric_base: .4f} to {s.metric_curr: .4f} 变化: {s.metric_delta :.4f}",
    ]
    print("\n".join(summary_lines))
    s.display_contribution()

----------------------------------------------------------------------------------------------------
从 ['2020-10-29', '2020-10-29'] 到 ['2020-11-05', '2020-11-05']
指标 d3_order_per_group  0.0390 to  0.0198 变化: -0.0192
每个变量的整体贡献:


变量,贡献,贡献权重
d3_group_ratio,-0.0079,41.06%
d3_order_per_group,-0.0113,58.94%


每个变量贡献，按不同维度分解:

---------- 变量: os
-- 指标: d3_group_ratio. 累计贡献 -0.0079


os,基期,现期,贡献,贡献权重,群体权重,重要度
android,0.7,0.84,-0.0038,47.78%,83.83%,0.57
ios,0.3,0.16,-0.0041,52.22%,16.17%,3.2291


-- 指标: d3_order_per_group. 累计贡献 -0.0113


os,基期,现期,贡献,贡献权重,群体权重,重要度
android,0.03,0.02,0.0008,-6.79%,83.83%,0.081
ios,0.07,0.02,-0.0121,106.79%,16.17%,6.6037



---------- 变量: occupation
-- 指标: d3_group_ratio. 累计贡献 -0.0079


occupation,基期,现期,贡献,贡献权重,群体权重,重要度
unknown,0.01,0.0,0.0,-0.00%,0.50%,0.0
中小学生,0.0,0.0,0.0,-0.00%,0.00%,0.0
大学生,0.1,0.13,-0.0007,8.84%,12.71%,0.6954
老用户,0.06,0.05,-0.0018,22.33%,5.45%,4.0998
白领上班族,0.63,0.6,-0.0018,23.26%,60.40%,0.3852
自由职业者,0.2,0.21,-0.0036,45.57%,20.96%,2.1746


-- 指标: d3_order_per_group. 累计贡献 -0.0113


occupation,基期,现期,贡献,贡献权重,群体权重,重要度
unknown,0.0,0.0,0.0,-0.00%,0.50%,0.0
中小学生,0.0,0.0,0.0,-0.00%,0.00%,0.0
大学生,0.01,0.0,-0.0007,6.15%,12.71%,0.4844
老用户,0.09,0.03,-0.0022,19.10%,5.45%,3.508
白领上班族,0.03,0.02,-0.0033,29.42%,60.40%,0.4871
自由职业者,0.08,0.03,-0.0051,45.32%,20.96%,2.1627



---------- 变量: channel
-- 指标: d3_group_ratio. 累计贡献 -0.0079


channel,基期,现期,贡献,贡献权重,群体权重,重要度
Apple Store,0.06,0.04,0.0012,-14.95%,4.46%,0.0
华为小米,0.09,0.08,0.0009,-11.06%,7.76%,1.4255
Google-FB,0.0,0.0,0.0,-0.00%,0.00%,0.0
Vivo,0.05,0.06,0.0,-0.00%,6.44%,0.0
其他,0.02,0.01,0.0,-0.00%,0.83%,0.0
Oppo,0.02,0.01,-0.0006,7.07%,1.16%,0.0
广点通,0.05,0.05,-0.0008,10.60%,4.95%,0.0
内广,0.71,0.74,-0.0085,108.33%,74.42%,1.4556


-- 指标: d3_order_per_group. 累计贡献 -0.0113


channel,基期,现期,贡献,贡献权重,群体权重,重要度
华为小米,0.02,0.04,0.001,-9.16%,7.76%,1.1809
Google-FB,0.0,0.0,0.0,-0.00%,0.00%,0.0
Vivo,0.0,0.0,0.0,-0.00%,6.44%,0.0
其他,0.0,0.0,0.0,-0.00%,0.83%,0.0
广点通,0.03,0.0,-0.0006,4.92%,4.95%,0.0
Apple Store,0.04,0.07,-0.0007,5.86%,4.46%,0.0
Oppo,0.07,0.0,-0.0008,7.39%,1.16%,0.0
内广,0.05,0.02,-0.0103,90.99%,74.42%,1.2226



---------- 变量: first_milestone_name
-- 指标: d3_group_ratio. 累计贡献 -0.0079


first_milestone_name,基期,现期,贡献,贡献权重,群体权重,重要度
B1初,0.08,0.11,0.001,-12.08%,10.56%,1.1442
A2高,0.03,0.04,0.0005,-6.31%,4.29%,0.0
A2中,0.05,0.04,0.0003,-3.74%,4.13%,0.0
A2初,0.1,0.09,0.0002,-2.36%,9.41%,0.2513
B1高,0.05,0.04,-0.0002,2.00%,4.46%,0.0
A1高,0.06,0.06,-0.0008,10.60%,6.11%,1.7367
unknown,0.08,0.1,-0.0009,11.49%,9.57%,1.2002
A1初,0.14,0.15,-0.0022,28.35%,15.18%,1.8673
A0,0.28,0.27,-0.0028,35.38%,26.57%,1.3317
A1中,0.12,0.1,-0.0029,36.68%,9.74%,3.7675


-- 指标: d3_order_per_group. 累计贡献 -0.0113


first_milestone_name,基期,现期,贡献,贡献权重,群体权重,重要度
A1中,0.05,0.03,0.0006,-5.48%,9.74%,0.5628
A2中,0.06,0.08,0.0002,-1.94%,4.13%,0.0
unknown,0.02,0.0,-0.0005,4.31%,9.57%,0.4501
A1高,0.02,0.0,-0.0006,4.92%,6.11%,0.8064
A0,0.02,0.01,-0.0009,7.74%,26.57%,0.2912
B1高,0.05,0.04,-0.001,8.64%,4.46%,0.0
A2初,0.04,0.04,-0.0011,9.41%,9.41%,1.0
A2高,0.08,0.04,-0.0016,14.43%,4.29%,0.0
B1初,0.05,0.03,-0.0018,16.18%,10.56%,1.5317
A1初,0.05,0.0,-0.0047,41.80%,15.18%,2.7535



---------- 变量: city_level
-- 指标: d3_group_ratio. 累计贡献 -0.0079


city_level,基期,现期,贡献,贡献权重,群体权重,重要度
一线,0.08,0.06,0.0003,-3.76%,6.11%,0.6152
六线及以下,0.0,0.0,0.0,-0.00%,0.50%,0.0
特区,0.0,0.0,0.0,-0.00%,0.00%,0.0
三线,0.21,0.22,-0.0001,1.24%,22.11%,0.056
新一线,0.23,0.21,-0.0002,3.05%,21.45%,0.1422
五线,0.1,0.12,-0.0009,11.70%,11.72%,0.9988
二线,0.18,0.2,-0.0016,20.86%,20.30%,1.028
unknown,0.02,0.02,-0.0019,23.76%,2.15%,0.0
四线,0.18,0.16,-0.0034,43.14%,15.68%,2.7518


-- 指标: d3_order_per_group. 累计贡献 -0.0113


city_level,基期,现期,贡献,贡献权重,群体权重,重要度
六线及以下,0.0,0.0,0.0,-0.00%,0.50%,0.0
特区,0.0,0.0,0.0,-0.00%,0.00%,0.0
新一线,0.03,0.03,-0.0001,1.08%,21.45%,0.0505
四线,0.03,0.01,-0.0005,4.61%,15.68%,0.2939
三线,0.02,0.01,-0.0008,6.90%,22.11%,0.3119
五线,0.04,0.01,-0.0016,14.19%,11.72%,1.2114
unknown,0.21,0.0,-0.0023,20.38%,2.15%,0.0
一线,0.06,0.03,-0.0028,24.96%,6.11%,4.0881
二线,0.05,0.02,-0.0032,27.88%,20.30%,1.3736



---------- 变量: age
-- 指标: d3_group_ratio. 累计贡献 -0.0079


age,基期,现期,贡献,贡献权重,群体权重,重要度
24-30,0.18,0.33,0.001,-12.97%,32.84%,0.3948
18-23,0.04,0.11,0.0002,-3.01%,11.22%,0.268
-18,0.0,0.0,0.0,-0.00%,0.00%,0.0
50-,0.0,0.0,0.0,-0.00%,0.00%,0.0
unknown,0.17,0.13,-0.002,25.18%,13.04%,1.9313
41-50,0.04,0.03,-0.0022,27.39%,3.30%,0.0
31-40,0.56,0.4,-0.005,63.40%,39.60%,1.6009


-- 指标: d3_order_per_group. 累计贡献 -0.0113


age,基期,现期,贡献,贡献权重,群体权重,重要度
18-23,0.03,0.01,0.0,-0.18%,11.22%,0.0161
24-30,0.03,0.02,0.0,-0.07%,32.84%,0.0021
-18,0.0,0.0,0.0,-0.00%,0.00%,0.0
50-,0.0,0.0,0.0,-0.00%,0.00%,0.0
unknown,0.04,0.03,-0.0017,14.84%,13.04%,1.1386
41-50,0.1,0.0,-0.002,17.85%,3.30%,0.0
31-40,0.04,0.02,-0.0076,67.56%,39.60%,1.7059
