In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
import sys
from datetime import date, timedelta

import bytedtqs
from pytqs import tqs
#import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
# from statsmodels.stats.proportion import proportions_ztest
# from statsmodels.stats.proportion import proportions_chisquare
from IPython.display import display, HTML

import logging
logging.basicConfig(level=logging.CRITICAL)
# logging.disable(sys.maxsize)

In [2]:
def two_proprotions_test(success_a, size_a, success_b, size_b):
    """Two-sided pooled two-proportion z-test.

    Parameters
    ----------
    success_a, success_b : number
        Count of successes observed in group A / group B.
    size_a, size_b : number
        Number of observations in group A / group B.

    Returns
    -------
    (zscore, pvalue)
        ``zscore`` is the absolute z statistic of ``prop_b - prop_a`` under the
        pooled-variance null hypothesis; ``pvalue`` is the two-sided p-value.
        Both are NaN when the test is undefined: a group is empty, or the
        pooled variance is not positive (every observation is a success, or
        every observation is a failure).
    """
    # An empty group has no proportion; return NaN instead of dividing by zero.
    if size_a <= 0 or size_b <= 0:
        return np.nan, np.nan

    prop_a = success_a / size_a
    prop_b = success_b / size_b
    prop_pooled = (success_a + success_b) / (size_a + size_b)
    var = prop_pooled * (1 - prop_pooled) * (1 / size_a + 1 / size_b)

    # `not var > 0` also catches a NaN variance, not only zero.
    if not var > 0:
        return np.nan, np.nan

    zscore = np.abs(prop_b - prop_a) / np.sqrt(var)
    # The survival function avoids the cancellation in `1 - cdf(z)`, which
    # collapses to exactly 0.0 for large z.
    pvalue = 2 * stats.norm.sf(zscore)
    return zscore, pvalue

In [3]:
# Load the exported query result; the file is GB18030-encoded.
df = pd.read_csv(
    filepath_or_buffer='/home/tiger/archived-data/aeolus-data/20210122/16/36709877-课程实验-等级测试算法接管-查询39.csv',
    encoding='gb18030',
)

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['abtest_dt', 'user_group', 'user_id', 'source', 'enter_camp_user',
       'enter_group_user', 'd6_vip_user', 'start_user', 'end_user',
       'new_end_user', 'mile_stone_name', 'labels', 'ttl_exe',
       'd0_finish_lessons', 'd0_study_duration', 'd1_study_duration',
       'd2_study_duration', 'd3_study_duration', 'd4_study_duration',
       'd5_study_duration', 'd6_study_duration'],
      dtype='object')

In [5]:
# Row count per placement-test level (mile_stone_name).
df.groupby('mile_stone_name').size()

mile_stone_name
A0     12781
A1中     6060
A1初     6769
A1高     4825
A2中     4320
A2初     6003
A2高     3440
B1中     2915
B1初     4719
B1高     2383
B2      8073
C1        36
C2         3
dtype: int64

In [51]:
# df.groupby(['mile_stone_name', 'user_group']).size()

In [52]:
# df_data = df[(df.mile_stone_name.isna()) & (df.source.isna())]
# df_data.head()

In [53]:
# df[df.end_user == 1].groupby(['mile_stone_name', 'user_group']).size()

In [54]:
### Preprocess data

# 1. Fill missing values
df['mile_stone_name'] = df['mile_stone_name'].fillna('未完成定级测试')
df = df.fillna(0)

# 2. Add derived columns
# For each of d0..d4, flag whether the study duration exceeded 600 (presumably
# seconds, i.e. 10 minutes; confirm against the query definition).
# The comparison is evaluated for every row, so a duration of exactly 600 gets
# 0.0 instead of being left as NaN. The threshold stays strictly "> 600", which
# matches the "(>10m)" metric labels used in the summary tables.
for day in range(5):
    df['d{}_study_10min'.format(day)] = (
        df['d{}_study_duration'.format(day)] > 600
    ).astype(float)

# 3. d6数据
# df = df.loc[df.abtest_dt <= (date.today() - timedelta(days=6)).strftime('%Y-%m-%d')]

#### 全量用户 

In [55]:
def _proportion_row(metric, success_t, size_t, success_c, size_c, alpha):
    """Build one summary-table row comparing a proportion between test and control.

    Rates are rounded to 4 decimals before the relative delta is computed, so the
    delta agrees with the percentages shown in the table. When either group is
    empty the rates and p-value are NaN and the row is "Non-significant".
    """
    if size_t > 0 and size_c > 0:
        rate_t = round(success_t / size_t, 4)
        rate_c = round(success_c / size_c, 4)
        p_value = two_proprotions_test(success_t, size_t, success_c, size_c)[1]
    else:
        # Empty cohort: nothing to compare, and calling the z-test would divide by zero.
        rate_t = rate_c = p_value = np.nan

    delta = rate_t / rate_c - 1 if rate_c > 0 else np.nan
    return {
        "Metrics": metric,
        "Control": "{:.2%}".format(rate_c),
        "Test": "{:.2%}".format(rate_t),
        "Delta": "{:.2%}".format(delta),
        "p_value": round(p_value, 4),
        "Significance": "Significant" if p_value < alpha else "Non-significant",
    }


def ab_summary_data(data, group_name, as_of=None, alpha=0.1):
    """Summarise test-vs-control funnel metrics for one user segment.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to analyse. The function reads the columns ``user_group``,
        ``abtest_dt``, ``end_user``, ``new_end_user``, ``enter_camp_user``,
        ``enter_group_user``, ``d6_vip_user`` and ``d0_study_10min`` ..
        ``d4_study_10min``. ``abtest_dt`` is compared against ``'%Y-%m-%d'``
        strings.
    group_name : str
        Label written into the ``User_Group`` column of every output row.
    as_of : datetime.date, optional
        Reference date for the maturity windows. Defaults to ``date.today()``;
        pass a fixed date to make a run reproducible.
    alpha : float, optional
        A metric is labelled "Significant" when its p-value is below this
        threshold. Defaults to 0.1.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``User_Group``, ``Metrics``,
        ``Control``, ``Test``, ``Delta``, ``p_value`` and ``Significance``.
    """
    columns = ["User_Group", "Metrics", "Control", "Test", "Delta", "p_value", "Significance"]
    if as_of is None:
        as_of = date.today()

    is_test = data.user_group == '实验组'
    is_ctrl = data.user_group == '对照组'

    def _matured(min_age_days, extra_mask=None):
        """Return (test, control) rows whose abtest_dt is at least `min_age_days` before `as_of`."""
        cutoff = (as_of - timedelta(days=min_age_days)).strftime('%Y-%m-%d')
        mask = data.abtest_dt <= cutoff
        if extra_mask is not None:
            mask = mask & extra_mask
        return data.loc[is_test & mask], data.loc[is_ctrl & mask]

    test_data = data.loc[is_test]
    ctrl_data = data.loc[is_ctrl]
    dnu_t = len(test_data)
    dnu_c = len(ctrl_data)

    # Rows are collected in a list and turned into a frame once at the end.
    # Total new users: counts only, no test.
    rows = [{
        "Metrics": '总新增用户',
        "Control": '{:,}'.format(dnu_c),
        "Test": '{:,}'.format(dnu_t),
        "Delta": '-',
        "p_value": '-',
        "Significance": '-',
    }]

    # d0 metrics use every row of each group as the denominator.
    d0_metrics = [
        # d0 placement-test completion rate
        ('d0测试完成率', lambda g: g.end_user.sum()),
        # d0 completion rate including partial completion
        ('d0测试完成率(+部分完成)', lambda g: g.end_user.sum() + g.new_end_user.sum()),
        # d0 camp sign-up rate
        ('d0新增到报名', lambda g: g.enter_camp_user.sum()),
        # d0 group-join rate
        ('d0新增到入群', lambda g: g.enter_group_user.sum()),
        # d0 effective-study rate
        ('d0有效学习率(>10m)', lambda g: g.d0_study_10min.sum()),
    ]
    for metric, count in d0_metrics:
        rows.append(_proportion_row(metric, count(test_data), dnu_t, count(ctrl_data), dnu_c, alpha))

    # dN effective-study rate (N = 1..4): only rows dated at least N + 1 days
    # before `as_of` are included.
    for day in range(1, 5):
        test_tmp, ctrl_tmp = _matured(day + 1)
        flag_col = 'd{}_study_10min'.format(day)
        rows.append(_proportion_row(
            'd{}有效学习率(>10m)'.format(day),
            test_tmp[flag_col].sum(), len(test_tmp),
            ctrl_tmp[flag_col].sum(), len(ctrl_tmp),
            alpha,
        ))

    # d6 conversion over all rows dated at least 7 days before `as_of`.
    test_tmp, ctrl_tmp = _matured(7)
    rows.append(_proportion_row(
        'd6新增到转化',
        test_tmp.d6_vip_user.sum(), len(test_tmp),
        ctrl_tmp.d6_vip_user.sum(), len(ctrl_tmp),
        alpha,
    ))

    # d6 conversion restricted to rows with enter_group_user == 1.
    test_tmp, ctrl_tmp = _matured(7, data.enter_group_user == 1)
    rows.append(_proportion_row(
        'd6群内转化',
        test_tmp.d6_vip_user.sum(), len(test_tmp),
        ctrl_tmp.d6_vip_user.sum(), len(ctrl_tmp),
        alpha,
    ))

    test_summary = pd.DataFrame(rows, columns=columns)
    test_summary['User_Group'] = group_name

    return test_summary

In [56]:
# Summary over every row of df; cells equal to 'Significant' are shaded green.
output = ab_summary_data(df, '全量用户')
output.style.apply(
    lambda row: ["background: green" if cell == 'Significant' else "" for cell in row],
    axis=1,
).hide_index()

User_Group,Metrics,Control,Test,Delta,p_value,Significance
全量用户,总新增用户,61699,61516,-,-,-
全量用户,d0测试完成率,46.07%,46.25%,0.39%,0.513000,Non-significant
全量用户,d0测试完成率(+部分完成),50.09%,50.08%,-0.02%,0.975700,Non-significant
全量用户,d0新增到报名,33.37%,33.55%,0.54%,0.505700,Non-significant
全量用户,d0新增到入群,14.39%,14.64%,1.74%,0.205400,Non-significant
全量用户,d0有效学习率(>10m),21.25%,21.33%,0.38%,0.749100,Non-significant
全量用户,d1有效学习率(>10m),13.61%,13.98%,2.72%,0.066800,Significant
全量用户,d2有效学习率(>10m),10.45%,10.80%,3.35%,0.059100,Significant
全量用户,d3有效学习率(>10m),8.05%,8.30%,3.11%,0.146200,Non-significant
全量用户,d4有效学习率(>10m),5.80%,6.16%,6.21%,0.019500,Significant


#### 完测用户

In [57]:
# Restrict to rows whose mile_stone_name is one of the listed levels.
group_list = [
    'A1初', 'A1中', 'A1高',
    'A2初', 'A2中', 'A2高',
    'B1初', 'B1中', 'B1高',
    'B2', 'C1', 'C2',
]

df_sub = df.loc[df.mile_stone_name.isin(group_list)]
output = ab_summary_data(df_sub, '完测用户')
output.style.apply(
    lambda row: ["background: green" if cell == 'Significant' else "" for cell in row],
    axis=1,
).hide_index()

User_Group,Metrics,Control,Test,Delta,p_value,Significance
完测用户,总新增用户,24600,24946,-,-,-
完测用户,d0测试完成率,93.21%,93.55%,0.36%,0.125000,Non-significant
完测用户,d0测试完成率(+部分完成),98.88%,98.89%,0.01%,0.936700,Non-significant
完测用户,d0新增到报名,54.84%,54.77%,-0.13%,0.880600,Non-significant
完测用户,d0新增到入群,27.26%,27.29%,0.11%,0.938800,Non-significant
完测用户,d0有效学习率(>10m),35.60%,36.10%,1.40%,0.245600,Non-significant
完测用户,d1有效学习率(>10m),23.76%,24.35%,2.48%,0.139100,Non-significant
完测用户,d2有效学习率(>10m),18.41%,19.12%,3.86%,0.056600,Significant
完测用户,d3有效学习率(>10m),14.29%,14.65%,2.52%,0.301600,Non-significant
完测用户,d4有效学习率(>10m),10.15%,10.71%,5.52%,0.068700,Significant


In [58]:
# Restrict to the B2, C1 and C2 levels.
group_list = ['B2', 'C1', 'C2']

df_sub = df.loc[df.mile_stone_name.isin(group_list)]
output = ab_summary_data(df_sub, 'B2+')
output.style.apply(
    lambda row: ["background: green" if cell == 'Significant' else "" for cell in row],
    axis=1,
).hide_index()

User_Group,Metrics,Control,Test,Delta,p_value,Significance
B2+,总新增用户,4181,3931,-,-,-
B2+,d0测试完成率,94.67%,94.66%,-0.01%,0.986400,Non-significant
B2+,d0测试完成率(+部分完成),98.11%,98.40%,0.30%,0.324800,Non-significant
B2+,d0新增到报名,68.55%,67.13%,-2.07%,0.172600,Non-significant
B2+,d0新增到入群,38.53%,37.12%,-3.66%,0.188700,Non-significant
B2+,d0有效学习率(>10m),43.31%,43.96%,1.50%,0.559300,Non-significant
B2+,d1有效学习率(>10m),31.98%,33.22%,3.88%,0.250400,Non-significant
B2+,d2有效学习率(>10m),26.63%,27.02%,1.46%,0.705100,Non-significant
B2+,d3有效学习率(>10m),20.56%,21.60%,5.06%,0.292700,Non-significant
B2+,d4有效学习率(>10m),15.37%,16.11%,4.81%,0.421000,Non-significant


#### 分等级

In [59]:
group_list = ['A1初', 'A1中', 'A1高', 'A2初', 'A2中', 'A2高', 'B1初', 'B1中', 'B1高', 'B2']

# Build one summary per level and concatenate once at the end. Calling
# DataFrame.append inside the loop re-copies the accumulated frame on every
# iteration, and the method no longer exists in pandas 2.0+.
level_summaries = []
for user_group in group_list:
    group_data = df[df.mile_stone_name == user_group]
    level_summaries.append(ab_summary_data(group_data, user_group))

# Each per-level frame keeps its own 0-based index, as before; the index is
# only reset for display.
output = pd.concat(level_summaries)

output.reset_index(drop=True).style.apply(lambda x: ["background: green" if v == 'Significant' else "" for v in x], axis = 1).hide_index()

User_Group,Metrics,Control,Test,Delta,p_value,Significance
A1初,总新增用户,3772,2997,-,-,-
A1初,d0测试完成率,90.85%,90.42%,-0.47%,0.545900,Non-significant
A1初,d0测试完成率(+部分完成),99.60%,99.13%,-0.47%,0.013300,Significant
A1初,d0新增到报名,39.37%,39.74%,0.94%,0.756700,Non-significant
A1初,d0新增到入群,16.33%,16.42%,0.55%,0.924700,Non-significant
A1初,d0有效学习率(>10m),29.80%,31.46%,5.57%,0.139300,Non-significant
A1初,d1有效学习率(>10m),15.69%,16.62%,5.93%,0.317100,Non-significant
A1初,d2有效学习率(>10m),10.60%,11.54%,8.87%,0.246400,Non-significant
A1初,d3有效学习率(>10m),7.47%,7.99%,6.96%,0.468200,Non-significant
A1初,d4有效学习率(>10m),5.46%,5.79%,6.04%,0.596500,Non-significant


#### 未完测用户等级来源

In [60]:
# Rows from the two listed sources with ttl_exe of at least 6.
source_match = df.source.isin(['ezo_default_ailab_test', 'ezo_default_damodel'])
df_sub = df.loc[source_match & df.ttl_exe.ge(6)]
output = ab_summary_data(df_sub, '未完测用户 - 答题数 >=6')
output.style.apply(
    lambda row: ["background: green" if cell == 'Significant' else "" for cell in row],
    axis=1,
).hide_index()

User_Group,Metrics,Control,Test,Delta,p_value,Significance
未完测用户 - 答题数 >=6,总新增用户,9189,9642,-,-,-
未完测用户 - 答题数 >=6,d0测试完成率,2.61%,3.43%,31.42%,0.001000,Significant
未完测用户 - 答题数 >=6,d0测试完成率(+部分完成),2.70%,3.53%,30.74%,0.001100,Significant
未完测用户 - 答题数 >=6,d0新增到报名,23.42%,24.95%,6.53%,0.014000,Significant
未完测用户 - 答题数 >=6,d0新增到入群,5.89%,6.44%,9.34%,0.114900,Non-significant
未完测用户 - 答题数 >=6,d0有效学习率(>10m),10.99%,11.50%,4.64%,0.268000,Non-significant
未完测用户 - 答题数 >=6,d1有效学习率(>10m),6.78%,6.97%,2.80%,0.616900,Non-significant
未完测用户 - 答题数 >=6,d2有效学习率(>10m),4.93%,5.31%,7.71%,0.267400,Non-significant
未完测用户 - 答题数 >=6,d3有效学习率(>10m),3.90%,4.00%,2.56%,0.747500,Non-significant
未完测用户 - 答题数 >=6,d4有效学习率(>10m),2.78%,3.29%,18.35%,0.087300,Significant


In [65]:
# Rows from the two listed sources with ttl_exe strictly between 0 and 6.
source_match = df.source.isin(['ezo_default_ailab_test', 'ezo_default_less_trtest'])
df_sub = df.loc[source_match & df.ttl_exe.lt(6) & df.ttl_exe.gt(0)]
output = ab_summary_data(df_sub, '未完测用户-答题数<6')
output.style.apply(
    lambda row: ["background: green" if cell == 'Significant' else "" for cell in row],
    axis=1,
).hide_index()

User_Group,Metrics,Control,Test,Delta,p_value,Significance
未完测用户-答题数<6,总新增用户,3437,2932,-,-,-
未完测用户-答题数<6,d0测试完成率,0.55%,0.00%,-100.00%,0.000100,Significant
未完测用户-答题数<6,d0测试完成率(+部分完成),0.58%,0.00%,-100.00%,0.000000,Significant
未完测用户-答题数<6,d0新增到报名,16.21%,13.74%,-15.24%,0.006200,Significant
未完测用户-答题数<6,d0新增到入群,2.53%,3.10%,22.53%,0.167200,Non-significant
未完测用户-答题数<6,d0有效学习率(>10m),6.37%,7.30%,14.60%,0.143000,Non-significant
未完测用户-答题数<6,d1有效学习率(>10m),2.87%,4.43%,54.36%,0.001300,Significant
未完测用户-答题数<6,d2有效学习率(>10m),2.15%,3.19%,48.37%,0.017200,Significant
未完测用户-答题数<6,d3有效学习率(>10m),1.41%,2.58%,82.98%,0.002600,Significant
未完测用户-答题数<6,d4有效学习率(>10m),0.87%,1.96%,125.29%,0.001500,Significant


In [63]:
# Row count for each (source, ttl_exe) combination in the current subset.
df_sub.groupby(by=['source', 'ttl_exe']).size()

source                   ttl_exe
ezo_default_ailab_test   0.0         285
                         1.0        1054
                         2.0         525
                         3.0         458
                         4.0         445
                         5.0         446
ezo_default_less_trtest  0.0        2513
                         1.0         687
                         2.0         662
                         3.0         488
                         4.0        1184
                         5.0         420
dtype: int64

In [None]:
#### 分跳出的位置

In [47]:
# group_list = ['其他', '词汇', '语法', '听力', '口语']
# output = pd.DataFrame(columns = ["User_Group", "Metrics", "Control", "Test", "Delta", "p_value","Significance"])

# for i in range(0,len(group_list)):
#     user_group = group_list[i]
#     group_data = df[df.labels == user_group]
#     output = output.append(ab_summary_data(group_data, user_group))

# output.reset_index(drop=True).style.apply(lambda x: ["background: green" if v == 'Significant' else "" for v in x], axis = 1).hide_index()