In [1]:
import pandas as pd 
import numpy as np 

In [None]:
"""
t1_user_active_min.csv
This table contains active minutes data logged after experiment started.
Each row represents the total number of minutes spent on site for each user on a date.
If a user never visited the site for a given date, there wouldn't be data for that uid on that date.
- uid: user ID
- dt: date when corresponding active minutes are registered
- active_mins: number of minutes spent on site for the date

t2_user_variant.csv
This table contains users’ treatment assignment.
Each row represents the assignment information for a unique user.
- uid: user ID
- variant_number: the experiment variant user is in. 0 for control, 1 for treatment
- dt: date when user entered the experiment, should be ‘2019-02-06’ for all users
- signup_date: the date string that user signed up on

t3_user_active_min_pre.csv
This table contains active minutes data before the experiment started.
It has a similar format as t1, except the dt range can extend before the experiment start date.
- uid: user ID
- dt: date when corresponding active minutes are registered
- active_mins: number of minutes spent on site for the date

t4_user_attributes.csv
This table contains data about some user attributes.
Each row represents attributes of a unique user.
- uid: user ID
- user_type: segment that a user belongs to, measured by activity level of the user. Can be ‘new_user’, ‘non_reader’, ‘reader’ or ‘contributor’
- gender: user gender. Can be ‘male’, ‘female’ or ‘unknown’
"""


In [2]:
df1 = pd.read_csv('t1_user_active_min.csv')

In [5]:
df1.head(2)   

Unnamed: 0,uid,dt,active_mins
0,0,2019-02-22,5.0
1,0,2019-03-11,5.0


In [3]:
df2 = pd.read_csv('t2_user_variant.csv')

In [6]:
df2.head(2)  

Unnamed: 0,uid,variant_number,dt,signup_date
0,0,0,2019-02-06,2018-09-24
1,1,0,2019-02-06,2016-11-07


In [4]:
df3 = pd.read_csv('t3_user_active_min_pre.csv')

In [8]:
df3.head(2) 

Unnamed: 0,uid,dt,active_mins
0,0,2018-09-24,3.0
1,0,2018-11-08,4.0


In [5]:
df4 = pd.read_csv('t4_user_attributes.csv')

In [10]:
df4.head(2) 

Unnamed: 0,uid,gender,user_type
0,0,male,non_reader
1,1,male,reader


# 使用table1 & table2 

In [6]:
from scipy import stats 

In [7]:
#按照uid dt 合并数据  选择variant_number=1 即treatment下的active_mins 先删除异常值 后做正太检验  最后计算95% 的置信区间
total_two = pd.merge(df1,df2,left_on = ['uid','dt'],right_on = ['uid','dt'])

In [13]:
total_two.head(3) 

Unnamed: 0,uid,dt,active_mins,variant_number,signup_date
0,17,2019-02-06,157.0,0,2018-06-10
1,18,2019-02-06,1.0,0,2013-06-19
2,38,2019-02-06,26.0,0,2018-10-24


In [8]:
non_treatment_mins = total_two.loc[total_two['variant_number']==0]['active_mins']

In [10]:
treatment_mins = total_two.loc[total_two['variant_number']==1]['active_mins']

In [39]:
non_treatment_mins.describe()

count    7234.000000
mean       19.893420
std        46.827376
min         1.000000
25%         2.000000
50%         4.000000
75%        15.000000
max       894.000000
Name: active_mins, dtype: float64

In [40]:
#剔除离群点 variant_number == 0
iqr_0 = 15 - 2 
#print((2 - 1.*iqr_0),(15+1.5*iqr_0))
non_treatment_mins_non_outlier = non_treatment_mins.loc[(non_treatment_mins<(15+1.5*iqr_0))&(non_treatment_mins>(2 - 1.*iqr_0))]
u,std = non_treatment_mins_non_outlier.mean() , non_treatment_mins_non_outlier.std()
print(u,std) 

6.723800353073343 7.605118454744637


In [38]:
treatment_mins.describe() 

count    1441.000000
mean       28.474670
std        63.661809
min         1.000000
25%         3.000000
50%         8.000000
75%        22.000000
max       669.000000
Name: active_mins, dtype: float64

In [41]:
#剔除离群点 variant_number == 1
iqr = 22 - 3 
#print((3 - 1.*iqr),(22+1.5*iqr))
treatment_mins_non_outlier = treatment_mins.loc[(treatment_mins<(22+1.5*iqr))&(treatment_mins>(3 - 1.*iqr))]
u1,std1 = treatment_mins_non_outlier.mean() , treatment_mins_non_outlier.std()
print(u1,std1) 

10.506805444355484 10.956140421867927


In [59]:
stats.norm.interval(0.95,u1,std1) # 第一问 treatment 95%置信区间计算 计算结果保留两位小数

(-10.966835192069128, 31.980446080780098)

In [44]:
stats.ttest_rel(non_treatment_mins_non_outlier[:1249],treatment_mins_non_outlier) #control, treatment存在显著性差异

Ttest_relResult(statistic=-10.472262972751212, pvalue=1.1772868452221205e-24)

# 使用table3 & table1 & table2

In [97]:
#f找出 variant_number = 1 的uid   找出 treatment之前之后的active_mins 计算显著性差异
treatment_uid = df2[df2['variant_number'] == 1]['uid'] 
mins_after = df1.loc[df1['uid'].isin(treatment_uid)]['active_mins']

mins_before = df3.loc[df3['uid'].isin(treatment_uid)]['active_mins']

In [101]:
mins_before.describe()  

count    200604.000000
mean         16.078957
std         500.199145
min           1.000000
25%           2.000000
50%           4.000000
75%          11.000000
max       99999.000000
Name: active_mins, dtype: float64

In [103]:
mins_after.describe()  

count    179445.000000
mean         40.240408
std        1293.703072
min           1.000000
25%           3.000000
50%           7.000000
75%          19.000000
max       99999.000000
Name: active_mins, dtype: float64

In [102]:
#before treatment  95%置信区间
iqr2 = 11-2 
mins_before_nonoutlier = mins_before.loc[(mins_before<(11+1.5*iqr2))&(mins_before>(2-1.5*iqr2))]

u2,std2 = mins_before_nonoutlier.mean() , mins_before_nonoutlier.std()
stats.norm.interval(0.95,u2,std2) # 95%置信区间计算 计算结果保留两位小数

(-4.921507823793655, 15.548289819628991)

In [104]:
#去除离群点treatment的数据
iqr3 = 19-3
mins_after_nonoutlier = mins_after.loc[(mins_after<(19+1.5*iqr3))&(mins_after>(3-1.5*iqr3))]

In [105]:
stats.norm.interval(0.95,mins_after_nonoutlier.mean(),mins_after_nonoutlier.std()) # 第二问 95%置信区间计算 计算结果保留两位小数

(-9.149177043659634, 27.2164227060063)

In [None]:
#treatment后停留分钟数显然更长，而且同treatment当日相比仍然保有不错的效果

In [112]:
stats.ttest_rel(mins_before_nonoutlier[:155787],mins_after_nonoutlier) #p value<0.05 故treatment前后停留分钟数存在显著性差异

Ttest_relResult(statistic=-137.2209270545106, pvalue=0.0)

# 使用table4 排除性别、客户类型对实验的影响

In [114]:
new_df1 = pd.merge(df1,df4,on='uid')
new_df3 = pd.merge(df3,df4,on='uid')

## 先比较性别是否对停留时间有显著性差异，再比较不同性别下实验产生的效果

In [117]:
treatment_uid = df2[df2['variant_number'] == 1]['uid']
male_mins = new_df1.loc[new_df1['gender']=='male']['active_mins']
female_mins = new_df1.loc[new_df1['gender']=='female']['active_mins']

In [123]:
stats.ttest_rel(male_mins[:263960],female_mins) #性别对停留时间无显著性差异

Ttest_relResult(statistic=-0.8725215555924346, pvalue=0.38292470768041575)

In [125]:
male_mins_after = new_df1.loc[(new_df1['gender']=='male')&(new_df1['uid'].isin(treatment_uid))]['active_mins']
male_mins_before = new_df3.loc[(new_df3['gender']=='male')&(new_df3['uid'].isin(treatment_uid))]['active_mins']

In [128]:
stats.ttest_rel(male_mins_before[:110320], male_mins_after) #试验对男性停留时间有影响

Ttest_relResult(statistic=-6.555056315082889, pvalue=5.586479902950626e-11)

In [132]:
female_mins_after = new_df1.loc[(new_df1['gender']=='female')&(new_df1['uid'].isin(treatment_uid))]['active_mins']
female_mins_before = new_df3.loc[(new_df3['gender']=='female')&(new_df3['uid'].isin(treatment_uid))]['active_mins']
stats.ttest_rel(female_mins_before[:43759], female_mins_after) #treatment同样可以增加女性停留时长

Ttest_relResult(statistic=-4.336550433612149, pvalue=1.450594656195817e-05)

**treatment 不论男性还是女性，都可以有效提升用户停留时间**

##  read是否是影响停留因素之一，老客户是否比新用户停留时间更长

In [134]:
new_df1.user_type.unique() 

array(['non_reader', 'reader', 'new_user', 'contributor'], dtype=object)

In [135]:
nonread_mins = new_df1.loc[new_df1['user_type']=='non_reader']['active_mins']
read_mins = new_df1.loc[new_df1['user_type']=='reader']['active_mins']

In [138]:
stats.ttest_rel(nonread_mins[:406461],read_mins) 

Ttest_relResult(statistic=-22.45690855518372, pvalue=1.2816296384759628e-111)

In [145]:
nonread_mins.mean() 

8.560384603366929

In [146]:
read_mins.mean()  #显然reader平均停留时间要明显高于non reader

41.09470527307663

In [147]:
new_mins = new_df1.loc[new_df1['user_type']=='new_user']['active_mins']
old_mins = new_df1.loc[new_df1['user_type']=='contributor']['active_mins']

In [150]:
stats.ttest_rel(new_mins,old_mins[:17377]) #新老客户停留时间存在显著性差异

Ttest_relResult(statistic=-6.650785041491005, pvalue=3.0022060042882656e-11)

In [151]:
print('new_mean:',new_mins.mean(),'old_min:',old_mins.mean()) #老客户停留时间显著高于新客户

new_mean: 6.379582206364735 old_min: 253.0313881786735


**为了提升客户粘性，可以将试验更多的投放在non reader和new_user类型的客户上**