In [1]:
import pandas as pd

In [2]:
likes = pd.read_csv('likes.csv')
views = pd.read_csv('views.csv')

In [3]:
likes.head()

Unnamed: 0,user_id,post_id,timestamp
0,128381,4704,1654030804
1,146885,1399,1654030816
2,50948,2315,1654030828
3,14661,673,1654030831
4,37703,1588,1654030833


In [4]:
views.head()

Unnamed: 0,user_id,exp_group,recommendations,timestamp
0,128381,control,[3644 4529 4704 5294 4808],1654030803
1,146885,test,[1399 1076 797 7015 5942],1654030811
2,50948,test,[2315 3037 1861 6567 4093],1654030825
3,37703,test,[2842 1949 162 1588 6794],1654030826
4,14661,test,[2395 5881 5648 3417 673],1654030829


### Найдем и удалим пользователей, которые попали сразу в две группы

In [5]:
bad_users = views[views['exp_group']=='control'].merge(views[views['exp_group']=='test'], on='user_id')['user_id'].unique()

In [6]:
for user in bad_users:
    idx = views[views['user_id'] == user].index
    views = views.drop(idx, axis=0)

### Проверим на равенство размеров групп

In [7]:
from scipy.stats import binomtest
bins = views['exp_group'].value_counts().tolist()
print(bins)
binomtest(bins[0], bins[0]+bins[1])

[96898, 96370]


BinomTestResult(k=96898, n=193268, alternative='two-sided', statistic=0.5013659788480245, pvalue=0.23062291448605168)

### Доля пользователей, которая сделала хотя бы один лайк за время эксперимента

In [8]:
users = views.groupby('user_id').agg({'exp_group': 'max'}).reset_index()

In [9]:
users = users.merge(likes.groupby('user_id').agg({'post_id': 'count'}).reset_index().rename(columns={'post_id': 'likes'}), how='left', on='user_id')

In [10]:
print(f'{((users["likes"] > 0).value_counts()[0] / users["likes"].shape[0]).round(3)*100}%')

89.5%


### Оценим, различаются ли число лайков между группами

In [11]:
users['likes'] = users['likes'].fillna(0)

In [12]:
from scipy.stats import mannwhitneyu
res = mannwhitneyu(users[users['exp_group']=='control']['likes'], users[users['exp_group']=='test']['likes'])
print(res)

MannwhitneyuResult(statistic=518358073.0, pvalue=2.9585062792441964e-05)


In [13]:
from statsmodels.stats.weightstats import ztest
ztest(users[(users['exp_group']=='control')]['likes']>0, users[(users['exp_group']=='test')]['likes']>0)

(-2.844158090303926, 0.004452894813985724)

### Посчитаем общий hitrate

In [14]:
df = views.merge(likes, on='user_id')
df['like'] = 1

In [15]:
df = df[df['timestamp_x'] < df['timestamp_y']]

In [16]:
hour = 60*60
df = df[df['timestamp_y'] - df['timestamp_x'] < hour]

In [17]:
df.loc[df['timestamp_y'] - df['timestamp_x'] > hour, 'like'] = 0

In [18]:
df['post_id'] = df['post_id'].astype('str')

In [19]:
df = df[df.apply(lambda row: row['post_id'] in row['recommendations'], axis=1)]

In [20]:
df

Unnamed: 0,user_id,exp_group,recommendations,timestamp_x,post_id,timestamp_y,like
0,128381,control,[3644 4529 4704 5294 4808],1654030803,4704,1654030804,1
1,128381,control,[3644 4529 4704 5294 4808],1654030803,5294,1654030838,1
16,128381,control,[3029 3608 2542 4165 3490],1655049326,3608,1655049327,1
17,128381,control,[3029 3608 2542 4165 3490],1655049326,2542,1655049342,1
18,128381,control,[3029 3608 2542 4165 3490],1655049326,4165,1655052806,1
...,...,...,...,...,...,...,...
1006353,80500,test,[1588 4629 1457 3691 1688],1655240212,4629,1655240282,1
1006354,149686,test,[3203 6437 3681 2678 1629],1655240216,2678,1655240218,1
1006355,3615,control,[1578 6326 906 2833 993],1655240322,1578,1655242450,1
1006356,119630,test,[3143 599 4588 1577 7077],1655240337,599,1655240367,1


In [21]:
recs = df.groupby('recommendations').agg({'like': 'sum', 'exp_group': 'max'}).reset_index()

In [22]:
recs.head(3)

Unnamed: 0,recommendations,like,exp_group
0,[ 1 3 6222 4264 3634],2,control
1,[ 1 321 3584 2533 4046],2,control
2,[ 1 590 2678 5482 1040],1,test


In [23]:
recs_no_like = pd.DataFrame(views[['recommendations', 'exp_group']][~views['recommendations'].isin(recs['recommendations'])])
recs_no_like['like'] = 0

In [24]:
recs_no_like.head(3)

Unnamed: 0,recommendations,exp_group,like
7,[3837 6786 998 2122 7083],test,0
8,[5726 1821 2380 1459 5415],test,0
10,[4889 3374 1344 6754 2419],control,0


In [25]:
data = pd.concat([recs, recs_no_like])

In [26]:
print(f'общий hitrate по всем юзерам: {(data["like"]>0).value_counts(normalize=True)[0].round(2)}')

общий hitrate по всем юзерам: 0.71


### Сравним hitrate в двух группах

In [27]:
import hashlib

data['bucket'] = data['recommendations'].apply(
    lambda x: int(hashlib.md5((str(x) + 'hitrate_salt').encode()).hexdigest(), 16) % 100
) 

In [28]:
data.loc[data['like'] != 0, 'like'] = 1

In [29]:
bucket_data = data.groupby(['exp_group', 'bucket']).agg({'like': 'mean'}).reset_index()

In [30]:
mannwhitneyu(bucket_data[bucket_data['exp_group']=='control']['like'], bucket_data[bucket_data['exp_group']=='test']['like'])

MannwhitneyuResult(statistic=2595.5, pvalue=4.256198788175318e-09)

In [31]:
data.groupby('exp_group').agg({'like': 'mean'}).reset_index().rename(columns={'like': 'hitrate'})

Unnamed: 0,exp_group,hitrate
0,control,0.706651
1,test,0.719839


## Вывод: В тестовой группе hitrate выше на 1 п.п. (до бакетирования), есть статистически значимая разница