Загружаем исходные данные в colab.

In [1]:
# Upload the source CSV through Colab's upload helper.
from google.colab import files
# `uploaded` is indexed by file name further down and its value is wrapped in
# io.BytesIO, i.e. it maps each uploaded file name to that file's raw bytes.
uploaded = files.upload()

Saving dataset(1).csv to dataset(1).csv


Загружаем данные в датафрейм

In [None]:
import io
import pandas as pd
import numpy as np
import plotly.express as px

# Load the uploaded CSV into a DataFrame.
#
# Colab stores the upload under the name it reports in "Saving ... to ...";
# a repeated upload gets a suffix (e.g. "dataset(1).csv"), so a hard-coded
# uploaded['dataset.csv'] lookup raises KeyError. Resolve the name instead:
# prefer the canonical name, otherwise accept the single uploaded file.
DATASET_NAME = 'dataset.csv'
if DATASET_NAME in uploaded:
    csv_name = DATASET_NAME
elif len(uploaded) == 1:
    csv_name = next(iter(uploaded))
else:
    raise KeyError(
        f"Expected '{DATASET_NAME}' or exactly one uploaded file, got: {list(uploaded)}"
    )

# NOTE(review): with default settings read_csv parses the literal string "NA"
# as a missing value. If country_code can contain "NA" (Namibia), those rows
# become NaN and are dropped by later groupby calls — confirm against the data
# and consider keep_default_na=False, na_values=[''] if so.
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

df

Unnamed: 0,event_date,app_version,country_code,install_date,media_source,campaign_name,appsflyer_id,is_organic,payments,revenue_usd,ad_revenue_usd,sessions,playtime
0,2022-04-09,0.1.75.1,SG,2022-04-09,bytedanceglobal_int,EM_AND_SEA_Purchase_IT2_0604,1649548002569-6720420720726321605,False,0,0.0,0.000000,1,4
1,2022-04-09,0.1.75.1,US,2022-04-09,bytedanceglobal_int,EM_AND_US_Purchase_IT1_2703,1649545238733-4551347935758619924,False,0,0.0,0.000000,1,2
2,2022-04-09,0.1.75.1,US,2022-04-09,bytedanceglobal_int,EM_AND_US_Purchase_IT1_2703,1649540322226-5366969563919899948,False,0,0.0,0.000000,1,2
3,2022-04-09,0.1.75.1,UK,2022-04-07,bytedanceglobal_int,EM_AND_T1_Purchase_IT2_0604,1649337475698-617904100123526593,False,0,0.0,0.000000,1,1
4,2022-04-09,0.1.75.1,US,2022-04-09,bytedanceglobal_int,EM_AND_US_Purchase_IT1_2703,1649544985557-9160306254813664580,False,0,0.0,0.000000,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
690370,2022-07-31,,PH,2022-07-28,organic,organic,1659003289354-9130797695840500796,True,0,0.0,0.002614,1,1
690371,2022-07-31,,AE,2022-07-28,organic,organic,1659020308480-7169513513931602186,True,0,0.0,0.012164,1,1
690372,2022-07-31,,DZ,2022-07-29,organic,organic,1659059933950-196412188438371439,True,0,0.0,0.000195,1,1
690373,2022-07-31,,FR,2022-07-29,organic,organic,1659086493021-6361713490086303799,True,0,0.0,0.021129,1,1


In [None]:
# Convert both date columns to pandas datetimes so they can be compared
# against date literals and grouped chronologically, then show the dtypes.
for date_col in ('event_date', 'install_date'):
    df[date_col] = pd.to_datetime(df[date_col])
df.dtypes

event_date        datetime64[ns]
app_version               object
country_code              object
install_date      datetime64[ns]
media_source              object
campaign_name             object
appsflyer_id              object
is_organic                  bool
payments                   int64
revenue_usd              float64
ad_revenue_usd           float64
sessions                   int64
playtime                   int64
dtype: object

Сегодня мы будем анализировать платящих игроков, рассчитаем PPU и CR, а также иные метрики платящей аудитории проекта.

Начнем с CR - конверсии в платящего.

In [None]:
# Paying users per install date, for installs from 2022-04-01 onward.
# Step 1: total payments of every user within their install cohort.
cohort_rows = df[df['install_date'] >= '2022-04-01']
payments_per_user = (
    cohort_rows
    .groupby(['appsflyer_id', 'install_date'], as_index=False)
    .agg({'payments': 'sum'})
)

# Step 2: keep users with at least one payment and count them per install date.
df_payers = (
    payments_per_user[payments_per_user['payments'] > 0]
    .groupby('install_date', as_index=False)
    .agg({'appsflyer_id': 'nunique'})
    .rename(columns={'appsflyer_id': 'payers'})
)

df_payers

Unnamed: 0,install_date,payers
0,2022-04-01,19
1,2022-04-02,28
2,2022-04-03,21
3,2022-04-04,7
4,2022-04-05,4
...,...,...
117,2022-07-27,27
118,2022-07-28,30
119,2022-07-29,34
120,2022-07-30,40


In [None]:
# Distinct users per install date (same 2022-04-01 cut-off as the payers table).
df_installs = (
    df.loc[df['install_date'] >= '2022-04-01']
    .groupby('install_date', as_index=False)
    .agg({'appsflyer_id': 'nunique'})
    .rename(columns={'appsflyer_id': 'installs'})
)
df_installs

Unnamed: 0,install_date,installs
0,2022-04-01,1010
1,2022-04-02,1024
2,2022-04-03,896
3,2022-04-04,311
4,2022-04-05,216
...,...,...
117,2022-07-27,3403
118,2022-07-28,3547
119,2022-07-29,4197
120,2022-07-30,4035


In [None]:
# Conversion rate per install cohort: payers / installs.
# Left join keeps every install date; dates without payers get 0 instead of NaN.
df_cr = df_installs.merge(df_payers, on='install_date', how='left')
df_cr['payers'] = df_cr['payers'].fillna(0)

# NOTE(review): rounding to 2 decimals leaves only values like 0.01-0.03 for
# this metric, which flattens the chart; consider keeping more precision.
df_cr['cr'] = df_cr['payers'].div(df_cr['installs']).round(2)

df_cr

Unnamed: 0,install_date,installs,payers,cr
0,2022-04-01,1010,19,0.02
1,2022-04-02,1024,28,0.03
2,2022-04-03,896,21,0.02
3,2022-04-04,311,7,0.02
4,2022-04-05,216,4,0.02
...,...,...,...,...
117,2022-07-27,3403,27,0.01
118,2022-07-28,3547,30,0.01
119,2022-07-29,4197,34,0.01
120,2022-07-30,4035,40,0.01


In [None]:
# Line chart of conversion rate by install date.
fig_cr = px.line(
    data_frame=df_cr,
    x='install_date',
    y='cr',
)
fig_cr.show()

Что можно сказать о динамике конверсии в платящего?

Теперь рассчитаем PPU — посмотрим, чем отличается расчет PPU от CR и будут ли отличаться значения метрик для исследуемого проекта.

In [None]:
# Daily active users: distinct appsflyer_id values per event date.
df_dau = (
    df.groupby('event_date', as_index=False)
    .agg({'appsflyer_id': 'nunique'})
    .rename(columns={'appsflyer_id': 'dau'})
)
df_dau

Unnamed: 0,event_date,dau
0,2022-04-01,1452
1,2022-04-02,1526
2,2022-04-03,1419
3,2022-04-04,768
4,2022-04-05,623
...,...,...
117,2022-07-27,8207
118,2022-07-28,8263
119,2022-07-29,8970
120,2022-07-30,9066


In [None]:
# Distinct users who made at least one payment on each event date.
paying_rows = df[df['payments'] > 0]
df_payers_day = (
    paying_rows
    .groupby('event_date', as_index=False)
    .agg({'appsflyer_id': 'nunique'})
    .rename(columns={'appsflyer_id': 'payers'})
)

# PPU = daily payers / DAU. Left join keeps days with no payers (filled with 0).
df_ppu = df_dau.merge(df_payers_day, on='event_date', how='left')
df_ppu['payers'] = df_ppu['payers'].fillna(0)

# NOTE(review): rounding to 2 decimals leaves only values like 0.01-0.02 for
# this metric; consider keeping more precision for the comparison with CR.
df_ppu['ppu'] = df_ppu['payers'].div(df_ppu['dau']).round(2)

df_ppu

Unnamed: 0,event_date,dau,payers,ppu
0,2022-04-01,1452,23,0.02
1,2022-04-02,1526,22,0.01
2,2022-04-03,1419,19,0.01
3,2022-04-04,768,14,0.02
4,2022-04-05,623,8,0.01
...,...,...,...,...
117,2022-07-27,8207,57,0.01
118,2022-07-28,8263,60,0.01
119,2022-07-29,8970,60,0.01
120,2022-07-30,9066,62,0.01


In [None]:
# Line chart of the paying-user share by event date.
fig_ppu = px.line(
    data_frame=df_ppu,
    x='event_date',
    y='ppu',
)
fig_ppu.show()

Есть ли отличия в значениях двух метрик? Какое принципиальное отличие в методике их расчета?

Попробуем проанализировать особенности платежного поведения пользователей.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
summary_stats

Unnamed: 0,payments,revenue_usd,ad_revenue_usd,sessions,playtime
count,690375.0,690375.0,690375.0,690375.0,690375.0
mean,0.015702,0.062068,0.010555,1.287041,24.29406
std,0.180256,0.88888,0.052963,0.685388,37.562079
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,2.0
50%,0.0,0.0,0.0,1.0,11.0
75%,0.0,0.0,0.003799,1.0,31.0
max,22.0,211.210604,6.128051,13.0,719.0


Из приведенной таблицы сразу можем сделать вывод - какое самое большое число покупок, совершенных одним пользователем за день в игре, а также самая большая сумма.

In [None]:
# Total payments and revenue per (user, organic flag, country),
# ordered from the highest revenue down; show the top 10.
user_keys = ['appsflyer_id', 'is_organic', 'country_code']
df_best = (
    df.groupby(user_keys, as_index=False)
    .agg({'payments': 'sum', 'revenue_usd': 'sum'})
    .sort_values(by='revenue_usd', ascending=False)
)
df_best.head(10)

Unnamed: 0,appsflyer_id,is_organic,country_code,payments,revenue_usd
156202,1656052075202-4518443352590337226,True,RU,17,269.064449
23169,1651407578447-4223985073839630071,True,US,24,190.76
37474,1652460444680-997227337943264331,True,FR,11,100.588233
199115,1656622120404-7816762096815298461,True,MX,15,84.386133
230696,1657050642646-3669986192959330920,True,US,9,68.91
48286,1652937734679-205832070123344377,True,MV,4,57.1
213246,1656826849728-3883799672048849262,True,ID,15,50.09059
241264,1657208090786-7783441818002846755,False,US,11,48.89
69137,1653762909314-2597579198731937397,True,MY,5,48.487293
76130,1654014951877-3045588186806896570,True,UK,6,43.670595


Какие выводы можем сделать о топ-10 платящих игроков за исследуемый период в игре?

In [None]:
# Per-country totals: distinct users, revenue and number of payments,
# ordered from the highest revenue down; show the top 10.
geo_aggregations = {
    'appsflyer_id': 'nunique',
    'revenue_usd': 'sum',
    'payments': 'sum',
}
df_geo = (
    df.groupby('country_code', as_index=False)
    .agg(geo_aggregations)
    .rename(columns={'appsflyer_id': 'users'})
    .sort_values(by='revenue_usd', ascending=False)
)
df_geo.head(10)

Unnamed: 0,country_code,users,revenue_usd,payments
214,US,50652,17661.187734,3930
212,UK,10176,2764.555323,663
52,DE,10832,2196.258972,500
69,FR,12101,1850.688861,346
34,CA,4281,1464.428101,327
92,ID,17077,1463.859155,543
147,MY,14038,1450.416814,657
160,PE,951,1138.952969,220
11,AU,2514,1124.813919,227
163,PH,30911,893.948975,442


Сравните распределение топ стран с топом по пользователям. Какие выводы можно сделать?

In [None]:
# Derived per-country monetisation metrics:
#   arpu              - revenue per user
#   payments_per_user - number of payments per user
#   average_check     - revenue per payment
df_geo = df_geo.assign(
    arpu=df_geo['revenue_usd'] / df_geo['users'],
    payments_per_user=df_geo['payments'] / df_geo['users'],
    average_check=df_geo['revenue_usd'] / df_geo['payments'],
)

df_geo.head(10)

Unnamed: 0,country_code,users,revenue_usd,payments,arpu,payments_per_user,average_check
214,US,50652,17661.187734,3930,0.348677,0.077588,4.493941
212,UK,10176,2764.555323,663,0.271674,0.065153,4.169767
52,DE,10832,2196.258972,500,0.202757,0.04616,4.392518
69,FR,12101,1850.688861,346,0.152937,0.028593,5.348812
34,CA,4281,1464.428101,327,0.342076,0.076384,4.478373
92,ID,17077,1463.859155,543,0.085721,0.031797,2.695873
147,MY,14038,1450.416814,657,0.103321,0.046802,2.207636
160,PE,951,1138.952969,220,1.197637,0.231335,5.177059
11,AU,2514,1124.813919,227,0.44742,0.090294,4.955127
163,PH,30911,893.948975,442,0.02892,0.014299,2.022509


Какие выводы можно сделать об особенностях поведения платящих игроков с учетом новых метрик? Есть ли в полученных результатах аномалии?

# Домашнее задание

Проанализируйте особенности CR и PPU для органики / неорганики / различных маркетинговых каналов / гео. Какие особенности платящей аудитории проекта вы видите? Какие предложения по улучшению монетизации игры можно сделать при наличии таких данных?

Конверсия в большинстве случаев происходит у игроков, которые находятся в игре 1,5–2 месяца.

Со временем количество неплатящих игроков увеличилось, что привело к снижению дохода. Это может быть связано с регионом, в котором снизилось количество плательщиков. Чтобы получить больше информации о причинах снижения конверсии, необходимо знать об изменениях внутри игры.

Необходимо проверять достоверность платежей, обращая внимание на суммы и страны, из которых они поступают.
