In [1]:
import warnings
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

from business_models import greenplum
from business_models.greenplum import GreenplumManager

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Create the Greenplum client used to run queries.
#
# The client is bound to the name `greenplum` (later cells call `greenplum(sql)`), which
# shadows the `business_models.greenplum` module imported at the top of the notebook.
# The token is therefore read through a private alias of that module, so this cell gives
# the same result when it is re-run after `greenplum` has already been rebound to the client.
from business_models import greenplum as _greenplum_module

greenplum = GreenplumManager(user='ghostrider', token=_greenplum_module.token)

In [4]:
# SQL text for the "frequent users by delivery bucket" report.
#
# Structure of the query:
#   * `ranges`   - one row per month start from 2022-07-01 to 2023-07-01 (`window_finish`),
#                  paired with the date one year earlier (`window_start`).
#   * `data`     - per (window_finish, user_phone_pd_id): sum of `delivery_point_cnt` over
#                  orders whose `utc_order_created_dt` lies in the window, restricted to the
#                  listed cargo tariffs, the listed apps / order sources and the
#                  'br_moscow' agglomeration node; only users with a sum >= 5 are kept.
#   * `data_agg` - assigns each user to a text bucket ('05' for sums below 10, '50+' for
#                  sums of 50 or more, otherwise round(deliveries / 10) * 10 as text) and
#                  counts distinct users per (period, bucket).
#   * final select - adds `period_share`: the bucket's users divided by all users of the
#                  same period.
#
# NOTE(review): `between` is inclusive on both ends, so each window covers window_start
#   through window_finish inclusive - confirm this is the intended 12-month window.
# NOTE(review): the middle bucket labels depend on how `deliveries / 10` is evaluated; the
#   displayed result only shows 10/20/30/40 labels, which looks like integer division -
#   confirm the type of `delivery_point_cnt`.
# NOTE(review): the doubled percent signs in the `like` pattern are part of the string as
#   written here (no %-formatting is applied in this cell) - confirm how the query runner
#   treats them.
data_sql = '''
    with ranges as (
        select (gs - interval '1 year')::date window_start, gs::date window_finish
        from generate_series('2022-07-01', '2023-07-01', interval '1 month') as gs),
    data as (
        select
            r.window_finish,
            o.user_phone_pd_id,
            sum(o.delivery_point_cnt) deliveries
        from taxi_cdm_order.fct_order o
        join ranges r on o.utc_order_created_dt between r.window_start and r.window_finish
        where 1=1
          and tariff_class_code in ('cargo', 'cargocorp','cargo_long', 'cargo_express')
          and utc_order_created_dt >= '2021-07-01'

          and user_phone_pd_id is not null
          and delivery_point_cnt > 0

          and (app_name in ('persey', 'pool', 'iphone', 'android')
            or app_user_agent like '%%yango%%'
            or app_name in ('yango_android','yango_iphone')
            or order_source_code in ('uber_old','uber','yauber'))

          and agglomeration_node_id = 'br_moscow'
        group by 1, 2
        having sum(o.delivery_point_cnt) >= 5),
    data_agg as (
        select
            window_finish as period,
            case
                when deliveries < 10 then '05'
                when deliveries >= 50 then '50+'
                    else (round(deliveries/ 10) * 10)::text
        end bucket,
            count(distinct user_phone_pd_id) users
        from data
        group by 1, 2)
    select d.*,
           d.users / sum(d.users) over (partition by d.period) period_share
    from data_agg d
    order by 1, 2
'''

In [15]:
# Run the report query: the GreenplumManager instance is invoked as a callable with the SQL text.
# The cells below use the result as a pandas DataFrame (column access, `.apply`, `.to_excel`).
df = greenplum(data_sql)

In [16]:
# Human-readable percentage label for each bucket share, e.g. 0.545901 -> '55 %'.
#
# The share is scaled to percent first and rounded afterwards. Rounding to two decimals and
# then truncating the product with int() loses a point on values whose float product falls
# just below a whole number (0.29 * 100 == 28.999999999999996 would be shown as '28 %').
df['period_share_p'] = (
    df['period_share']
    .astype(float)   # normalise to float in case the query returns another numeric type
    .mul(100)
    .round()
    .astype(int)
    .astype(str)
    + ' %'
)

In [17]:
# Show the result table, including the `period_share_p` label column added above.
df

Unnamed: 0,period,bucket,users,period_share,period_share_p
0,2022-07-01,05,36488,0.545901,55 %
1,2022-07-01,10,16713,0.250045,25 %
2,2022-07-01,20,5476,0.081927,8 %
3,2022-07-01,30,2726,0.040784,4 %
4,2022-07-01,40,1582,0.023668,2 %
...,...,...,...,...,...
73,2023-07-01,10,23217,0.254447,25 %
74,2023-07-01,20,7879,0.086350,9 %
75,2023-07-01,30,3914,0.042896,4 %
76,2023-07-01,40,2400,0.026303,3 %


In [23]:
# Line chart: users per bucket for every period, with the user count printed next to each point.
dates = df['period'].unique().tolist()

fig = px.line(
    df,
    x="period",
    y="users",
    color="bucket",
    text="users",
    title='Frequent Users Dynamic by Bucket',
    template='plotly_white',
)

# Thin dashed green guide line at every distinct period value.
for d in dates:
    fig.add_vline(
        x=d,
        line_color="green",
        line_dash="dash",
        line_width=1,
    )

fig.update_traces(textposition="bottom right")

# NOTE(review): absolute, machine-specific output path - confirm the directory exists
# wherever this notebook is run.
fig.write_image("/Users/ghostrider/projects/jupyter_working/fig_smb_dynamic.png")

fig.show()

In [24]:
# Bar chart: share of each bucket within a period, labelled with the formatted percentage.
fig = px.bar(
    df,
    x="period",
    y="period_share",
    color="bucket",
    text="period_share_p",
    title="Share of Buckets Dynamic",
)
fig.update_traces(textposition="inside")

# NOTE(review): absolute, machine-specific output path - confirm the directory exists
# wherever this notebook is run.
fig.write_image("/Users/ghostrider/projects/jupyter_working/fig_smb_share.png")

fig.show()

In [26]:
# Export the result table to an Excel workbook (sheet 'data', no index column);
# the file name is relative, so it lands in the current working directory.
df.to_excel(
    'SMB_frequency_estimation.xlsx',
    index=False,
    sheet_name='data',
)