In [1]:
import calendar
import warnings

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from business_models import greenplum
from business_models.greenplum import GreenplumManager
from business_models import hahn

https://wiki.yandex-team.ru/users/ghostrider/cargo-x-efficiency/.edit?force-data-ui=true

In [None]:
# SQL text for the supply-state table: per agglomeration, hourly time slot and
# executor (park_taximeter_id || '_' || executor_profile_id) it sums
# duration_sec / 3600 over all selected statuses (`sh`) and over the non-'free'
# statuses only (`util_time`).
#
# The CTE `t` generates one row per hour between 2023-05-01 and 2023-07-31 and
# tags each slot with a weekend flag (ISO day-of-week >= 6) and a 'YYYY-MM' month.
#
# NOTE(review): the join keeps a state row only when an exact hourly timestamp
# lies inside [msk_valid_from_dttm, msk_valid_to_dttm], and then adds the row's
# full duration_sec to that slot — confirm this is the intended attribution.
# NOTE(review): generate_series stops at 2023-07-31 00:00 while the date filter
# below includes the whole of 2023-07-31 — confirm the last day is not needed.
# NOTE(review): if duration_sec is an integer column, `/ 3600` truncates — TODO confirm.
fct_supply_state_hist_sql = '''
    with t as (
        select gs::timestamp  time_slot,
            case when extract(isodow from gs) >= 6 then 1 else 0 end is_weekend,
               to_char(gs, 'YYYY-MM') mnth
        from generate_series('2023-05-01', '2023-07-31', interval '1 hour') as gs)
    select
                agglomeration_geo_node_id, time_slot, is_weekend, mnth,
                park_taximeter_id || '_' || executor_profile_id dbid_uuid,
                sum(duration_sec) / 3600 sh,
                sum(case when executor_status_code != 'free' then duration_sec else 0 end) / 3600 util_time

    from taxi_cdm_supply.fct_supply_state_hist s
    join t on t.time_slot between s.msk_valid_from_dttm and s.msk_valid_to_dttm
    where enabled_tariff_class_code_list && '{"cargo", "cargocorp","cargo_long", "cargo_express"}'
      and agglomeration_geo_node_id in ('br_moscow', 'br_saintpetersburg', 'br_krasnodar', 'br_novosibirsk')
      and msk_valid_from_dttm::date between '2023-05-01' and '2023-07-31'
      and executor_status_code in ('free', 'waiting', 'driving', 'transporting')
    group by 1, 2, 3, 4, 5
'''

In [None]:
# Run the supply-state SQL through the `greenplum` helper and keep the result.
# NOTE(review): `supply_state` is not referenced by any later cell visible here.
supply_state = greenplum(fct_supply_state_hist_sql)

In [2]:
# YQL text: from the `mph_dynamics_new/2023-07-01` table it keeps rows with
# agglomeration = 'Moscow' and courier_type = '6. cargo_courier', then sums
# driver_net_inflow, total_sh and (driving_sh + waiting_sh + transporting_sh)
# per (park_taximeter_id, executor_profile_id, local_dt_str).
# NOTE(review): the table path is a dated snapshot under a personal dev folder —
# confirm it is the intended, still-available source.
query = '''
    use hahn;

    $mph_dynamics = '//home/taxi-delivery/analytics/dev/gennadyand/sub_util/mph_dynamics_new/2023-07-01';

    select park_taximeter_id, executor_profile_id, local_dt_str, 
        sum(driver_net_inflow) as driver_net_inflow,
        sum(total_sh) as total_sh,
        sum(driving_sh + waiting_sh + transporting_sh) as effective_sh
    from $mph_dynamics
    where 1=1
        and agglomeration = 'Moscow'
        and courier_type = '6. cargo_courier'
    group by park_taximeter_id, executor_profile_id, local_dt_str
    ;
        '''

In [3]:
# Execute the YQL query through the `hahn` helper; later cells use `df` as a
# pandas DataFrame. Long-running: the saved progress output shows more than a
# minute elapsed.
df = hahn(query)

Execution is in progress [1m\[0m[90m 1 minute and 15 seconds elapsed[0m     

In [4]:
# Per-row rate: driver_net_inflow divided by total_sh, stored as a new `mph` column.
df['mph'] = df['driver_net_inflow'].div(df['total_sh'])

In [5]:
# Show the frame (the notebook renders a truncated head/tail view) to eyeball
# the new `mph` column next to its inputs.
df

Unnamed: 0,park_taximeter_id,executor_profile_id,local_dt_str,driver_net_inflow,total_sh,effective_sh,mph
0,b9dd45fbeb194c8996c6661deefed8d1,126edb382e424d928c6936c74b8d4826,2023-07-10,0.00000,0.092500,0.000000,0.000000
1,8de000cd8fdb4b20962c6e9038a204cd,aa421fa6b796478c8f6652806c328fdd,2023-07-08,0.00000,0.077778,0.000000,0.000000
2,31591f37750a4549bd7fb0988ad8be27,42af251958acdff4a77f66d4c1915dbe,2023-07-07,0.00000,0.009444,0.000000,0.000000
3,eaea0a8e6b5248f6b6ebaa398b11a2ca,fdbfb3f62b99436295d76bff025ce994,2023-07-15,0.00000,2.890278,0.000000,0.000000
4,815777309a8643fca88ebe1acc510c6c,c60befcfbfb048888f953e3d0bfdb2eb,2023-07-17,0.00000,4.401944,0.000000,0.000000
...,...,...,...,...,...,...,...
194357,7650a20210e1400da0dc6b499924b1ed,2cb51300b586dc010732b178960549f4,2023-07-26,9831.22016,11.277500,10.009444,871.755279
194358,31c987be40c140f08f8e224a1070be97,863fdf7bb21e4b0c81f9bc63421a46ce,2023-07-14,7453.50000,14.021389,6.172500,531.580720
194359,2a503d46a3ff45b49b05cf24de901f30,c6b690c0616148979badcdbc02f3a0c6,2023-07-21,4584.10600,13.338611,4.799444,343.671913
194360,9b9696ed84794eed8c877b4a83c8f76e,edbeb0c14c594847ae4971a763bb9013,2023-07-14,0.00000,0.045278,0.000000,0.000000


In [6]:
# Restrict each metric to its strictly positive observations so that rows with
# zero inflow do not pull the summary statistics below towards zero.
driver_net_inflow = df.loc[df['driver_net_inflow'] > 0, 'driver_net_inflow']
mph = df.loc[df['mph'] > 0, 'mph']

In [7]:
# Mean of the strictly positive driver_net_inflow values.
driver_net_inflow.mean()

4584.941677685682

In [8]:
# Quantiles of positive driver_net_inflow at levels 0.0, 0.1, ..., 0.9,
# truncated to int. range(10) stops at 0.9, so the maximum (level 1.0) is not shown.
{
    level: int(np.quantile(driver_net_inflow, level))
    for level in (step / 10 for step in range(10))
}

{0.0: 1,
 0.1: 1210,
 0.2: 1897,
 0.3: 2586,
 0.4: 3325,
 0.5: 4103,
 0.6: 4863,
 0.7: 5748,
 0.8: 6848,
 0.9: 8437}

In [9]:
# Mean of the strictly positive mph values.
mph.mean()

673.2783779158453

In [10]:
# Quantiles of positive mph at levels 0.0, 0.1, ..., 0.9, truncated to int.
# range(10) stops at 0.9, so the maximum (level 1.0) is not shown.
{
    level: int(np.quantile(mph, level))
    for level in (step / 10 for step in range(10))
}

{0.0: 0,
 0.1: 268,
 0.2: 363,
 0.3: 437,
 0.4: 500,
 0.5: 559,
 0.6: 621,
 0.7: 694,
 0.8: 795,
 0.9: 989}

In [11]:
# Collapse the per-day rows with positive inflow into one row per executor
# (park_taximeter_id, executor_profile_id), summing inflow and hour columns.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple of keys emits a FutureWarning on older pandas and raises on
# pandas 2.x.
df_agg = (
    df[df['driver_net_inflow'] > 0]
    .groupby(['park_taximeter_id', 'executor_profile_id'])[
        ['driver_net_inflow', 'total_sh', 'effective_sh']
    ]
    .sum()
    .reset_index()
)

# Recompute the rate from the summed totals rather than averaging daily rates.
df_agg['mph'] = df_agg['driver_net_inflow'] / df_agg['total_sh']

  df_agg = df[df['driver_net_inflow']>0].groupby(['park_taximeter_id', 'executor_profile_id'])[


In [12]:
# Drop executors whose mph lies at or above mean + 3 standard deviations.
# The cell rebinds `df_agg`, so re-running it recomputes the threshold on the
# already-trimmed frame.
mph_upper_bound = df_agg.mph.mean() + df_agg.mph.std() * 3
df_agg = df_agg[df_agg.mph < mph_upper_bound]

In [13]:
# Display the per-executor frame ordered from lowest to highest mph to inspect both tails.
df_agg.sort_values('mph', ascending=True)

Unnamed: 0,park_taximeter_id,executor_profile_id,driver_net_inflow,total_sh,effective_sh,mph
8844,b6b7ac60169741879aa0acf018548f14,8fe8333df11d42c78ae859ad30b3e96e,64.08000,3.461389,0.323611,18.512800
4655,612aadb1ef0d455d9682a77603524039,f91e9493bb4a4d2ea6d3ce02e204b1e4,102.40000,3.418889,0.251111,29.951251
11095,d60bb197c2eb4d19bb4c1f08dcd3aea6,5cf1942a09e4318a7fa9684ca9370efe,353.92500,10.766389,0.739722,32.873139
7683,9fcaaf0c56284dbcbe6174a14c293865,cd95890b88a94af38eeed1b24dddb6d1,630.41000,15.772778,1.130556,39.968229
12805,f434953455af4784b3ebfd3bad5a4259,fd03b12f80a5de5dc80a9555153e6645,89.67696,2.223333,0.456389,40.334465
...,...,...,...,...,...,...
9584,c3bb0128e5d64c8bbccee3921bddb859,17c5c02b0f1646da9cebbf4f8d295c06,3765.00000,0.328889,0.261389,11447.635135
8975,b6ca15e102164cd3910633e558323dc9,79f681554fdf4e078cec96c43a0f64f6,30186.85800,2.356944,2.223611,12807.623901
9416,c1843a1333e74614b036bd8e5efccb63,4db721d44c2845ffaf938101b06958ba,1645.00000,0.119444,0.020278,13772.093023
4585,60d642975dfd4e259de5569c51a7ae63,541f5aa6d58d496484330fb2fcd85e3a,6830.98000,0.475278,0.216111,14372.605494


In [None]:
# Static histogram of per-executor mph. `sns` comes from the notebook's top
# import cell, so this cell also works on a fresh Restart & Run All.
# The bin count mirrors the Plotly histogram in the next cell: the filtered mph
# values span roughly 18 to 14,000, so a fixed binwidth of 0.05 would ask for
# hundreds of thousands of bins.
sns.histplot(df_agg["mph"], bins=2000)

In [14]:
# Interactive histogram of per-executor mph, with nbins set to 2000.
fig = px.histogram(data_frame=df_agg, x="mph", nbins=2000)
fig.show()

In [15]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib

In [None]:
# Initialise Plotly's offline notebook mode.
# NOTE(review): this cell sits after the Plotly figure above and has no
# execution count — if it is needed for rendering, it probably belongs with
# the imports/configuration at the top of the notebook.
import plotly.offline as pyo
pyo.init_notebook_mode()

In [None]:
# Make 'plotly_mimetype' the default Plotly renderer for subsequent fig.show() calls.
# NOTE(review): placed after the only figure in the notebook and never
# executed — confirm whether it is still required.
import plotly.io as pio
pio.renderers.default = 'plotly_mimetype'

In [None]:
# Load seaborn's bundled 'tips' example dataset and preview its first rows.
# NOTE(review): looks like leftover scratch work unrelated to the mph analysis
# above; presumably safe to remove — confirm with the author.
tips_df = sns.load_dataset('tips')
tips_df.head()