In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

# load data

In [2]:
root = Path('data/processed/')

In [3]:
orders_count = pd.read_csv(root / 'orders_count.csv')
orders_count.head(3)

Unnamed: 0,date,count,product_name,city_id
0,2017-01-01,1,fit,2
1,2017-01-03,144,fit,2
2,2017-01-04,0,fit,2


In [4]:
new_orders_count = pd.read_csv(root / 'new_orders_count.csv')
new_orders_count.head(3)

Unnamed: 0,date,count,product_name,city_id
0,2017-01-03,22,fit,2
1,2017-01-04,0,fit,2
2,2017-01-06,4,fit,2


In [5]:
fact_deliveries_count = pd.read_csv(root / 'fact_deliveries_count.csv')
fact_deliveries_count.head(3)

Unnamed: 0,count,planned_delivery_date,city_id,production_type_id,name
0,34,2022-10-27,2,185,elementaree_1
1,168,2022-10-26,1,19,balance_2_2
2,1,2022-10-26,1,20,detox_2_2


In [6]:
f = lambda s: s.split('_')[0]
fact_deliveries_count = fact_deliveries_count.assign(product_name=fact_deliveries_count.name.apply(f))
fact_deliveries_count.head(3)

Unnamed: 0,count,planned_delivery_date,city_id,production_type_id,name,product_name
0,34,2022-10-27,2,185,elementaree_1,elementaree
1,168,2022-10-26,1,19,balance_2_2,balance
2,1,2022-10-26,1,20,detox_2_2,detox


# make dataset

In [7]:
city_id = 1
product = 'balance'

df1 = orders_count[(orders_count.city_id == city_id) & (orders_count.product_name == product)]
df2 = new_orders_count[(new_orders_count.city_id == city_id) & (new_orders_count.product_name == product)]
df3 = fact_deliveries_count[(fact_deliveries_count.city_id == city_id) & 
                            (fact_deliveries_count.product_name == product)] \
            .rename({'planned_delivery_date': 'date'}, axis=1) \
            .drop_duplicates('date')
cols = ['date', 'count']

df = pd.merge(df1[cols], df2[cols], on='date')
df = df.rename(dict(count_x='orders_count',
                    count_y='new_orders_count'), axis=1)
df = pd.merge(df, df3[cols], on='date').rename(dict(count='deliveries_count'), axis=1)
df.sort_values('date', inplace=True)
df.head(3)

Unnamed: 0,date,orders_count,new_orders_count,deliveries_count
0,2017-08-13,20,2,290
1,2017-08-16,15,8,343
2,2017-08-18,45,21,130


In [8]:
def fill_nan_dates(subdf: pd.DataFrame) -> pd.DataFrame:
    d = subdf.copy()
    d.index = pd.to_datetime(d.date)
    d = d.drop('date', axis=1)

    d = d.asfreq('1d')
    d.fillna(method='ffill', inplace=True)
    d = d.reset_index()
    return d

df = fill_nan_dates(df)
df = df.iloc[-200:]

In [9]:
df = df.rename(dict(date='timestamp', deliveries_count='target'), axis=1) \
       .assign(segment='main')
df.head()

Unnamed: 0,timestamp,orders_count,new_orders_count,target,segment
1701,2022-04-10,78.0,4.0,294.0,main
1702,2022-04-11,78.0,5.0,181.0,main
1703,2022-04-12,92.0,6.0,298.0,main
1704,2022-04-13,64.0,5.0,209.0,main
1705,2022-04-14,78.0,9.0,285.0,main


In [10]:
df_raw = df.copy()

In [11]:
df_raw.head()

Unnamed: 0,timestamp,orders_count,new_orders_count,target,segment
1701,2022-04-10,78.0,4.0,294.0,main
1702,2022-04-11,78.0,5.0,181.0,main
1703,2022-04-12,92.0,6.0,298.0,main
1704,2022-04-13,64.0,5.0,209.0,main
1705,2022-04-14,78.0,9.0,285.0,main


# corr

In [19]:
from scipy.stats.stats import pearsonr

for lag in range(0, 14):
    x = df_raw.target
    y = df_raw.new_orders_count
    
    if lag != 0:
        x = x.iloc[:-lag]
        y = y.iloc[lag:]
    
    corr, p = pearsonr(x, y)
    
    if p < 0.05:
        print(f'{lag} {corr:.2f} {p:.3f}')

2 0.16 0.021
7 0.17 0.019
9 0.20 0.005


In [18]:
from scipy.stats.stats import pearsonr

for lag in range(0, 14):
    x = df_raw.target
    y = df_raw.orders_count
    
    if lag != 0:
        x = x.iloc[:-lag]
        y = y.iloc[lag:]
    
    corr, p = pearsonr(x, y)
    if p < 0.05:
        print(f'{lag} {corr:.2f} {p:.3f}')

0 0.43 0.000
5 0.36 0.000
6 -0.15 0.041
7 0.27 0.000
12 0.28 0.000
13 -0.22 0.002
