In [2]:
!pip install pyarrow
!pip install fastparquet

Collecting pyarrow
  Using cached pyarrow-16.0.0-cp38-cp38-manylinux_2_28_x86_64.whl (40.8 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-16.0.0
Collecting fastparquet
  Using cached fastparquet-2024.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
Collecting cramjam>=2.3
  Using cached cramjam-2.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.8.3 fastparquet-2024.2.0


In [30]:
import pandas as pd
import numpy as np

# Read the history table and the five algorithm tables, converting the `ds`
# column of each one to str right after loading.
history, alg1, alg2, alg3, alg4, alg5 = (
    pd.read_parquet(parquet_path).assign(ds=lambda frame: frame["ds"].astype(str))
    for parquet_path in (
        './hm/simulated_data.parquet',
        './hm/algo/algo_1.parquet',
        './hm/algo/algo_2.parquet',
        './hm/algo/algo_3.parquet',
        './hm/algo/algo_4.parquet',
        './hm/algo/algo_5.parquet',
    )
)

In [31]:
def combine_history_and_algs(
    df: pd.DataFrame,
    algo1_df: pd.DataFrame,
    algo2_df: pd.DataFrame,
    algo3_df: pd.DataFrame,
    algo4_df: pd.DataFrame,
    algo5_df: pd.DataFrame,
) -> pd.DataFrame:
    """Left-join the markup of five algorithm frames onto ``df``.

    For the N-th algorithm frame (N = 1..5) its ``markup`` column is renamed
    to ``algo_N_markup`` and the frame is left-merged onto the running result
    using every column name the two frames have in common as the join key.

    Args:
        df: Base frame; all of its rows are kept (left join).
        algo1_df: Frame whose ``markup`` becomes ``algo_1_markup``.
        algo2_df: Frame whose ``markup`` becomes ``algo_2_markup``.
        algo3_df: Frame whose ``markup`` becomes ``algo_3_markup``.
        algo4_df: Frame whose ``markup`` becomes ``algo_4_markup``.
        algo5_df: Frame whose ``markup`` becomes ``algo_5_markup``.

    Returns:
        A new frame with the columns of ``df`` followed by the non-key
        columns contributed by each algorithm frame. The inputs are not
        modified.
    """
    combined = df
    algo_frames = (algo1_df, algo2_df, algo3_df, algo4_df, algo5_df)
    for number, algo_frame in enumerate(algo_frames, start=1):
        renamed = algo_frame.rename(columns={"markup": f"algo_{number}_markup"})
        # Join keys are recomputed at every step: after the rename the only
        # overlap left is whatever the algorithm frame shares with the
        # running result.
        join_keys = list(set(renamed.columns) & set(combined.columns))
        combined = combined.merge(renamed, how="left", on=join_keys)
    return combined

In [44]:
# Attach the markup of each of the five algorithm tables to the history rows.
# Arguments are passed by keyword so the frame-to-parameter pairing is explicit.
comb = combine_history_and_algs(
    df=history,
    algo1_df=alg1,
    algo2_df=alg2,
    algo3_df=alg3,
    algo4_df=alg4,
    algo5_df=alg5,
)

comb

Unnamed: 0,group_1,sku_id,ab_test_id,markup,revenue,traffic,orders_num,ds,algo_1_markup,algo_2_markup,algo_3_markup,algo_4_markup,algo_5_markup
0,group_1000,sku_100000,ab_100000,0.01,2539.41,0.05,1.0,20231008,0.06,0.02,0.01,0.02,0.05
1,group_1001,sku_100001,ab_100001,0.02,6057.44,0.05,3.0,20231008,0.04,0.06,0.04,0.02,0.00
2,group_1002,sku_100002,ab_100002,0.01,541.35,0.05,8.0,20231008,0.06,0.01,0.05,0.01,0.05
3,group_1002,sku_100003,ab_100003,0.06,697.70,0.05,0.0,20231008,0.06,0.01,0.05,0.01,0.05
4,group_1001,sku_100004,ab_100004,0.01,1413.99,0.05,0.0,20231008,0.04,0.06,0.04,0.02,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
511808,group_1002,sku_101093,ab_100019,0.03,178.71,0.05,2.0,20231021,0.05,0.06,0.00,0.03,0.03
511809,group_1012,sku_101412,ab_100013,0.03,88.71,0.05,2.0,20231021,0.06,0.00,0.04,0.00,0.01
511810,group_1024,sku_103568,ab_100017,0.03,55.29,0.05,1.0,20231021,0.05,0.06,0.04,0.02,0.05
511811,group_1001,sku_100697,ab_100004,0.02,8885.78,0.05,1.0,20231021,0.03,0.06,0.03,0.00,0.05


In [109]:
# Aggregations shared by both grouping stages: sums for revenue and orders,
# means for traffic and for every algorithm markup column.
agg_functions = {
    'revenue': 'sum',
    'traffic': 'mean',
    'orders_num': 'sum',
    **{f'algo_{i}_markup': 'mean' for i in range(1, 6)},
}

# Stage 1: one row per (ds, group_1, ab_test_id); here the observed markup is
# averaged as well, ahead of the shared aggregations.
per_test = (
    comb
    .groupby(['ds', 'group_1', 'ab_test_id'])
    .agg({'markup': 'mean', **agg_functions})
    .reset_index()
)

# Round the averaged markup to two decimals (it becomes a grouping key below)
# and divide the summed orders by the mean traffic.
per_test = per_test.assign(
    markup=lambda frame: frame['markup'].round(2),
    orders_num=lambda frame: frame['orders_num'] / frame['traffic'],
)

# Stage 2: collapse to one row per (ds, group_1, markup) with the shared
# aggregations only.
groupd_df = per_test.groupby(['ds', 'group_1', 'markup']).agg(agg_functions).reset_index()

In [110]:
# Number of decimals to keep per column: whole numbers for orders, two
# decimals for each algorithm markup.
decimals_by_column = {
    'orders_num': 0,
    **{f'algo_{i}_markup': 2 for i in range(1, 6)},
}
for column_name, decimals in decimals_by_column.items():
    groupd_df[column_name] = groupd_df[column_name].round(decimals)

# Drop the columns that are not part of the final table.
striped = groupd_df.drop(columns=['traffic', 'revenue'])

striped.head()

Unnamed: 0,ds,group_1,markup,orders_num,algo_1_markup,algo_2_markup,algo_3_markup,algo_4_markup,algo_5_markup
0,20231008,group_1000,0.01,59280000.0,0.06,0.02,0.01,0.02,0.05
1,20231008,group_1000,0.03,123008000.0,0.06,0.02,0.01,0.02,0.05
2,20231008,group_1000,0.04,76200000.0,0.06,0.02,0.01,0.02,0.05
3,20231008,group_1000,0.05,26336000.0,0.06,0.02,0.01,0.02,0.05
4,20231008,group_1001,0.01,83672000.0,0.04,0.06,0.04,0.02,0.0


In [111]:
# Write the prepared table to CSV without the index column.
# The original line exported `norm`, a name that is never defined in this
# notebook, so it raised NameError on a fresh Restart & Run All.
# NOTE(review): assumes `striped` (built in the previous cell) is the frame
# that was meant to be exported — confirm before relying on the file.
striped.to_csv('to_karp.csv', index=False)