In [14]:
import pandas as pd
import numpy as np

Доступные данные:
- data/clients.csv — информация о клиентах
- data/products.csv — информация о товарах
- data/purchases.csv — история покупок клиентов до смс кампании
- data/uplift_train.csv — обучающая выборка клиентов, информация о коммуникации и конверсии
- data/uplift_test.csv — тестовые клиенты, для которых необходимо оценить uplift

In [19]:
clients = pd.read_csv('./large_data/init_data//clients.csv')
products = pd.read_csv('./large_data/init_data/products.csv')
purchases = pd.read_csv('./large_data/init_data/purchases.csv')

In [21]:
aov_per_user = purchases[['client_id', 'transaction_id', 'purchase_sum']].drop_duplicates()\
                                                                         .groupby('client_id', as_index=False)\
                                                                         .purchase_sum.mean()

In [22]:
clients['city_type'] = (aov_per_user.purchase_sum >= aov_per_user.purchase_sum.median()).astype(int)

In [23]:
aov_per_user.merge(clients[['client_id', 'city_type']], on = 'client_id').groupby('city_type').purchase_sum.median() # средние aov по клиентам

city_type
0    261.696667
1    609.185238
Name: purchase_sum, dtype: float64

In [57]:
clients.to_csv('./large_data/raw_data/clients.csv', index=False)

**Делим на rich и poor**

In [26]:
purchases = purchases.merge(clients.drop(columns = ['first_issue_date', 'first_redeem_date']), on = 'client_id')

**Делим на Treatment и Control**

In [28]:
def split_by_client_id(df):
    unique_clients = df['client_id'].unique()

    np.random.shuffle(unique_clients)

    split_idx = len(unique_clients) // 2
    clients_group_0 = unique_clients[:split_idx]
    clients_group_1 = unique_clients[split_idx:]

    df['treatment_flg'] = -1

    df.loc[df['client_id'].isin(clients_group_0), 'treatment_flg'] = 0
    df.loc[df['client_id'].isin(clients_group_1), 'treatment_flg'] = 1
    
    # посчитаем колличество
    count_group_0 = df[df['treatment_flg'] == 0]['client_id'].nunique()
    count_group_1 = df[df['treatment_flg'] == 1]['client_id'].nunique()

    print(f"Количество клиентов в группе 0: {count_group_0}")
    print(f"Количество клиентов в группе 1: {count_group_1}")

    return df


In [29]:
rich = split_by_client_id(purchases.query('city_type == 1'))
poor = split_by_client_id(purchases.query('city_type == 0'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['treatment_flg'] = -1


Количество клиентов в группе 0: 100040
Количество клиентов в группе 1: 100041


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['treatment_flg'] = -1


Количество клиентов в группе 0: 100040
Количество клиентов в группе 1: 100041


In [55]:
rich.to_csv('./large_data/raw_data/rich_raw.csv', index=False)
poor.to_csv('./large_data/raw_data/poor_raw.csv', index=False)