https://jira.x5.ru/browse/CVMXC-2732

In [1]:
# Task identifier: passed to restart_spark below and used as the prefix of the exported CSV file names.
name = 'CVMXC-2732_chudo'

In [2]:
import sys
import pandas as pd
import pyspark.sql.functions as F
sys.path.append('/home/jovyan/glow-byte-filters-pyspark')
sys.path.append('/home/jovyan/x5_some_tasks/Osipov/')
import datetime
from datetime import timedelta
from spark import *
from utils_osipov import *
from logic_filters import *
from segmentation import *

%load_ext autoreload
%autoreload 2

# Notebook display settings: floats with thousands separators and two decimals, up to 500 rows per frame.
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = 500

In [3]:
# Create the Spark context/session for this task via the project helper restart_spark,
# then keep only ERROR-level messages in the log.
# NOTE(review): the positional 100 is presumably the number of executors — confirm against restart_spark.
sc, spark = restart_spark(
    name,
    100,
    executor_memory="3G",
    executor_cores=5,
    driver_memory="12G",
    additional_params={"spark.sql.shuffle.partitions": "200"},
)
sc.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Source table names, resolved later with spark.table(...).
# Loyalty-program tables and the guest table the base selection is read from.
LOYALTY_CARDS = "hive_ssa_tc5.loyalty_card"
LOYALTY_CARDHOLDERS = "hive_ssa_tc5.loyalty_cardholder"
ACCOUNTS = "hive_ssa_tc5.account"  # not referenced in the visible part of this notebook
CVM5_GUESTS = "hive_cvm_acrm.cvm5_guests"

# Store dimension, check headers, check items and the product reference.
DIM_STORE = "hive_ssa_main.dim_store"
CHECKS_HEADERS = "hive_ssa_main.fct_rtl_txn"
CHECKS_ITEMS = "hive_ssa_main.fct_rtl_txn_item"
PRODUCTS = "hive_ssa_tc5.cvm_product"

### Выбираем гостей нужного юзкейса

In [5]:
# Parameters of the base guest selection.
usecase = ['upgrade', 'cross', 'ump']  # use cases to include; the order matters, cells below index usecase[0..2]
# NOTE(review): dt is the run date, so the selection (and the CSV file names built from dt)
# change from day to day; pin a fixed date if the result must be reproducible.
dt = datetime.date.today()
lifetime = 90  # lower bound for the 'lifetime' column
freq = 1  # lower bound for the 'frequency' column

In [6]:
# Base selection: guests calculated on dt that belong to one of the chosen use cases
# and meet the lifetime / frequency lower bounds. Only the identifiers are kept.
customers_usecase = (
    spark.table(CVM5_GUESTS)
    .filter(
        (F.to_date('calculation_dt') == dt)
        & F.col('usecase').isin(usecase)
        & (F.col('lifetime') >= lifetime)
        & (F.col('frequency') >= freq)
    )
    .select('account_no', 'customer_rk')
)


In [7]:
print(customers_usecase.count())  # size of the base selection; depends on the run date (dt) — the saved run printed 15874480



15874480


                                                                                

### 1 проверяем на доступность отобранных гостей в определенную дату

In [8]:
# Date passed as check_date to the channel filter helpers below.
check_date = '2022-03-10'

In [9]:
# Distinct customer_rk returned by the SMS channel filters for usecase[0] on check_date.
seg_sms1 = (
    sms_channel_filters_glowbyte(
        spark=spark,
        guests=customers_usecase,
        usecase_name=usecase[0],
        check_date=check_date,
        debug_mode=False,
    )
    .select('customer_rk')
    .distinct()
)

[93m Время выполнения: 0:00:16


In [10]:
# Distinct customer_rk returned by the SMS channel filters for usecase[1] on check_date.
seg_sms2 = (
    sms_channel_filters_glowbyte(
        spark=spark,
        guests=customers_usecase,
        usecase_name=usecase[1],
        check_date=check_date,
        debug_mode=False,
    )
    .select('customer_rk')
    .distinct()
)

[93m Время выполнения: 0:00:07


In [11]:
# Distinct customer_rk returned by the SMS channel filters for usecase[2] on check_date.
seg_sms3 = (
    sms_channel_filters_glowbyte(
        spark=spark,
        guests=customers_usecase,
        usecase_name=usecase[2],
        check_date=check_date,
        debug_mode=False,
    )
    .select('customer_rk')
    .distinct()
)

[93m Время выполнения: 0:00:11


In [12]:
# All guests that passed the SMS filters for at least one use case (duplicates across use cases removed).
seg_sms = seg_sms1.union(seg_sms2).union(seg_sms3).distinct()

In [13]:
# Write the SMS segment to parquet and read it back, so later cells read the stored files
# instead of re-running the channel filters. The path is relative and is overwritten on every run.
seg_sms.write.parquet('temp2732_1', mode='overwrite')
seg_sms = spark.read.parquet('temp2732_1')

                                                                                51]]]]

In [14]:
seg_sms.count()  # distinct customer_rk that passed the SMS channel filters

                                                                                

6678147

In [15]:
# Distinct customer_rk returned by the slip-check channel filters for usecase[0] on check_date.
slip1 = (
    slip_check_channel_filters_glowbyte(
        spark=spark,
        guests=customers_usecase,
        usecase_name=usecase[0],
        check_date=check_date,
        debug_mode=False,
    )
    .select('customer_rk')
    .distinct()
)

[93m Время выполнения: 0:00:01


In [16]:
# Distinct customer_rk returned by the slip-check channel filters for usecase[1] on check_date.
slip2 = (
    slip_check_channel_filters_glowbyte(
        spark=spark,
        guests=customers_usecase,
        usecase_name=usecase[1],
        check_date=check_date,
        debug_mode=False,
    )
    .select('customer_rk')
    .distinct()
)

[93m Время выполнения: 0:00:02


In [17]:
# Distinct customer_rk returned by the slip-check channel filters for usecase[2] on check_date.
slip3 = (
    slip_check_channel_filters_glowbyte(
        spark=spark,
        guests=customers_usecase,
        usecase_name=usecase[2],
        check_date=check_date,
        debug_mode=False,
    )
    .select('customer_rk')
    .distinct()
)

[93m Время выполнения: 0:00:01


In [18]:
# All guests that passed the slip-check filters for at least one use case (duplicates removed).
slip = slip1.union(slip2).union(slip3).distinct()

In [19]:
# Write the slip-check segment to parquet and read it back, so later cells read the stored files
# instead of re-running the channel filters. The path is relative and is overwritten on every run.
slip.write.parquet('temp2732_2', mode='overwrite')
slip = spark.read.parquet('temp2732_2')

                                                                                 200]]

In [20]:
slip.count()  # distinct customer_rk that passed the slip-check channel filters

                                                                                

13280251

### Проверка доступности PLU в магазинах

Воронеж и Воронежская область

In [21]:
# PLU codes that are matched against the store assortment below.
plu_codes = [
    3628396, 3628397, 3628398, 3628399, 3628400,
    3628404, 3644386, 3644385, 3933627, 4037142,
    4102133, 4108181, 4192985, 4200706,
]

In [22]:
# Values of dim_store.federal_subject_dk to keep (the markdown above names Voronezh and Voronezh Oblast).
federal_subject_dk = [36]

In [23]:
# Single-column Spark frame with the PLU codes; it is broadcast in the assortment join below.
pplu = spark.createDataFrame(pd.DataFrame({"plu_code": plu_codes}))

# Stores of the selected federal subjects.
# NOTE(review): valid_to_dttm == 5999-01-01 presumably marks the currently valid record — confirm.
shops = (
    spark.table(DIM_STORE)
    .filter(
        (F.col('valid_to_dttm') == datetime.datetime(5999, 1, 1, 0, 0))
        & F.col('federal_subject_dk').isin(federal_subject_dk)
    )
    .select(F.col('store_id').alias('plant'))
    .distinct()
)

# (PLU, store) assortment pairs restricted to the PLU codes of interest.
# NOTE(review): `!= 1` also drops rows where plu_negate_flg is NULL — confirm that is intended.
tc5_stores_assort = (
    spark.table('HIVE_SSA_MAIN.ASSORTMENT_X_PLU_X_STORE')
    .filter(F.col("plu_negate_flg") != 1)
    .withColumnRenamed('plu_id', 'plu_code')
    .withColumnRenamed('store_id', 'plant')
    .select('plu_code', 'plant')
    .join(F.broadcast(pplu), on='plu_code', how='inner')
)

# Stores of the region that carry at least one of the PLU codes, collected as a Python list.
plants = (
    shops
    .join(tc5_stores_assort, 'plant')
    .select('plant')
    .distinct()
    .toPandas()['plant']
    .tolist()
)

len(plants)

                                                                                

513

### 1

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты из категории plu_hierarchy_lvl_4_dk = 'FR0609002'

In [5]:
# Purchase-history window; both bounds are inclusive because the filters below use between().
start_date = datetime.date(2021, 11, 21)
end_date = datetime.date(2022, 2, 20)

In [25]:
# Check headers inside the window that carry a non-empty loyalty card number,
# have financial_unit_format_dk == 'D' and rtl_txn_cancel_flg == 0.
# Only the keys needed for the later joins are kept.
checks_headers = (
    spark.table(CHECKS_HEADERS)
    .filter(
        F.col('rtl_txn_dt').between(start_date, end_date)
        & (F.col('loyalty_card_no') != '')
        & F.col('loyalty_card_no').isNotNull()
        & (F.col('financial_unit_format_dk') == 'D')
        & (F.col('rtl_txn_cancel_flg') == 0)
    )
    .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
)


In [26]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants))  # keep only checks from the selected stores (SMS branch)

In [30]:
checks_headers_tc52 = checks_headers.filter(F.col('store_id').isin(plants))  # keep only checks from the selected stores (slip branch, same filter)

In [27]:
# Loyalty card -> account mapping.
loyalty_cards = (
    spark.table(LOYALTY_CARDS)
    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
    .withColumnRenamed("loyalty_account_id", "account_no")
    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")  # renamed, but not selected below
    .select('account_no', 'loyalty_card_no')
)

# Account -> cardholder (customer_rk) mapping.
loyalty_cardholders = (
    spark.table(LOYALTY_CARDHOLDERS)
    .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
    .withColumnRenamed("loyalty_account_id", "account_no")
    .select('account_no', 'customer_rk')
)

# Cards of the guests in the SMS segment.
clients_info = (
    loyalty_cards
    .join(loyalty_cardholders, 'account_no', 'inner')
    .join(seg_sms, 'customer_rk', 'inner')
)

# Cards of the guests in the slip-check segment.
clients_info2 = (
    loyalty_cards
    .join(loyalty_cardholders, 'account_no', 'inner')
    .join(slip, 'customer_rk', 'inner')
)

In [28]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no')  # keep only checks of the SMS-segment guests

In [31]:
checks_headers_tc52 = checks_headers_tc52.join(clients_info2, on='loyalty_card_no')  # keep only checks of the slip-segment guests

In [32]:
# Check items inside the window with non-negative turnover and quantity and positive cost.
# Only the PLU code, the check id and the quantity are kept.
checks_items = (
    spark.table(CHECKS_ITEMS)
    .withColumnRenamed('plu_id', 'plu_code')
    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
    .withColumnRenamed('discount_amt', 'zdiscount')
    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
    .filter(
        F.col('rtl_txn_dt').between(start_date, end_date)
        & (F.col('zsalnovat') >= 0)
        & (F.col('base_qty') >= 0)
        & (F.col('zcst_n') > 0)
    )
    .select('plu_code', 'rtl_txn_id', 'base_qty')
)


In [33]:
# Product filter for segment 1: values of plu_hierarchy_lvl_4_dk to keep.
plu_hierarchy_lvl_4_dk = ['FR0609002']
# Alternative product filters, disabled for this segment:
# syntethic_category_id = [53]
# plu_brand_code = ['2452']
# plu_vendor_nm = ['Данон']
# plu_not_in = [4138521]

In [34]:
# Distinct plu_id values of the products in the selected level-4 hierarchy node(s), as a Python list.
# The vendor / synthetic category / brand / exclusion filters are not applied for this segment.
plu_codes_cat = (
    spark.table(PRODUCTS)
    .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
    .select('plu_id')
    .distinct()
    .toPandas()['plu_id']
    .tolist()
)

                                                                                

In [35]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat))  # keep only items with the target PLU codes

In [36]:
# Target items joined to the SMS-branch check headers (store- and guest-filtered).
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [37]:
# Target items joined to the slip-branch check headers (store- and guest-filtered).
checks_tc52 = checks_items.join(checks_headers_tc52, 'rtl_txn_id', how='inner')

In [38]:
# Keep only accounts whose total purchased quantity of the target products over the window is at least 1.
# NOTE(review): if an account has several cardholders, the join with clients_info repeats each item row
# per customer_rk, which inflates the per-account sum — confirm this cannot happen.
# NOTE: re-running this cell joins base_qty_per_period onto checks_tc5 a second time.
accs = ['account_no']
pdf = (
    checks_tc5
    .groupby(accs)
    .agg(F.sum('base_qty').alias('base_qty_per_period'))
    .filter(F.col('base_qty_per_period') >= 1)
)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [39]:
# Same quantity threshold for the slip branch (uses accs defined in the previous cell).
pdf2 = (
    checks_tc52
    .groupby(accs)
    .agg(F.sum('base_qty').alias('base_qty_per_period'))
    .filter(F.col('base_qty_per_period') >= 1)
)
checks_tc52 = checks_tc52.join(pdf2, on=accs)

In [40]:
# Distinct buyers from the SMS branch, labelled with group = 'sms'.
seg1 = (
    checks_tc5
    .select('customer_rk', F.lit('sms').alias('group'))
    .distinct()
)

In [41]:
# Distinct buyers from the slip branch, labelled with group = 'slip'.
seg2 = (
    checks_tc52
    .select('customer_rk', F.lit('slip').alias('group'))
    .distinct()
)

In [42]:
# Both channel groups in one frame; a guest present in both channels appears once per group.
seg = seg1.union(seg2).distinct()

In [43]:
# Write segment 1 to parquet and read it back, so toPandas() below reads the stored files.
# The path is relative and is overwritten on every run.
seg.write.parquet('temp2732_seg1', mode='overwrite')
seg = spark.read.parquet('temp2732_seg1')

                                                                                00]0]]]]]

In [44]:
# Collect segment 1 to the driver as a pandas DataFrame.
seg_pd1 = seg.toPandas()

                                                                                

In [45]:
# Preview of the collected segment (columns: customer_rk, group).
seg_pd1.head()

Unnamed: 0,customer_rk,group
0,13589592,sms
1,68238125,sms
2,44794047,sms
3,13683500,sms
4,18675687,sms


In [46]:
# Segment label used in the exported CSV file name.
seg_no = 'seg1'

In [47]:
# Export the whole segment (both groups) to <name>_<seg_no>_All_uc<dt>.csv in the working directory.
seg_pd1.to_csv(name + '_' + seg_no + '_All_uc' + str(dt) + '.csv', index=False)

In [48]:
# Cast the ids to strings in place, after the CSV export.
# NOTE(review): presumably get_usecases expects string customer_rk — confirm against the helper.
seg_pd1['customer_rk'] = seg_pd1['customer_rk'].astype('str')

In [49]:
# Split segment 1 by channel group.
seg_sms_pd = seg_pd1.loc[seg_pd1['group'] == 'sms']
seg_slip_pd = seg_pd1.loc[seg_pd1['group'] == 'slip']

In [50]:
# Split the SMS group with the project helper get_usecases; its three results are unpacked as cross / upgrade / ump.
cross, upgrade, ump = get_usecases(seg_sms_pd)

In [51]:
# Report the size of each SMS use-case split and their total.
print('cross: {}, upgrade: {}, ump: {} (all: {})'.format(len(cross), len(upgrade), len(ump), len(cross)+len(upgrade)+len(ump)))

cross: 22719, upgrade: 36738, ump: 3968 (all: 63425)


In [52]:
# Same split for the slip group; this overwrites cross / upgrade / ump from the SMS split.
cross, upgrade, ump = get_usecases(seg_slip_pd)

In [53]:
# Report the size of each slip use-case split and their total.
print('cross: {}, upgrade: {}, ump: {} (all: {})'.format(len(cross), len(upgrade), len(ump), len(cross)+len(upgrade)+len(ump)))

cross: 51673, upgrade: 56825, ump: 22812 (all: 131310)


### 2

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты из заданного списка PLU

In [54]:
# Purchase-history window for segment 2; both bounds are inclusive because the filters below use between().
start_date = datetime.date(2021, 11, 21)
end_date = datetime.date(2022, 2, 20)

In [55]:
# Check headers inside the window that carry a non-empty loyalty card number,
# have financial_unit_format_dk == 'D' and rtl_txn_cancel_flg == 0.
# Only the keys needed for the later joins are kept.
checks_headers = (
    spark.table(CHECKS_HEADERS)
    .filter(
        F.col('rtl_txn_dt').between(start_date, end_date)
        & (F.col('loyalty_card_no') != '')
        & F.col('loyalty_card_no').isNotNull()
        & (F.col('financial_unit_format_dk') == 'D')
        & (F.col('rtl_txn_cancel_flg') == 0)
    )
    .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
)


In [56]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants))  # keep only checks from the selected stores (SMS branch)

In [57]:
checks_headers_tc52 = checks_headers.filter(F.col('store_id').isin(plants))  # keep only checks from the selected stores (slip branch, same filter)

In [58]:
# Loyalty card -> account mapping.
loyalty_cards = (
    spark.table(LOYALTY_CARDS)
    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
    .withColumnRenamed("loyalty_account_id", "account_no")
    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")  # renamed, but not selected below
    .select('account_no', 'loyalty_card_no')
)

# Account -> cardholder (customer_rk) mapping.
loyalty_cardholders = (
    spark.table(LOYALTY_CARDHOLDERS)
    .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
    .withColumnRenamed("loyalty_account_id", "account_no")
    .select('account_no', 'customer_rk')
)

# Cards of the guests in the SMS segment.
clients_info = (
    loyalty_cards
    .join(loyalty_cardholders, 'account_no', 'inner')
    .join(seg_sms, 'customer_rk', 'inner')
)

# Cards of the guests in the slip-check segment.
clients_info2 = (
    loyalty_cards
    .join(loyalty_cardholders, 'account_no', 'inner')
    .join(slip, 'customer_rk', 'inner')
)

In [59]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no')  # keep only checks of the SMS-segment guests

In [60]:
checks_headers_tc52 = checks_headers_tc52.join(clients_info2, on='loyalty_card_no')  # keep only checks of the slip-segment guests

In [61]:
# Check items inside the window with non-negative turnover and quantity and positive cost.
# Only the PLU code, the check id and the quantity are kept.
checks_items = (
    spark.table(CHECKS_ITEMS)
    .withColumnRenamed('plu_id', 'plu_code')
    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
    .withColumnRenamed('discount_amt', 'zdiscount')
    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
    .filter(
        F.col('rtl_txn_dt').between(start_date, end_date)
        & (F.col('zsalnovat') >= 0)
        & (F.col('base_qty') >= 0)
        & (F.col('zcst_n') > 0)
    )
    .select('plu_code', 'rtl_txn_id', 'base_qty')
)


In [62]:
# Product filter for segment 2: an explicit list of PLU codes.
plu_codes_cat = [
    3952832, 2083865, 3363986, 3445976, 3956037, 2083868,
    3614200, 3365419, 3472481, 2083867, 3445980, 3392438,
    3472480, 3445978, 3363988, 2083870, 3956036, 3956035,
]

In [63]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat))  # keep only items with the target PLU codes

In [64]:
# Target items joined to the SMS-branch check headers (store- and guest-filtered).
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [65]:
# Target items joined to the slip-branch check headers (store- and guest-filtered).
checks_tc52 = checks_items.join(checks_headers_tc52, 'rtl_txn_id', how='inner')

In [66]:
# Keep only accounts whose total purchased quantity of the target products over the window is at least 1.
# NOTE(review): if an account has several cardholders, the join with clients_info repeats each item row
# per customer_rk, which inflates the per-account sum — confirm this cannot happen.
# NOTE: re-running this cell joins base_qty_per_period onto checks_tc5 a second time.
accs = ['account_no']
pdf = (
    checks_tc5
    .groupby(accs)
    .agg(F.sum('base_qty').alias('base_qty_per_period'))
    .filter(F.col('base_qty_per_period') >= 1)
)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [67]:
# Same quantity threshold for the slip branch (uses accs defined in the previous cell).
pdf2 = (
    checks_tc52
    .groupby(accs)
    .agg(F.sum('base_qty').alias('base_qty_per_period'))
    .filter(F.col('base_qty_per_period') >= 1)
)
checks_tc52 = checks_tc52.join(pdf2, on=accs)

In [68]:
# Distinct buyers from the SMS branch, labelled with group = 'sms'.
seg1 = (
    checks_tc5
    .select('customer_rk', F.lit('sms').alias('group'))
    .distinct()
)

In [69]:
# Distinct buyers from the slip branch, labelled with group = 'slip'.
seg2 = (
    checks_tc52
    .select('customer_rk', F.lit('slip').alias('group'))
    .distinct()
)

In [70]:
# Both channel groups in one frame; a guest present in both channels appears once per group.
seg = seg1.union(seg2).distinct()

In [None]:
# Write segment 2 to parquet and read it back, so toPandas() below reads the stored files.
# The path is relative and is overwritten on every run.
seg.write.parquet('temp2732_seg2', mode='overwrite')
seg = spark.read.parquet('temp2732_seg2')

In [72]:
# Collect segment 2 to the driver as a pandas DataFrame.
seg_pd2 = seg.toPandas()

                                                                                

In [73]:
# Preview of the collected segment (columns: customer_rk, group).
seg_pd2.head()

Unnamed: 0,customer_rk,group
0,80969743,sms
1,15845727,sms
2,105367445,sms
3,13396596,sms
4,55407176,sms


In [74]:
# Segment label used in the exported CSV file name.
seg_no = 'seg2'

In [75]:
# Export the whole segment (both groups) to <name>_<seg_no>_All_uc<dt>.csv in the working directory.
seg_pd2.to_csv(name + '_' + seg_no + '_All_uc' + str(dt) + '.csv', index=False)

In [76]:
# Cast the ids to strings in place, after the CSV export.
# NOTE(review): presumably get_usecases expects string customer_rk — confirm against the helper.
seg_pd2['customer_rk'] = seg_pd2['customer_rk'].astype('str')

In [77]:
# Split segment 2 by channel group.
seg_sms_pd = seg_pd2.loc[seg_pd2['group'] == 'sms']
seg_slip_pd = seg_pd2.loc[seg_pd2['group'] == 'slip']

In [78]:
# Split the SMS group with the project helper get_usecases; its three results are unpacked as cross / upgrade / ump.
cross, upgrade, ump = get_usecases(seg_sms_pd)

In [79]:
# Report the size of each SMS use-case split and their total.
print('cross: {}, upgrade: {}, ump: {} (all: {})'.format(len(cross), len(upgrade), len(ump), len(cross)+len(upgrade)+len(ump)))

cross: 8526, upgrade: 13934, ump: 1572 (all: 24032)


In [80]:
# Same split for the slip group; this overwrites cross / upgrade / ump from the SMS split.
cross, upgrade, ump = get_usecases(seg_slip_pd)

In [81]:
# Report the size of each slip use-case split and their total.
print('cross: {}, upgrade: {}, ump: {} (all: {})'.format(len(cross), len(upgrade), len(ump), len(cross)+len(upgrade)+len(ump)))

cross: 19629, upgrade: 21584, ump: 8831 (all: 50044)


### 3

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с syntethic_category_id 51 и 52

In [82]:
# Purchase-history window for segment 3; both bounds are inclusive because the filters below use between().
start_date = datetime.date(2021, 11, 21)
end_date = datetime.date(2022, 2, 20)

In [83]:
# Check headers inside the window that carry a non-empty loyalty card number,
# have financial_unit_format_dk == 'D' and rtl_txn_cancel_flg == 0.
# Only the keys needed for the later joins are kept.
checks_headers = (
    spark.table(CHECKS_HEADERS)
    .filter(
        F.col('rtl_txn_dt').between(start_date, end_date)
        & (F.col('loyalty_card_no') != '')
        & F.col('loyalty_card_no').isNotNull()
        & (F.col('financial_unit_format_dk') == 'D')
        & (F.col('rtl_txn_cancel_flg') == 0)
    )
    .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
)


In [84]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants))  # keep only checks from the selected stores (SMS branch)

In [85]:
checks_headers_tc52 = checks_headers.filter(F.col('store_id').isin(plants))  # keep only checks from the selected stores (slip branch, same filter)

In [86]:
# Loyalty card -> account mapping.
loyalty_cards = (
    spark.table(LOYALTY_CARDS)
    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
    .withColumnRenamed("loyalty_account_id", "account_no")
    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")  # renamed, but not selected below
    .select('account_no', 'loyalty_card_no')
)

# Account -> cardholder (customer_rk) mapping.
loyalty_cardholders = (
    spark.table(LOYALTY_CARDHOLDERS)
    .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
    .withColumnRenamed("loyalty_account_id", "account_no")
    .select('account_no', 'customer_rk')
)

# Cards of the guests in the SMS segment.
clients_info = (
    loyalty_cards
    .join(loyalty_cardholders, 'account_no', 'inner')
    .join(seg_sms, 'customer_rk', 'inner')
)

# Cards of the guests in the slip-check segment.
clients_info2 = (
    loyalty_cards
    .join(loyalty_cardholders, 'account_no', 'inner')
    .join(slip, 'customer_rk', 'inner')
)

In [87]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no')  # keep only checks of the SMS-segment guests

In [88]:
checks_headers_tc52 = checks_headers_tc52.join(clients_info2, on='loyalty_card_no')  # keep only checks of the slip-segment guests

In [89]:
# Check items inside the window with non-negative turnover and quantity and positive cost.
# Only the PLU code, the check id and the quantity are kept.
checks_items = (
    spark.table(CHECKS_ITEMS)
    .withColumnRenamed('plu_id', 'plu_code')
    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
    .withColumnRenamed('discount_amt', 'zdiscount')
    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
    .filter(
        F.col('rtl_txn_dt').between(start_date, end_date)
        & (F.col('zsalnovat') >= 0)
        & (F.col('base_qty') >= 0)
        & (F.col('zcst_n') > 0)
    )
    .select('plu_code', 'rtl_txn_id', 'base_qty')
)


In [90]:
# Product filter for segment 3: values of syntethic_category_id to keep.
# The other filters below are disabled for this segment.
# plu_hierarchy_lvl_4_dk = ['FR0609002']
syntethic_category_id = [51, 52]
# plu_brand_code = ['2452']
# plu_vendor_nm = ['Данон']
# plu_not_in = [4138521]

In [91]:
# Distinct plu_id values of the products in the selected synthetic categories, as a Python list.
# The hierarchy / vendor / brand / exclusion filters are not applied for this segment.
plu_codes_cat = (
    spark.table(PRODUCTS)
    .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
    .select('plu_id')
    .distinct()
    .toPandas()['plu_id']
    .tolist()
)

                                                                                

In [92]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat))  # keep only items with the target PLU codes

In [93]:
# Target items joined to the SMS-branch check headers (store- and guest-filtered).
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [94]:
# Target items joined to the slip-branch check headers (store- and guest-filtered).
checks_tc52 = checks_items.join(checks_headers_tc52, 'rtl_txn_id', how='inner')

In [95]:
# Keep only accounts whose total purchased quantity of the target products over the window is at least 1.
# NOTE(review): if an account has several cardholders, the join with clients_info repeats each item row
# per customer_rk, which inflates the per-account sum — confirm this cannot happen.
# NOTE: re-running this cell joins base_qty_per_period onto checks_tc5 a second time.
accs = ['account_no']
pdf = (
    checks_tc5
    .groupby(accs)
    .agg(F.sum('base_qty').alias('base_qty_per_period'))
    .filter(F.col('base_qty_per_period') >= 1)
)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [96]:
# Same quantity threshold for the slip branch (uses accs defined in the previous cell).
pdf2 = (
    checks_tc52
    .groupby(accs)
    .agg(F.sum('base_qty').alias('base_qty_per_period'))
    .filter(F.col('base_qty_per_period') >= 1)
)
checks_tc52 = checks_tc52.join(pdf2, on=accs)

In [97]:
# Distinct buyers from the SMS branch, labelled with group = 'sms'.
seg1 = (
    checks_tc5
    .select('customer_rk', F.lit('sms').alias('group'))
    .distinct()
)

In [98]:
# Distinct buyers from the slip branch, labelled with group = 'slip'.
seg2 = (
    checks_tc52
    .select('customer_rk', F.lit('slip').alias('group'))
    .distinct()
)

In [99]:
# Both channel groups in one frame; a guest present in both channels appears once per group.
seg = seg1.union(seg2).distinct()

In [100]:
# Write segment 3 to parquet and read it back, so toPandas() below reads the stored files.
# The path is relative and is overwritten on every run.
seg.write.parquet('temp2732_seg3', mode='overwrite')
seg = spark.read.parquet('temp2732_seg3')

                                                                                 200]0]]

In [101]:
# Collect segment 3 to the driver as a pandas DataFrame.
seg_pd3 = seg.toPandas()

                                                                                

In [102]:
# Preview of the collected segment (columns: customer_rk, group).
seg_pd3.head()

Unnamed: 0,customer_rk,group
0,82181875,sms
1,28822048,sms
2,60517045,sms
3,65871515,sms
4,97316988,sms


In [103]:
# Segment label used in the exported CSV file name.
seg_no = 'seg3'

In [104]:
# Export the whole segment (both groups) to <name>_<seg_no>_All_uc<dt>.csv in the working directory.
seg_pd3.to_csv(name + '_' + seg_no + '_All_uc' + str(dt) + '.csv', index=False)

In [105]:
# Cast the ids to strings in place, after the CSV export.
# NOTE(review): presumably get_usecases expects string customer_rk — confirm against the helper.
seg_pd3['customer_rk'] = seg_pd3['customer_rk'].astype('str')

In [106]:
# Split segment 3 by channel group.
seg_sms_pd = seg_pd3.loc[seg_pd3['group'] == 'sms']
seg_slip_pd = seg_pd3.loc[seg_pd3['group'] == 'slip']

In [107]:
# Split the SMS group with the project helper get_usecases; its three results are unpacked as cross / upgrade / ump.
cross, upgrade, ump = get_usecases(seg_sms_pd)

In [108]:
# Report the size of each SMS use-case split and their total.
print('cross: {}, upgrade: {}, ump: {} (all: {})'.format(len(cross), len(upgrade), len(ump), len(cross)+len(upgrade)+len(ump)))

cross: 53293, upgrade: 83361, ump: 11111 (all: 147765)


In [109]:
# Same split for the slip group; this overwrites cross / upgrade / ump from the SMS split.
cross, upgrade, ump = get_usecases(seg_slip_pd)

In [110]:
# Report the size of each slip use-case split and their total.
print('cross: {}, upgrade: {}, ump: {} (all: {})'.format(len(cross), len(upgrade), len(ump), len(cross)+len(upgrade)+len(ump)))

cross: 118822, upgrade: 130057, ump: 53407 (all: 302286)


In [6]:
# Per-line unit metrics for check items inside the window:
#   fm         = (turnover without VAT - prime cost without VAT) / quantity
#   cost_vat   = turnover with VAT / quantity
#   cost_novat = turnover without VAT / quantity
# Rows are filtered BEFORE the ratios are computed and base_qty must be strictly positive.
# The original filter allowed base_qty == 0, so the ratios divided by zero: that yields NULL
# in Spark's default mode (the later avg() skips those rows) and raises an error when
# spark.sql.ansi.enabled is on. Requiring base_qty > 0 keeps the default-mode averages
# unchanged and makes the cell safe in ANSI mode.
# NOTE: start_date / end_date come from an earlier cell and must be defined before this one runs.
checks_items = (spark.table(CHECKS_ITEMS)
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    # keep only correct data; base_qty > 0 guards the divisions below
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') > 0) & (F.col('zcst_n') > 0))
                    .withColumn('fm', (F.col('zsalnovat') - F.col('zcst_n')) / F.col('base_qty'))
                    .withColumn('cost_vat', F.col('zsale_vat') / F.col('base_qty'))
                    .withColumn('cost_novat', F.col('zsalnovat') / F.col('base_qty'))
                    .select('plu_code'  # item id
                            , 'fm'
                            , 'cost_vat'
                            , 'cost_novat'
                            )
               )


In [7]:
# PLU codes for which the average unit metrics are computed.
plu = [4192988, 4192987, 4192986]

In [8]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu))  # keep only items with the target PLU codes

In [9]:
# Average per-line unit metrics for each PLU (simple mean over item lines, not quantity-weighted).
checks_items_agg = checks_items.groupby('plu_code').agg(
    F.avg('fm').alias('fm'),
    F.avg('cost_vat').alias('cost_vat'),
    F.avg('cost_novat').alias('cost_novat'),
)

In [10]:
# Collect the per-PLU averages to the driver as a pandas DataFrame.
checks_items_pd = checks_items_agg.toPandas()

                                                                                

In [15]:
# Convert the three metric columns to float64.
# The original assigned through .iloc[:, 1:4], which writes the values into the existing columns
# and keeps their dtype (pandas >= 2.0 always tries to set in place), so object-dtype columns
# stayed object. astype() with explicit column names replaces the columns with real float columns
# and does not depend on column order.
# NOTE(review): the averages presumably arrive as Decimal objects from Spark — confirm.
checks_items_pd = checks_items_pd.astype({'fm': 'float', 'cost_vat': 'float', 'cost_novat': 'float'})

In [18]:
# Save the per-PLU averages to an Excel file in the working directory (overwritten on every run).
checks_items_pd.to_excel('fm_cost.xlsx', index=False)