https://jira.x5.ru/browse/CVMUPG-1050

In [1]:
name = 'CVMUPG-1050_tabak'

In [2]:
import sys
import pandas as pd
import pyspark.sql.functions as F
import datetime
from datetime import timedelta
sys.path.append('/home/jovyan/glow-byte-filters-pyspark')
from logic_filters import * 
from segmentation import *
from spark import *

%load_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = 500

In [3]:
sc, spark = restart_spark(name, 100, executor_memory="3G", executor_cores=5, driver_memory="12G",
                          additional_params={"spark.sql.shuffle.partitions": "200"})
sc.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
LOYALTY_CARDS = "hive_ssa_tc5.loyalty_card"
LOYALTY_CARDHOLDERS = "hive_ssa_tc5.loyalty_cardholder"
ACCOUNTS = "hive_ssa_tc5.account"
CVM5_GUESTS = "hive_cvm_acrm.cvm5_guests"

DIM_STORE = "hive_ssa_main.dim_store"
CHECKS_HEADERS = "hive_ssa_main.fct_rtl_txn"
CHECKS_ITEMS = "hive_ssa_main.fct_rtl_txn_item"
PRODUCTS = "hive_ssa_tc5.cvm_product"

### Выбираем гостей нужного юзкейса

In [5]:
usecase = ['churn', 'frequency', 'cross', 'upgrade', 'ump']
dt = datetime.date.today()

In [6]:
customers_usecase = (spark
                     .table(CVM5_GUESTS)
                     .filter(F.to_date('calculation_dt') == dt)
                     .filter(F.col('usecase').isin(usecase))
                     .select('account_no', 'customer_rk')
                            )

In [None]:
customers_usecase.count() # 49 523 503

### Для определения сохранения и частоты

In [7]:
usecase1 = ['churn']
usecase2 = ['frequency']

In [8]:
customers_usecase_churn = (spark
                     .table(CVM5_GUESTS)
                     .filter(F.to_date('calculation_dt') == dt)
                     .filter(F.col('usecase').isin(usecase1))
                     .select('customer_rk')
                     .distinct()
                            )

In [9]:
customers_usecase_frequency = (spark
                     .table(CVM5_GUESTS)
                     .filter(F.to_date('calculation_dt') == dt)
                     .filter(F.col('usecase').isin(usecase2))
                     .select('customer_rk')
                     .distinct()
                            )

In [None]:
customers_usecase_churn.count() # 16 031 893

In [None]:
customers_usecase_frequency.count() # 17 518 878

### Проверяем на доступность отобранных гостей в определенную дату

In [None]:
check_date = '2022-03-01'

In [None]:
seg_sms1 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[0], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

In [None]:
seg_sms2 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[1], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

In [None]:
seg_sms3 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[2], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

In [None]:
seg_sms4 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[3], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

In [None]:
seg_sms5 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[4], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

In [None]:
seg_sms = seg_sms1.union(seg_sms2).union(seg_sms3).union(seg_sms4).union(seg_sms5).distinct()

In [10]:
# seg_sms.write.parquet('temp1050_1', mode='overwrite')
seg_sms = spark.read.parquet('temp1050_1')

                                                                                

In [None]:
seg_sms.count() # 27 245 812

### Исследование plu

In [None]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2022, 1, 15)

In [None]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no')
                     )


In [None]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [None]:
checks_headers_tc5 = checks_headers.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [None]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                                , 'base_qty'
                                , 'zsale_vat' #cost
                               )
                   )


In [None]:
plu_hierarchy_lvl_4_dk = ['FD1001001']
# syntethic_category_id = [51,26,52]
# plu_brand_code = ['0989', 'N381', '3422']
# plu_not_in = [4138521]

In [None]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
                 .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
#                  .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

In [None]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [None]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [None]:
items_costs = checks_tc5.groupby('plu_code').agg(F.avg('zsale_vat').alias('cost'), F.sum('base_qty').alias('qty'))

In [None]:
buyed_plu = (checks_tc5.select('plu_code').distinct())

In [None]:
information_plu = (spark.table(PRODUCTS)
                    .withColumnRenamed('plu_id', 'plu_code')
                    .join(buyed_plu, 'plu_code', 'inner')
                    .join(items_costs, 'plu_code', 'inner').toPandas()
                  )

In [None]:
information_plu.to_csv('buyed_plu.csv', index=False)

In [None]:
information_plu.to_excel('buyed_plu.xlsx', index=False)

In [52]:
information_plu = pd.read_csv('buyed_plu.csv')

In [54]:
plu_codes1 = information_plu[information_plu['cost'] < 140]['plu_code'].tolist()

In [55]:
plu_codes2 = information_plu[information_plu['cost'] >= 140]['plu_code'].tolist()

## Группы

In [66]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2022, 1, 15)

### 1 аудитория (< 140 рублей)

In [67]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no')
                     )


In [68]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .filter(F.col('loyalty_cardholder_age_yrs') >= 18)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [69]:
checks_headers_tc5 = checks_headers.join(clients_info, on='loyalty_card_no') #оставили чеки только доступных гостей

In [70]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                                , 'base_qty'
                               )
                   )


In [71]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes1)) # только чеки с нужными plu

In [72]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [73]:
accs = ['account_no']
pdf = checks_tc5.groupby(accs).agg(F.sum('base_qty').alias('base_qty_per_period'))
pdf = pdf.filter(F.col('base_qty_per_period') >= 3)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [74]:
seg1 = (checks_tc5.withColumn('group', F.lit('1'))
                  .select('customer_rk', 'group')
                  .distinct()
      )

### 2 аудитория (>= 140 рублей)

In [75]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no')
                     )


In [76]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .filter(F.col('loyalty_cardholder_age_yrs') >= 18)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [77]:
checks_headers_tc52 = checks_headers2.join(clients_info2, on='loyalty_card_no') #оставили чеки только доступных гостей

In [78]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                                , 'base_qty'
                               )
                   )


In [79]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes2)) # только чеки с нужными plu

In [80]:
checks_tc52 = checks_items2.join(checks_headers_tc52, 'rtl_txn_id', how='inner')

In [81]:
pdf2 = checks_tc52.groupby(accs).agg(F.sum('base_qty').alias('base_qty_per_period'))
pdf2 = pdf2.filter(F.col('base_qty_per_period') >= 3)
checks_tc52 = checks_tc52.join(pdf, on=accs)

In [82]:
seg2 = (checks_tc52.withColumn('group', F.lit('2'))
                  .select('customer_rk', 'group')
                  .distinct()
      )

In [83]:
seg = seg1.union(seg2)

In [84]:
seg.write.parquet('temp1050_seg1', mode='overwrite')
seg = spark.read.parquet('temp1050_seg1')

                                                                                48]]]]]

In [85]:
seg_pd = seg.toPandas()

                                                                                

In [86]:
print(len(seg_pd[seg_pd['group'] == '1']))
print(len(seg_pd[seg_pd['group'] == '2']))
print(len(seg_pd))

1649407
632845
2282252


### 3 аудитория (стики Heets или Fiit)

In [33]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no')
                     )


In [34]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .filter(F.col('loyalty_cardholder_age_yrs') >= 18)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [35]:
checks_headers_tc5 = checks_headers.join(clients_info, on='loyalty_card_no') #оставили чеки только доступных гостей

In [36]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                                , 'base_qty'
                               )
                   )


In [37]:
plu_codes = [3937807,3938504,4077685,3937810,4002855,3662978,3981554,3981553,3981552,3981559,3937806,4043654,4043653,
3981557,3937808,4144430,3937805,4043650,3981550,3663201,4077680,3981560,4126042,3665683,3981551,4002857,4063395,4043651,
3662977,4126044,4077683,3981556,3937811,3938503,3981558]

In [38]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes)) # только чеки с нужными plu

In [39]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [40]:
accs = ['account_no']
pdf = checks_tc5.groupby(accs).agg(F.sum('base_qty').alias('base_qty_per_period'))
pdf = pdf.filter(F.col('base_qty_per_period') > 2)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [41]:
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [42]:
seg1.write.parquet('temp1050_seg3', mode='overwrite')
seg1 = spark.read.parquet('temp1050_seg3')

                                                                                48]]]]]

In [43]:
seg1_pd = seg1.toPandas()

                                                                                

### 4 аудитория (стики Heets или Fiit и сигареты)

In [47]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no')
                     )


In [48]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .filter(F.col('loyalty_cardholder_age_yrs') >= 18)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [49]:
checks_headers_tc5 = checks_headers.join(clients_info, on='loyalty_card_no') #оставили чеки только доступных гостей

In [50]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                                , 'base_qty'
                               )
                   )


In [56]:
plu_codes = plu_codes1 + plu_codes2

In [57]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes)) # только чеки с нужными plu

In [58]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [59]:
accs = ['account_no']
pdf = checks_tc5.groupby(accs).agg(F.sum('base_qty').alias('base_qty_per_period'))
pdf = pdf.filter(F.col('base_qty_per_period') > 2)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [60]:
seg2 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [61]:
seg2 = seg2.join(seg1, 'customer_rk', 'inner')

In [62]:
seg2.write.parquet('temp1050_seg4', mode='overwrite')
seg2 = spark.read.parquet('temp1050_seg4')

                                                                                ]]]]]]]

In [63]:
seg2_pd = seg2.toPandas()

                                                                                

In [65]:
len(seg2_pd)

136888

## Исключение из групп

In [140]:
set(seg_pd[seg_pd['group'] == '2']['customer_rk']) - set(seg_pd[seg_pd['group'] == '1']['customer_rk'])

set()

In [None]:
seg_pd 
seg1_pd # (seg1)
seg2_pd # (seg2)

In [102]:
seg_smokers1 = seg_pd[seg_pd['group'] == '1']['customer_rk']
seg_smokers2 = seg_pd[seg_pd['group'] == '2']['customer_rk']

In [103]:
seg_smokers1 = seg_smokers1.astype('int')

In [104]:
seg_smokers2 = seg_smokers2.astype('int')

In [108]:
seg1_smoke = pd.DataFrame(list(set(seg_smokers1) - set(seg_smokers2)), columns=['customer_rk'])

In [110]:
seg2_smoke = pd.DataFrame(list(set(seg_smokers2) - set(seg_smokers1)), columns=['customer_rk'])

In [111]:
seg1_pd = seg1_pd.astype('int')
seg2_pd = seg2_pd.astype('int')

In [116]:
seg_stiks = pd.DataFrame(list(set(seg1_pd['customer_rk']) - set(seg2_pd['customer_rk'])), columns=['customer_rk'])

In [117]:
seg_stiki_smoke = seg2_pd

### Определение юз-кейсов

In [133]:
var1 = spark.createDataFrame(seg1_smoke)
var3 = spark.createDataFrame(seg_stiks)
var4 = spark.createDataFrame(seg_stiki_smoke)

In [147]:
churn1_df = customers_usecase_churn.join(var1, 'customer_rk', 'inner')
frequency1_df = customers_usecase_frequency.join(var1, 'customer_rk', 'inner')
others1_df = var1.join(churn1_df, 'customer_rk', 'left_anti').join(frequency1_df, 'customer_rk', 'left_anti')

In [142]:
churn1 = churn1_df.toPandas()

                                                                                

In [143]:
frequency1 = frequency1_df.toPandas()

                                                                                

In [148]:
others1 = others1_df.toPandas()

                                                                                

In [149]:
others1 = others1.astype('str')

In [150]:
cross1, upgrade1, ump1 = get_usecases(others1)

In [152]:
print('churn: {}, frequency: {}, cross: {}, upgrade: {}, ump: {} (all: {})'
      .format(len(churn1), len(frequency1), len(cross1), len(upgrade1), len(ump1), 
              len(churn1)+len(frequency1)+len(cross1)+len(upgrade1)+len(ump1)))

churn: 81221, frequency: 298082, cross: 250797, upgrade: 239302, ump: 147160 (all: 1016562)


In [153]:
churn3_df = customers_usecase_churn.join(var3, 'customer_rk', 'inner')
frequency3_df = customers_usecase_frequency.join(var3, 'customer_rk', 'inner')
others3_df = var3.join(churn3_df, 'customer_rk', 'left_anti').join(frequency3_df, 'customer_rk', 'left_anti')

In [154]:
churn3 = churn3_df.toPandas()

                                                                                

In [155]:
frequency3 = frequency3_df.toPandas()

                                                                                

In [156]:
others3 = others3_df.toPandas()

                                                                                

In [157]:
others3 = others3.astype('str')

In [158]:
cross3, upgrade3, ump3 = get_usecases(others3)

In [159]:
print('churn: {}, frequency: {}, cross: {}, upgrade: {}, ump: {} (all: {})'
      .format(len(churn3), len(frequency3), len(cross3), len(upgrade3), len(ump3), 
              len(churn3)+len(frequency3)+len(cross3)+len(upgrade3)+len(ump3)))

churn: 17925, frequency: 84685, cross: 71027, upgrade: 66419, ump: 41549 (all: 281605)


In [160]:
churn4_df = customers_usecase_churn.join(var4, 'customer_rk', 'inner')
frequency4_df = customers_usecase_frequency.join(var4, 'customer_rk', 'inner')
others4_df = var4.join(churn4_df, 'customer_rk', 'left_anti').join(frequency4_df, 'customer_rk', 'left_anti')

In [161]:
churn4 = churn4_df.toPandas()

                                                                                

In [162]:
frequency4 = frequency4_df.toPandas()

                                                                                

In [163]:
others4 = others4_df.toPandas()

                                                                                

In [164]:
others4 = others4.astype('str')

In [165]:
cross4, upgrade4, ump4 = get_usecases(others4)

In [166]:
print('churn: {}, frequency: {}, cross: {}, upgrade: {}, ump: {} (all: {})'
      .format(len(churn4), len(frequency4), len(cross4), len(upgrade4), len(ump4), 
              len(churn4)+len(frequency4)+len(cross4)+len(upgrade4)+len(ump4)))

churn: 5738, frequency: 17123, cross: 44397, upgrade: 42406, ump: 27224 (all: 136888)


#### Совершеннолетние курильщики. Те, кто покупал сигареты 3 и более раз за последние 3 месяца с применением карты лояльности, доступные в вайбер и смс.

In [11]:
start_date = datetime.date(2021, 11, 3)
end_date = datetime.date(2022, 2, 2)

In [12]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no')
                     )


In [13]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .filter(F.col('loyalty_cardholder_age_yrs') >= 18)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [14]:
checks_headers_tc5 = checks_headers.join(clients_info, on='loyalty_card_no') #оставили чеки только доступных гостей

In [15]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                                , 'base_qty'
                               )
                   )


In [16]:
plu_hierarchy_lvl_4_dk = ['FD1001001']
# syntethic_category_id = [51,26,52]
# plu_brand_code = ['0989', 'N381', '3422']
# plu_not_in = [4138521]

In [17]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
                 .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
#                  .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [18]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [19]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [20]:
accs = ['account_no']
pdf = checks_tc5.groupby(accs).agg(F.sum('base_qty').alias('base_qty_per_period'))
pdf = pdf.filter(F.col('base_qty_per_period') >= 3)
checks_tc5 = checks_tc5.join(pdf, on=accs)

In [21]:
seg = (checks_tc5.select('customer_rk')
                 .distinct()
      )

In [22]:
seg.write.parquet('temp1050_seg', mode='overwrite')
seg = spark.read.parquet('temp1050_seg')

                                                                                095]]]]

In [23]:
seg.count()

                                                                                

2910989

### Определение юз-кейсов

In [24]:
churn1_df = customers_usecase_churn.join(seg, 'customer_rk', 'inner')
frequency1_df = customers_usecase_frequency.join(seg, 'customer_rk', 'inner')
others1_df = seg.join(churn1_df, 'customer_rk', 'left_anti').join(frequency1_df, 'customer_rk', 'left_anti')

In [25]:
churn1 = churn1_df.toPandas()

                                                                                

In [26]:
frequency1 = frequency1_df.toPandas()

                                                                                

In [27]:
others1 = others1_df.toPandas()

                                                                                

In [28]:
others1 = others1.astype('str')

In [29]:
cross1, upgrade1, ump1 = get_usecases(others1)

In [30]:
print('churn: {}, frequency: {}, cross: {}, upgrade: {}, ump: {} (all: {})'
      .format(len(churn1), len(frequency1), len(cross1), len(upgrade1), len(ump1), 
              len(churn1)+len(frequency1)+len(cross1)+len(upgrade1)+len(ump1)))

churn: 220617, frequency: 728059, cross: 768790, upgrade: 738249, ump: 455274 (all: 2910989)
