https://jira.x5.ru/browse/CVMXC-2398

In [2]:
name = 'CVMXC-2398_danone'

In [3]:
import os
import re
from pyspark.sql import SparkSession
from typing import List, Dict, Callable
import socket

spark = None

EXECUTOR_ENV = 'hdfs:///share/products/cvm5/lib/python/anaconda_2.4.4_ds.tar.gz'  # 2.4.4 
SPARK_ARCHIVE = 'hdfs:///share/lib/spark/sparkjars-2.4.4.zip'                     # 2.4.4
#EXECUTOR_ENV = 'hdfs:///share/lib/python/env/anaconda-2019.07.tar.gz'
#SPARK_ARCHIVE = 'hdfs:///share/lib/spark/sparkjars-2.3.1.zip'

os.environ["ARROW_LIBHDFS_DIR"] = "/usr/hdp/2.6.5.0-292/usr/lib"
os.environ['HADOOP_HOME'] = '/usr/hdp/current/hadoop-client/'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64/'
os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf/'
os.environ['SPARK_HOME'] = '/opt/conda/lib/python3.7/site-packages/pyspark'
os.environ['PYSPARK_PYTHON'] = 'anaconda_2.4.4_ds.tar.gz/bin/python3'             # 2.4.4
#os.environ['PYSPARK_PYTHON'] = 'anaconda-2019.07.tar.gz/bin/python3'






def restart_spark(task_name: str, num_executors: int, executor_memory='4G', executor_cores=2,
                  driver_memory='2G', queue='cvm5-rnd', additional_params: Dict[str, str] = None):
    global spark

    if spark:
        sc = spark.sparkContext
        if sc and sc._jsc:
            if not sc._jsc.sc().isStopped():
                print('Using cached spark')
                return sc, spark

    need_ports_for_app = 3
    user_tcp_ports = _get_user_tcp_ports()
    free_ports = _get_free_ports(user_tcp_ports)
    assert len(free_ports) >= need_ports_for_app, \
        f"Not enough free ports ({len(free_ports)}), need {need_ports_for_app}, stop other apps"
    app_ports = free_ports[:need_ports_for_app]

    host_ip = os.getenv('HOST_IP')
    
    spark_session = (
        SparkSession
        .builder
        .appName(task_name)
        .master('yarn')
        .config('spark.driver.memory', driver_memory)
        .config('spark.driver.maxResultSize', driver_memory)
        .config('spark.executor.cores', executor_cores)
        .config('spark.executor.memory', executor_memory)
        .config('spark.executor.memoryOverhead', '1G')
        .config('spark.dynamicAllocation.enabled', 'true')
        .config('spark.dynamicAllocation.maxExecutors', num_executors)
        .config('spark.sql.broadcastTimeout', '36000')
        .config('spark.dynamicAllocation.cachedExecutorIdleTimeout', '1200s')
        .config('spark.ui.port', app_ports[0])
        .config('spark.blockManager.port', app_ports[1])
        .config('spark.driver.port', app_ports[2])
        .config('spark.driver.host', host_ip)
        .config('spark.driver.bindAddress', '0.0.0.0')
        .config('spark.driver.extraLibraryPath', '/usr/hdp/2.6.5.0-292/hadoop/lib/native')
        .config('spark.driver.extraJavaOptions', '-Dhdp.version=current')
        .config('spark.debug.maxToStringFields', '50')
        .config('spark.yarn.queue', queue)
        .config('spark.yarn.dist.archives', EXECUTOR_ENV)
        .config('spark.yarn.archive', SPARK_ARCHIVE)
        .config('spark.yarn.am.extraJavaOptions', '-Dhdp.version=current')
        .config('spark.rpc.message.maxSize', '1024')
        .config('spark.sql.warehouse.dir', '/apps/hive/warehouse')
        .config('spark.sql.execution.pandas.respectSessionTimeZone', 'false')
        .config('spark.sql.orc.filterPushdown', 'true')
        .config('spark.sql.hive.convertMetastoreOrc', 'true')
        .config('spark.shuffle.service.enabled', 'true')
        .config('spark.hadoop.yarn.timeline-service.enabled', 'false')
        .config('spark.hadoop.yarn.client.failover-proxy-provider',
                'org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider')
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
        .config('spark.kryoserializer.buffer.max', '128m')
        .config('spark.executor.extraLibraryPath', '/usr/hdp/2.6.5.0-292/hadoop/lib/native')
    )

    if additional_params:
        for key, value in additional_params.items():
            spark_session = spark_session.config(key, value)

    spark = (
        spark_session
        .enableHiveSupport()
        .getOrCreate()
    )
    sc = spark.sparkContext

    return sc, spark


def _get_user_tcp_ports() -> List[str]:
    regexp = re.compile(r'-2e')
    envuser= os.getenv('HOSTNAME')
    if regexp.search(envuser):       
      _, user_name, user_surname = envuser.upper().split('-')
      user_full_name = '_'.join([user_name, user_surname])
    else:
      _, user_name  = envuser.upper().split('-') 
      user_full_name = user_name
    user_tcp_ports = [v for k, v in os.environ.items() if user_full_name in k and k.endswith('TCP_PORT')]
    return user_tcp_ports


def _get_free_ports(ports: List[str]):
    free_ports = []
    for port in ports:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('0.0.0.0', int(port))) != 0:
                free_ports.append(port)
    return free_ports


In [4]:
sc, spark = restart_spark(
                    name, 
                    21, 
                    executor_memory='5G', 
                    executor_cores=3, 
                    driver_memory='7G', 
                    additional_params={"spark.sql.shuffle.partitions": "300"})

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/28 13:07:44 WARN cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!


In [5]:
sc.setLogLevel('ERROR')

In [6]:
import pandas as pd
import sys
from pyspark.sql import functions as F
import datetime
from datetime import timedelta

sys.path.append('/home/jovyan/glow-byte-filters-pyspark')
from logic_filters import * 
from segmentation import *

In [7]:
LOYALTY_CARDS = "hive_ssa_tc5.loyalty_card"
LOYALTY_CARDHOLDERS = "hive_ssa_tc5.loyalty_cardholder"
ACCOUNTS = "hive_ssa_tc5.account"
CVM5_GUESTS = "hive_cvm_acrm.cvm5_guests"

DIM_STORE = "hive_ssa_main.dim_store"
CHECKS_HEADERS = "hive_ssa_main.fct_rtl_txn"
CHECKS_ITEMS = "hive_ssa_main.fct_rtl_txn_item"
PRODUCTS = "hive_ssa_tc5.cvm_product"

### Выбираем гостей нужного юзкейса

In [7]:
usecase = ['churn', 'frequency', 'cross', 'upgrade', 'ump']
dt = datetime.date(2021, 12, 22)

In [8]:
customers_usecase = (spark
                     .table(CVM5_GUESTS)
                     .filter(F.to_date('calculation_dt') == dt)
                     .filter(F.col('usecase').isin(usecase))
                     .select('account_no', 'customer_rk')
                            )

In [9]:
customers_usecase.count()

                                                                                

49418569

### Проверяем на доступность отобранных гостей в определенную дату

In [10]:
check_date = '2022-02-14'

In [11]:
seg_sms1 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[0], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:03


In [12]:
seg_sms2 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[1], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:03


In [13]:
seg_sms3 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[2], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:02


In [14]:
seg_sms4 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[3], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:10


In [15]:
seg_sms5 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[4], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:03


In [16]:
seg_sms = seg_sms1.union(seg_sms2).union(seg_sms3).union(seg_sms4).union(seg_sms5).distinct()

In [17]:
# seg_sms.write.parquet('temp2398_1', mode='overwrite')
seg_sms = spark.read.parquet('temp2398_1')

In [18]:
seg_sms.count() #24 145 153

                                                                                

25145153

# seg1

GEO - Msk, UG

In [19]:
macroregion_dk1 = ['MRDCT', 'MRDUG']

In [20]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('macroregion_dk').isin(macroregion_dk1))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

5988

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [21]:
start_date1 = datetime.date(2021, 8, 16)
end_date1 = datetime.date(2021, 10, 15)
start_date2 = datetime.date(2021, 10, 16)
end_date2 = datetime.date(2021, 12, 15)

In [22]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date2, end_date2))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [23]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [24]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [25]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [27]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date2, end_date2))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [28]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [29]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [30]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [31]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [32]:
# те, кто покупал во 2 период даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [33]:
seg1.count()

                                                                                ]]

401940

In [34]:
# те, кто не покупал во 2 период даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [35]:
seg2.count()

                                                                                ]

24743213

In [36]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date1, end_date1))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [37]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [38]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [39]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [40]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date1, end_date1))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [41]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [42]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [43]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [44]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [53]:
# те, кто покупал в 1 период даниссимо
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [54]:
# те, кто не покупал во 2 период даниссимо, хотя раньше покупали
seg = seg3.join(seg2, on='customer_rk', how='inner')

In [55]:
seg3.count()

                                                                                ]]

361237

In [56]:
seg.count()

                                                                                ]]]]

215898

In [57]:
seg.write.parquet('temp2398_seg1', mode='overwrite')
seg = spark.read.parquet('temp2398_seg1')

                                                                                ]]]

In [58]:
seg.count()

                                                                                

215898

In [59]:
seg_pd1 = seg.toPandas()

                                                                                

In [60]:
len(seg_pd1)

215898

### МСК + СПБ перестали покупать даниссимо

In [182]:
federal_subject = ['77', '78']

In [183]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

3606

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [184]:
start_date1 = datetime.date(2021, 8, 16)
end_date1 = datetime.date(2021, 10, 15)
start_date2 = datetime.date(2021, 10, 16)
end_date2 = datetime.date(2021, 12, 15)

In [185]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date2, end_date2))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [186]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [187]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [188]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [189]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date2, end_date2))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [190]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [191]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [192]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [193]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [194]:
# те, кто покупал во 2 период даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [195]:
# те, кто не покупал во 2 период даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [196]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date1, end_date1))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [197]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [198]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [199]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [200]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date1, end_date1))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [201]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [202]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [203]:
# те, кто покупал в 1 период даниссимо
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [204]:
# те, кто не покупал во 2 период даниссимо, хотя раньше покупали
seg_msk_spb = seg3.join(seg2, on='customer_rk', how='inner')

In [205]:
seg_msk_spb.write.parquet('temp2398_msk_spb', mode='overwrite')
seg_msk_spb = spark.read.parquet('temp2398_msk_spb')

                                                                                 4535]

In [206]:
seg_msk_spb.count()

                                                                                

115639

# seg2

GEO - MSK, SPB

In [207]:
federal_subject = ['77', '78']

In [208]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

3606

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [209]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [210]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [211]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [212]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [213]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [214]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [215]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [216]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [217]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [218]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [219]:
# те, кто покупал даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [220]:
# те, кто не покупал даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [221]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [222]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [223]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [224]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [225]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [226]:
plu_codes_cat = [3473939, 3473940, 3366852, 3366853, 4162346, 4050799, 3473938, 3697858]

In [227]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [228]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [229]:
# те, кто покупал савушкина
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [230]:
# те, кто не покупал даниссимо, а покупал савушкина
seg_sav = seg3.join(seg2, on='customer_rk', how='inner')

In [231]:
seg_2 = seg_msk_spb.union(seg_sav).distinct()

In [232]:
seg_2.write.parquet('temp2398_seg2', mode='overwrite')
seg_2 = spark.read.parquet('temp2398_seg2')

                                                                                4730]]

In [233]:
seg_2.count()

                                                                                

152119

In [234]:
seg_pd2 = seg_2.toPandas()

                                                                                

In [235]:
len(seg_pd2)

152119

# seg3

GEO - MSK, SPB

In [236]:
federal_subject = ['77', '78']

In [237]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

3606

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [238]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [239]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [240]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [241]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [242]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [243]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [244]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [245]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [246]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [247]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [248]:
# те, кто покупал даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [249]:
# те, кто не покупал даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [250]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [251]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [252]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [253]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [254]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [255]:
plu_codes_cat = [3448195, 3358602, 3642557, 3447080, 4102143,3203086,3306122,3686100,3933629,3365809,3642555,
          3203094,3358593,3203083,3203104,3357178,3365804, 3933633, 4037144, 3501809, 3365806, 3448192, 
          3199753, 3203097, 3644387, 3365807, 3642556, 3203090, 3203100, 3501810]

In [256]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [257]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [258]:
# те, кто покупал чудо
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [259]:
# те, кто не покупал даниссимо, а покупал чудо
seg_chudo = seg3.join(seg2, on='customer_rk', how='inner')

In [260]:
seg_3 = seg_msk_spb.union(seg_chudo).distinct()

In [261]:
seg_3.write.parquet('temp2398_seg3', mode='overwrite')
seg_3 = spark.read.parquet('temp2398_seg3')

                                                                                4730]]

In [262]:
seg_3.count()

                                                                                

180028

In [263]:
seg_pd3 = seg_3.toPandas()

                                                                                

In [264]:
len(seg_pd3)

180028

# seg4

GEO - MSK, SPB

In [265]:
federal_subject = ['77', '78']

In [266]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

3606

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [267]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [268]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [269]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [270]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [271]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [272]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [273]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [274]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [275]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [276]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [277]:
# те, кто покупал даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [278]:
# те, кто не покупал даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [279]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [280]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [281]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [282]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [283]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [284]:
plu_codes_cat = [3473939, 3473940, 3366852, 3366853, 4162346, 4050799, 3473938, 3697858]

In [285]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [286]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [287]:
# те, кто покупал савушкина
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [288]:
# те, кто не покупал даниссимо, а покупал савушкина
seg_4 = seg3.join(seg2, on='customer_rk', how='inner')

In [289]:
seg_4.write.parquet('temp2398_seg4', mode='overwrite')
seg_4 = spark.read.parquet('temp2398_seg4')

                                                                                730]]

In [290]:
seg_4.count()

                                                                                

40503

In [291]:
seg_pd4 = seg_4.toPandas()

                                                                                

In [292]:
len(seg_pd4)

40503

# seg5

GEO - MSK, SPB

In [293]:
federal_subject = ['77', '78']

In [294]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

3606

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [295]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [296]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [297]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [298]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [299]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [300]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [301]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [302]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [303]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [304]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [305]:
# те, кто покупал даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [306]:
# те, кто не покупал даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [307]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [308]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [309]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [310]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [311]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [312]:
plu_codes_cat = [3448195, 3358602, 3642557, 3447080, 4102143,3203086,3306122,3686100,3933629,3365809,3642555,
          3203094,3358593,3203083,3203104,3357178,3365804, 3933633, 4037144, 3501809, 3365806, 3448192, 
          3199753, 3203097, 3644387, 3365807, 3642556, 3203090, 3203100, 3501810]

In [313]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [314]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [315]:
# те, кто покупал чудо
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [316]:
# те, кто не покупал даниссимо, а покупал чудо
seg_5 = seg3.join(seg2, on='customer_rk', how='inner')

In [317]:
seg_5.write.parquet('temp2398_seg5', mode='overwrite')
seg_5 = spark.read.parquet('temp2398_seg5')

                                                                                730]]

In [318]:
seg_5.count()

                                                                                

72909

In [319]:
seg_pd5 = seg_5.toPandas()

                                                                                

In [320]:
len(seg_pd5)

72909

# seg6

GEO - MSK, SPB

In [19]:
federal_subject = ['77']

In [20]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

2768

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [21]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [22]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [23]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [24]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [25]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [26]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [27]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [28]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [38]:
plu_codes_cat = [97452, 2093082]

In [39]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [40]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [41]:
# те, кто покупал ПРО
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [42]:
# те, кто не покупал ПРО
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [43]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [44]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [45]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [46]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [47]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [48]:
plu_codes_cat = [3467794,3608571,78743379006,3329976,3331187,4020,3164213,38955,3196224,3484404,75848,2062509,2144365,
3196225,2069789,3441820,3478969,3369645,3954,3444447,3391712,3444371,3467793,3196226,2135099,3331188,
3329975,3484403,3441821,75845,3408240,3354712,3004333,18997,3467795]


In [49]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [50]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [51]:
# те, кто покупал домик в деревне
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [53]:
# те, кто не покупал ПРО, а покупал домик
seg_6 = seg3.join(seg2, on='customer_rk', how='inner')

In [54]:
seg_6.write.parquet('temp2398_seg6', mode='overwrite')
seg_6 = spark.read.parquet('temp2398_seg6')

                                                                                ]]

In [55]:
seg_6.count()

                                                                                

85524

In [56]:
seg_pd6 = seg_6.toPandas()

                                                                                

In [57]:
len(seg_pd6)

85524

# seg7

GEO - Msk

In [321]:
federal_subject = ['77']

In [322]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('federal_subject_dk').isin(federal_subject))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

2768

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [323]:
start_date1 = datetime.date(2021, 8, 16)
end_date1 = datetime.date(2021, 10, 15)
start_date2 = datetime.date(2021, 10, 16)
end_date2 = datetime.date(2021, 12, 15)

In [324]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date2, end_date2))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [325]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [326]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [327]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [328]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date2, end_date2))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [329]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [330]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [331]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [332]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [333]:
# те, кто покупал во 2 период даниссимо
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [334]:
# те, кто не покупал во 2 период даниссимо
seg2 = seg_sms.join(seg1, on='customer_rk', how='left_anti')

In [335]:
checks_headers2 = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date1, end_date1))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [336]:
checks_headers_tc5_2 = checks_headers2.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [337]:
loyalty_cards2 = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders2 = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info2 = loyalty_cards2.join(loyalty_cardholders2, on='account_no', how='inner')
clients_info2 = clients_info2.join(seg_sms, on='customer_rk', how='inner')

In [338]:
checks_headers_tc5_2 = checks_headers_tc5_2.join(clients_info2, on='loyalty_card_no') #оставили чеки только нужных гостей

In [339]:
checks_items2 = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date1, end_date1))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [340]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
# syntethic_category_id = [51,26,52]
plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [341]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
                 .filter(F.col('plu_brand_code').isin(plu_brand_code))
#                  .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [342]:
checks_items2 = checks_items2.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [343]:
checks_tc5_2 = checks_items2.join(checks_headers_tc5_2, 'rtl_txn_id', how='inner')

In [344]:
# те, кто покупал в 1 период даниссимо
seg3 = (checks_tc5_2.select('customer_rk')
                  .distinct()
      )

In [345]:
# те, кто не покупал во 2 период даниссимо, хотя раньше покупали
seg_7 = seg3.join(seg2, on='customer_rk', how='inner')

In [346]:
seg_7.write.parquet('temp2398_seg7', mode='overwrite')
seg_7 = spark.read.parquet('temp2398_seg7')

                                                                                4730]]

In [347]:
seg_7.count()

                                                                                

71113

In [349]:
seg_pd7 = seg_7.toPandas()

                                                                                

In [350]:
len(seg_pd7)

71113

### Считаем сколько человек в каждом регионе - Москва, Санкт-Петербург, ЮФО

In [12]:
favorite_store = spark.sql('''
with last_purch_tab as (
select l.loyalty_cardholder_acrm_id as customer_rk, max(checks.rtl_txn_start_dttm) as last_purch
from hive_ssa_main.fct_rtl_txn checks
join (select ch.loyalty_cardholder_acrm_id, c.loyalty_card_id from hive_ssa_tc5.loyalty_card c left join hive_ssa_tc5.loyalty_cardholder ch
on c.loyalty_account_id = ch.loyalty_account_id) l ON checks.loyalty_card_no = l.loyalty_card_id
where checks.financial_unit_format_dk ='D' and checks.rtl_txn_cancel_flg = 0
group by l.loyalty_cardholder_acrm_id
),
count_purch_tab as (
select l.loyalty_cardholder_acrm_id as customer_rk, checks.store_id, count(checks.rtl_txn_id) as count_purch,
max(checks.rtl_txn_start_dttm) as last_purch_dt
from hive_ssa_main.fct_rtl_txn checks
join (select ch.loyalty_cardholder_acrm_id, c.loyalty_card_id from hive_ssa_tc5.loyalty_card c
    left join hive_ssa_tc5.loyalty_cardholder ch
    on c.loyalty_account_id = ch.loyalty_account_id) l ON checks.loyalty_card_no = l.loyalty_card_id
join last_purch_tab lpt on lpt.customer_rk = l.loyalty_cardholder_acrm_id
where checks.financial_unit_format_dk ='D' and checks.rtl_txn_cancel_flg = 0 and
datediff(lpt.last_purch, checks.rtl_txn_dt) between 0 and 30
group by l.loyalty_cardholder_acrm_id, checks.store_id
),
max_purch_tab as (select customer_rk, max(count_purch) as max_count_purch, max(last_purch_dt) as last_purch
from count_purch_tab
group by customer_rk
)
select mpt.customer_rk, cpt.store_id
from max_purch_tab mpt
join count_purch_tab cpt on mpt.customer_rk = cpt.customer_rk and mpt.max_count_purch = cpt.count_purch
and mpt.last_purch = cpt.last_purch_dt
''')

In [13]:
active_users = seg_sms.join(favorite_store, 'customer_rk', 'inner')

In [14]:
stores_spb = (
    spark.table(DIM_STORE)
         .filter(F.col('federal_subject_dk').isin(['78']))
         .withColumn('federal_subject', F.lit('spb'))
         .select('store_id', 'federal_subject')
)
stores_msk = (
    spark.table(DIM_STORE)
         .filter(F.col('federal_subject_dk').isin(['77']))
         .withColumn('federal_subject', F.lit('msk'))
         .select('store_id', 'federal_subject')
)
stores_ug = (
    spark.table(DIM_STORE)
         .filter(F.col('macroregion_dk').isin(['MRDUG']))
         .withColumn('federal_subject', F.lit('ug'))
         .select('store_id', 'federal_subject')
)

In [15]:
stores = stores_spb.union(stores_msk).union(stores_ug)

In [16]:
count_people = (
      active_users.join(stores, 'store_id', 'inner')
                  .groupby('federal_subject').agg(F.countDistinct('customer_rk').alias('qty_customers'))
)

In [17]:
count_people_pd = count_people.toPandas()

                                                                                4454]]4]]]

In [None]:
count_people_pd.to_csv('count_people.csv', index=False)

In [180]:
count_people_pd = pd.read_csv('count_people.csv')

In [371]:
count_people_pd

Unnamed: 0,federal_subject,qty_customers
0,ug,2847095
1,msk,1810295
2,spb,1014377


In [365]:
print('1 сегмент: {}, 2 сегмент: {}, 3 сегмент: {}, 4 сегмент: {}, 5 сегмент: {}, 7 сегмент: {}'.format(
    round(len(seg_pd1) / (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'ug'].iloc[0, 1]) * 100, 3),
round(len(seg_pd2) / (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]) * 100, 3),
round(len(seg_pd3) / (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]) * 100, 3),
round(len(seg_pd4) / (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]) * 100, 3),
round(len(seg_pd5) / (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]) * 100, 3),
round(len(seg_pd7) / count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] * 100, 3)))

1 сегмент: 4.636, 2 сегмент: 5.385, 3 сегмент: 6.373, 4 сегмент: 1.434, 5 сегмент: 2.581, 7 сегмент: 3.928


In [369]:
print('1 сегмент: {}-{}, 2 сегмент: {}-{}, 3 сегмент: {}-{}, 4 сегмент: {}-{}, 5 сегмент: {}-{}, 7 сегмент: {}-{}'.format(
    len(seg_pd1), (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'ug'].iloc[0, 1]),
len(seg_pd2), (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]),
len(seg_pd3), (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]),
len(seg_pd4), (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]),
len(seg_pd5), (count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1] + 
             count_people_pd[count_people_pd['federal_subject'] == 'spb'].iloc[0, 1]),
len(seg_pd7), count_people_pd[count_people_pd['federal_subject'] == 'msk'].iloc[0, 1]))

1 сегмент: 215898-4657390, 2 сегмент: 152119-2824672, 3 сегмент: 180028-2824672, 4 сегмент: 40503-2824672, 5 сегмент: 72909-2824672, 7 сегмент: 71113-1810295


#### Расчет общего покрытия гостей кисломолочки, молочки и йогуртов.

In [7]:
usecase = ['cross', 'upgrade', 'ump']
dt = datetime.date(2021, 12, 22)

In [8]:
customers_usecase = (spark
                     .table(CVM5_GUESTS)
                     .filter(F.to_date('calculation_dt') == dt)
                     .filter(F.col('usecase').isin(usecase))
                     .select('account_no', 'customer_rk')
                            )

In [9]:
customers_usecase.count()

                                                                                

16005567

### Проверяем на доступность отобранных гостей в определенную дату

In [10]:
check_date = '2022-02-14'

In [11]:
seg_sms1 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[0], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:02


In [12]:
seg_sms2 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[1], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:02


In [13]:
seg_sms3 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[2], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:02


In [14]:
seg_sms = seg_sms1.union(seg_sms2).union(seg_sms3).distinct()

21/12/27 11:03:16 ERROR cluster.YarnScheduler: Lost executor 4 on mn-hd0018.x5.ru: Unable to create executor due to Unable to register with external shuffle server due to : java.util.concurrent.TimeoutException: Timeout waiting for task.


In [15]:
seg_sms.write.parquet('temp2398_rf', mode='overwrite')
seg_sms = spark.read.parquet('temp2398_rf')

[Stage 20:=>(28 + 3) / 31][Stage 32:(273 + 26) / 1441][Stage 34:(85 + 1) / 1441]21/12/27 11:09:16 ERROR cluster.YarnScheduler: Lost executor 14 on mn-hd0299.x5.ru: Unable to create executor due to Unable to register with external shuffle server due to : java.util.concurrent.TimeoutException: Timeout waiting for task.
[Stage 32:(1439 + 2) / 1441][Stage 34:(633 + 37) / 1441][Stage 36:(224 + 0) / 1441]21/12/27 11:10:42 ERROR cluster.YarnScheduler: Lost executor 25 on mn-hd0274.x5.ru: Unable to create executor due to Unable to register with external shuffle server due to : java.util.concurrent.TimeoutException: Timeout waiting for task.
21/12/27 11:13:10 ERROR client.TransportClient: Failed to send RPC RPC 6061844798741078407 to /192.168.234.53:41250: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(...)(Unknown Source)
21/12/27 11:13:10 ERROR client.TransportClient: Failed to send RPC RPC 7288900611

In [16]:
seg_sms.count()

                                                                                

7093877

# seg1

GEO - RF

In [17]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
#             .filter(F.col('macroregion_dk').isin(macroregion_dk1))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

26787

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [18]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [19]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [20]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [21]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [22]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [23]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [24]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
syntethic_category_id = [52,79,26,28,51]
# plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [25]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
#                  .filter(F.col('plu_brand_code').isin(plu_brand_code))
                 .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [26]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [27]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [28]:
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [29]:
seg1.write.parquet('temp2398_seg_kils', mode='overwrite')
seg1 = spark.read.parquet('temp2398_seg_kils')

                                                                                ]

In [30]:
seg1.count()

                                                                                

6552276

In [None]:
seg_pd1 = seg1.toPandas()

In [None]:
len(seg_pd1)

#### Расчет общего покрытия гостей йогуртов.

In [8]:
usecase = ['cross', 'upgrade', 'ump']
dt = datetime.date(2021, 12, 22)

In [9]:
customers_usecase = (spark
                     .table(CVM5_GUESTS)
                     .filter(F.to_date('calculation_dt') == dt)
                     .filter(F.col('usecase').isin(usecase))
                     .select('account_no', 'customer_rk')
                            )

In [10]:
customers_usecase.count()

                                                                                

16005567

### Проверяем на доступность отобранных гостей в определенную дату

In [11]:
check_date = '2022-02-14'

In [21]:
seg_sms1 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[0], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:13


In [22]:
seg_sms2 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[1], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:39


In [23]:
seg_sms3 = (sms_channel_filters_glowbyte(spark=spark,
                                         guests=customers_usecase, 
                                         usecase_name=usecase[2], 
                                         check_date=check_date, 
                                         debug_mode=False)
                                    .select('customer_rk')
                                    .distinct()
           )

[93m Время выполнения: 0:00:10


In [24]:
seg_sms = seg_sms1.union(seg_sms2).union(seg_sms3).distinct()

In [25]:
seg_sms.write.parquet('temp2398_rf', mode='overwrite')
seg_sms = spark.read.parquet('temp2398_rf')

                                                                                1]]

In [26]:
seg_sms.count()

                                                                                

7614473

# seg1

GEO - MSK, SPB LO

In [27]:
macroregion_dk = ['MRDCT', 'MRSNZ']

In [28]:
plants = (spark.table(DIM_STORE)
            .filter(F.col('valid_to_dttm')== datetime.datetime(5999, 1, 1, 0, 0))
            .filter(F.col('macroregion_dk').isin(macroregion_dk))
            .selectExpr('store_id as plant')
            .distinct()
            .toPandas()['plant']
            .tolist()
            )

len(plants)

                                                                                

4449

### Собираем чеки этих гостей и оставляем тех, кто покупал продукты с synthetic_catalog_id

In [29]:
start_date = datetime.date(2021, 10, 16)
end_date = datetime.date(2021, 12, 15)

In [30]:
checks_headers = (spark.table(CHECKS_HEADERS)
                      .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                      .filter((F.col('loyalty_card_no') != '') & (F.col('loyalty_card_no').isNotNull()))
                      .filter(F.col('financial_unit_format_dk') == 'D')
                      .filter(F.col('rtl_txn_cancel_flg') == 0)
                      .select('rtl_txn_id', 'loyalty_card_no', 'store_id')
                     )


In [31]:
checks_headers_tc5 = checks_headers.filter(F.col('store_id').isin(plants)) #оставили чеки только с нужными магазинами

In [32]:
loyalty_cards = (spark
                    .table(LOYALTY_CARDS)
                    .withColumnRenamed("loyalty_card_id", "loyalty_card_no")
                    .withColumnRenamed("loyalty_account_id", "account_no")
                    .withColumnRenamed("loyalty_account_acrm_id", "account_rk")
                    .select('account_no', 'loyalty_card_no')
                )
loyalty_cardholders = (spark
                        .table(LOYALTY_CARDHOLDERS)
                        .withColumnRenamed("loyalty_cardholder_acrm_id", "customer_rk")
                        .withColumnRenamed("loyalty_account_id", "account_no")
                        .select('account_no', 'customer_rk')
                      )
clients_info = loyalty_cards.join(loyalty_cardholders, on='account_no', how='inner')
clients_info = clients_info.join(seg_sms, on='customer_rk', how='inner')

In [33]:
checks_headers_tc5 = checks_headers_tc5.join(clients_info, on='loyalty_card_no') #оставили чеки только нужных гостей

In [34]:
checks_items = (spark.table(CHECKS_ITEMS) 
                    .withColumnRenamed('plu_id', 'plu_code')
                    .withColumnRenamed('turnover_no_vat_amt', 'zsalnovat')
                    .withColumnRenamed('turnover_vat_rub_amt', 'zsale_vat')
                    .withColumnRenamed('prime_cost_no_vat_amt', 'zcst_n')
                    .withColumnRenamed('turnover_base_uom_amt', 'base_qty')
                    .withColumnRenamed('discount_amt', 'zdiscount')
                    .withColumnRenamed('fact_regular_promo_flg', 'zpromofl')
                    .filter(F.col('rtl_txn_dt').between(start_date, end_date))
                    .filter((F.col('zsalnovat') >= 0) & (F.col('base_qty') >= 0) & (F.col('zcst_n') > 0)) #keep only correct data
                        .select('plu_code' #items id
                                , 'rtl_txn_id' #cheques id
                               )
                   )


In [35]:
# plu_hierarchy_lvl_4_dk = ['FR0606003', 'FR0606009']
syntethic_category_id = [51]
# plu_brand_code = ['5725', 'M082']
# plu_not_in = [4138521]

In [36]:
plu_codes_cat = (spark
                 .table(PRODUCTS)
#                  .filter(F.col('plu_hierarchy_lvl_4_dk').isin(plu_hierarchy_lvl_4_dk))
#                  .filter(F.col('plu_brand_code').isin(plu_brand_code))
                 .filter(F.col('syntethic_category_id').isin(syntethic_category_id))
#                  .filter(~F.col('plu_id').isin(plu_not_in))
                 .select('plu_id')
                 .distinct()
                 .toPandas()['plu_id']
                 .tolist()
                )

                                                                                

In [37]:
checks_items = checks_items.filter(F.col('plu_code').isin(plu_codes_cat)) # только чеки с нужными plu

In [38]:
checks_tc5 = checks_items.join(checks_headers_tc5, 'rtl_txn_id', how='inner')

In [39]:
seg1 = (checks_tc5.select('customer_rk')
                  .distinct()
      )

In [40]:
seg1.write.parquet('temp2398_seg_yog', mode='overwrite')
seg1 = spark.read.parquet('temp2398_seg_yog')

                                                                                ]

In [41]:
seg1.count()

                                                                                

833530

21/12/29 02:54:48 ERROR cluster.YarnClientSchedulerBackend: YARN application has exited unexpectedly with state FAILED! Check the YARN application logs for more details.
21/12/29 02:54:48 ERROR cluster.YarnClientSchedulerBackend: Diagnostics message: Attempt recovered after RM restartDue to executor failures all available nodes are blacklisted
21/12/29 02:54:48 ERROR client.TransportClient: Failed to send RPC RPC 6452304191320500991 to /192.168.234.72:38580: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(...)(Unknown Source)
21/12/29 02:54:48 ERROR cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Sending RequestExecutors(0,0,Map(),Set()) to AM was unsuccessful
java.io.IOException: Failed to send RPC RPC 6452304191320500991 to /192.168.234.72:38580: java.nio.channels.ClosedChannelException
	at org.apache.spark.network.client.TransportClient$RpcChannelListener.handleFailure(TransportClient.jav