In [1]:
# Application name for this notebook; passed to restart_spark() as the Spark task name.
name = 'tests2'

In [2]:
import os
import re
from pyspark.sql import SparkSession
from typing import List, Dict, Callable
import socket

# Module-level handle to the current SparkSession; restart_spark() reads and rebinds it.
spark = None

# HDFS locations of the executor Python environment and the Spark jar archive
# (both marked "2.4.4" by the original author).
EXECUTOR_ENV = 'hdfs:///share/products/cvm5/lib/python/anaconda_2.4.4_ds.tar.gz'
SPARK_ARCHIVE = 'hdfs:///share/lib/spark/sparkjars-2.4.4.zip'
# Alternative pair kept for reference (Spark 2.3.1 jars):
#   EXECUTOR_ENV  = 'hdfs:///share/lib/python/env/anaconda-2019.07.tar.gz'
#   SPARK_ARCHIVE = 'hdfs:///share/lib/spark/sparkjars-2.3.1.zip'

# The executor interpreter path starts with the archive's file name, so derive it from
# EXECUTOR_ENV instead of repeating the name: switching archives then needs one edit only.
_EXECUTOR_ENV_NAME = EXECUTOR_ENV.rsplit('/', 1)[-1]

os.environ.update({
    'ARROW_LIBHDFS_DIR': '/usr/hdp/2.6.5.0-292/usr/lib',
    'HADOOP_HOME': '/usr/hdp/current/hadoop-client/',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64/',
    'HADOOP_CONF_DIR': '/etc/hadoop/conf/',
    'SPARK_HOME': '/opt/conda/lib/python3.7/site-packages/pyspark',
    'PYSPARK_PYTHON': _EXECUTOR_ENV_NAME + '/bin/python3',
})






def restart_spark(task_name: str, num_executors: int, executor_memory='4G', executor_cores=2,
                  driver_memory='2G', queue='cvm5-rnd', additional_params: Dict[str, str] = None):
    """Return a running ``(SparkContext, SparkSession)`` pair, building a YARN session if needed.

    If the module-level ``spark`` session exists and its JVM context has not been
    stopped, it is returned unchanged and no arguments are applied. Otherwise three
    free ports are picked from the user's ``*_TCP_PORT`` environment variables for
    the Spark UI, the block manager and the driver, and a Hive-enabled session is
    created and stored in the module-level ``spark``.

    Args:
        task_name: Spark application name.
        num_executors: Upper bound for dynamic allocation
            (``spark.dynamicAllocation.maxExecutors``).
        executor_memory: Value for ``spark.executor.memory``.
        executor_cores: Value for ``spark.executor.cores``.
        driver_memory: Value for ``spark.driver.memory``; also used as
            ``spark.driver.maxResultSize``.
        queue: YARN queue the application is submitted to.
        additional_params: Extra Spark options; applied last, so they override
            the defaults set here.

    Returns:
        Tuple ``(sc, spark)``: the SparkContext and its SparkSession.

    Raises:
        AssertionError: Fewer than three free ports are available.
    """
    global spark

    # Reuse the existing session while its underlying JVM context is still running.
    if spark:
        sc = spark.sparkContext
        if sc and sc._jsc and not sc._jsc.sc().isStopped():
            print('Using cached spark')
            return sc, spark

    need_ports_for_app = 3
    user_tcp_ports = _get_user_tcp_ports()
    free_ports = _get_free_ports(user_tcp_ports)
    # Raised explicitly (not via `assert`) so the check is not stripped under `python -O`.
    if len(free_ports) < need_ports_for_app:
        raise AssertionError(
            f"Not enough free ports ({len(free_ports)}), need {need_ports_for_app}, stop other apps"
        )
    app_ports = free_ports[:need_ports_for_app]

    host_ip = os.getenv('HOST_IP')

    spark_session = (
        SparkSession
        .builder
        .appName(task_name)
        .master('yarn')
        .config('spark.driver.memory', driver_memory)
        .config('spark.driver.maxResultSize', driver_memory)
        .config('spark.executor.cores', executor_cores)
        .config('spark.executor.memory', executor_memory)
        .config('spark.executor.memoryOverhead', '1G')
        .config('spark.dynamicAllocation.enabled', 'true')
        .config('spark.dynamicAllocation.maxExecutors', num_executors)
        .config('spark.sql.broadcastTimeout', '36000')
        .config('spark.dynamicAllocation.cachedExecutorIdleTimeout', '1200s')
        .config('spark.ui.port', app_ports[0])
        .config('spark.blockManager.port', app_ports[1])
        .config('spark.driver.port', app_ports[2])
        .config('spark.driver.bindAddress', '0.0.0.0')
        .config('spark.driver.extraLibraryPath', '/usr/hdp/2.6.5.0-292/hadoop/lib/native')
        .config('spark.driver.extraJavaOptions', '-Dhdp.version=current')
        .config('spark.debug.maxToStringFields', '50')
        .config('spark.yarn.queue', queue)
        .config('spark.yarn.dist.archives', EXECUTOR_ENV)
        .config('spark.yarn.archive', SPARK_ARCHIVE)
        .config('spark.yarn.am.extraJavaOptions', '-Dhdp.version=current')
        .config('spark.rpc.message.maxSize', '1024')
        .config('spark.sql.warehouse.dir', '/apps/hive/warehouse')
        .config('spark.sql.execution.pandas.respectSessionTimeZone', 'false')
        .config('spark.sql.orc.filterPushdown', 'true')
        .config('spark.sql.hive.convertMetastoreOrc', 'true')
        .config('spark.shuffle.service.enabled', 'true')
        .config('spark.hadoop.yarn.timeline-service.enabled', 'false')
        .config('spark.hadoop.yarn.client.failover-proxy-provider',
                'org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider')
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
        .config('spark.kryoserializer.buffer.max', '128m')
        .config('spark.executor.extraLibraryPath', '/usr/hdp/2.6.5.0-292/hadoop/lib/native')
    )

    # The builder stringifies option values, so an unset HOST_IP would otherwise be
    # advertised to executors as the literal host name 'None'.
    if host_ip:
        spark_session = spark_session.config('spark.driver.host', host_ip)
    else:
        print('HOST_IP is not set; leaving spark.driver.host at the Spark default')

    # Caller-supplied options go last so they can override anything configured above.
    if additional_params:
        for key, value in additional_params.items():
            spark_session = spark_session.config(key, value)

    spark = (
        spark_session
        .enableHiveSupport()
        .getOrCreate()
    )
    sc = spark.sparkContext

    return sc, spark


def _get_user_tcp_ports() -> List[str]:
    """Collect the values of this user's ``*TCP_PORT`` environment variables.

    The user key is built from ``HOSTNAME``: the value is upper-cased, split on
    ``'-'``, the first segment is dropped and the remaining segments are joined
    with ``'_'`` (``prefix-name`` -> ``NAME``, ``prefix-name-2esurname`` ->
    ``NAME_2ESURNAME``). Any number of segments after the prefix is accepted.

    Returns:
        Values (as strings) of every environment variable whose name contains
        the user key and ends with ``TCP_PORT``.

    Raises:
        ValueError: ``HOSTNAME`` is unset/empty, or carries no user part after
            the first ``'-'``. An empty key would match every variable.
    """
    hostname = os.getenv('HOSTNAME')
    if not hostname:
        raise ValueError('HOSTNAME is not set; cannot select the user TCP port variables')

    user_full_name = '_'.join(hostname.upper().split('-')[1:])
    if not user_full_name:
        raise ValueError(f"Unexpected HOSTNAME {hostname!r}: expected '<prefix>-<user>'")

    return [
        value
        for key, value in os.environ.items()
        if user_full_name in key and key.endswith('TCP_PORT')
    ]


def _get_free_ports(ports: List[str]):
    """Filter ``ports`` down to those nothing is currently listening on.

    A port counts as free when a TCP connection attempt to ``0.0.0.0:<port>``
    fails, i.e. ``connect_ex`` returns a non-zero error code. Entries are
    returned unchanged (not converted to int) and in their original order.
    """
    def _nothing_listening(candidate) -> bool:
        # One short-lived probe socket per port, always closed afterwards.
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            return probe.connect_ex(('0.0.0.0', int(candidate))) != 0
        finally:
            probe.close()

    return [candidate for candidate in ports if _nothing_listening(candidate)]


In [3]:
# Start (or reuse) the YARN session for this notebook: at most 21 executors with
# 3 cores / 5G each, a 7G driver, and 300 shuffle partitions.
sc, spark = restart_spark(
    task_name=name,
    num_executors=21,
    executor_memory='5G',
    executor_cores=3,
    driver_memory='7G',
    additional_params={"spark.sql.shuffle.partitions": "300"},
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Lower log verbosity from the default "WARN" to errors only.
sc.setLogLevel('ERROR')

In [5]:
import pandas as pd
import sys
from pyspark.sql import functions as F
import datetime
from datetime import timedelta

# Extend the import path; presumably this directory holds logic_filters and segmentation.
sys.path.append('/home/jovyan/glow-byte-filters-pyspark')
# NOTE(review): wildcard imports hide which names these modules contribute; nothing in the
# cells visible here refers to them — consider importing the needed names explicitly.
from logic_filters import * 
from segmentation import *

In [6]:
# Table names in "<database>.<table>" form for spark.table().
# Only PRODUCTS is read in the cells visible here.
LOYALTY_CARDS = "hive_ssa_tc5.loyalty_card"
LOYALTY_CARDHOLDERS = "hive_ssa_tc5.loyalty_cardholder"
ACCOUNTS = "hive_ssa_tc5.account"
CVM5_GUESTS = "hive_cvm_acrm.cvm5_guests"

DIM_STORE = "hive_ssa_main.dim_store"
CHECKS_HEADERS = "hive_ssa_main.fct_rtl_txn"
CHECKS_ITEMS = "hive_ssa_main.fct_rtl_txn_item"
PRODUCTS = "hive_ssa_tc5.cvm_product"

### Выбираем гостей нужного юзкейса

In [7]:
# Collect every product row with brand code '5910' to the driver as a pandas DataFrame.
domik_pd = (
    spark.table(PRODUCTS)
    .filter(F.col('plu_brand_code') == '5910')
    .toPandas()
)

                                                                                

In [9]:
# Export the product list to an Excel file in the working directory, without the index column.
domik_pd.to_excel('domik.xlsx', index=False)

21/12/25 00:47:49 ERROR cluster.YarnClientSchedulerBackend: YARN application has exited unexpectedly with state FAILED! Check the YARN application logs for more details.
21/12/25 00:47:49 ERROR cluster.YarnClientSchedulerBackend: Diagnostics message: Attempt recovered after RM restartDue to executor failures all available nodes are blacklisted
21/12/25 00:47:49 ERROR client.TransportClient: Failed to send RPC RPC 6001073632394304271 to /192.168.234.35:54210: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(...)(Unknown Source)
21/12/25 00:47:49 ERROR cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Sending RequestExecutors(0,0,Map(),Set()) to AM was unsuccessful
java.io.IOException: Failed to send RPC RPC 6001073632394304271 to /192.168.234.35:54210: java.nio.channels.ClosedChannelException
	at org.apache.spark.network.client.TransportClient$RpcChannelListener.handleFailure(TransportClient.jav

In [8]:
# Preview the collected frame (the rendered output is truncated to the first and last rows).
domik_pd

Unnamed: 0,hive_dataflow_dttm,dataflow_id,dataflow_dttm,plu_rk,plu_id,plu_cmh_rk,plu_cmh_id,plu_nm,plu_type_dk,control_category_dk,...,plu_socially_significant_flg,price_segment_dk,price_segment_desc,abc_segment_dk,abc_segment_desc,vat_rate_amt,plu_origin_country_dk,plu_vendor_dk,plu_vendor_nm,alco_flg
0,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,5aba5c30764c0258444ac56e1676b3a6,41534,19520ff1ae1abe3dcfd7fa92e3d4efed,53601,ДОМИК В ДЕРЕВНЕ Сметана 20% 250г,ZFRE,SF,...,0,,,,,10,RU,,,0
1,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,b4b82c74bbda6f1d9e70d46ab8d9a379,42661,47875fc4f58459c2221a0dbfa6c18ec0,51874,Ряженка ДОМИК В ДЕРЕВНЕ 3.2% 515мл,ZFRE,SF,...,0,2,СЦС,3,30,10,RU,41000091,ВИММ-БИЛЬ-ДАНН,0
2,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,98b30dbcd311cd710b5c2e88489d8a0c,3016531,e9c0f878cb8bc2f6e4162338bf606eb3,519966,"ДОМИК В ДЕРЕВНЕ Молоко топленое 4,0%500г",ZFRE,SF,...,1,,,,,10,RU,,,0
3,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,13226543d26b0d4d891507cef39be99b,3246433,1c8d99039a336a371a85d0f41d450010,32015,ДОМИК В ДЕРЕВНЕ Сметана 20% 400г,ZFRE,SF,...,0,2,СЦС,3,30,10,RU,41000091,ВИММ-БИЛЬ-ДАНН,0
4,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,9bdc6c395cf05572c47a4d752c710c49,2127903,f40a4e2fdba543f3f38be24c6c89ed0c,479050,"Д.В.Д.Ряженка 2,5% 500г",ZFRE,SF,...,0,,,,,10,RU,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,9d55935a5f533da23d9479d834822a86,3408244,793993a90a57d9de775eed88fb5197ad,33339,ДОМИК В ДЕРЕВНЕ Продукт кеф.3.2%ПЕТ930гр,ZFRE,SF,...,0,,,4,40,10,RU,41000091,ВИММ-БИЛЬ-ДАНН,0
180,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,f1890fdd0726e752f56da9eceac87fc4,3016552,6a9c6b91fd7e9687b9aecff63a85c0d3,520397,"ДОМИК В ДЕРЕВНЕ Ряженка 4,0% 450г",ZFRE,SF,...,0,,,4,40,10,RU,41000091,ВИММ-БИЛЬ-ДАНН,0
181,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,8c16be742f598101f256975e023fb952,3408242,a5387830c206aecfc66fdba91406b5de,33338,ДОМИК В ДЕРЕВНЕ Продукт кеф.2.5%ПЕТ930гр,ZFRE,SF,...,0,,,4,40,10,RU,41000091,ВИММ-БИЛЬ-ДАНН,0
182,2021-12-24 07:45:30.922,118134,2021-12-24 07:40:55,fb71453e62740fe864f83d9ecb0232cd,82263,04dc7f83b12a75e7111a8d5df1e8073e,93047,Д.В.Д.Йогурт 3.9%ч.см.мел.135г,ZFRE,SF,...,0,,,,,10,RU,,,0
