### s000 environment setup

### s001 libraries

In [0]:
# libraries
import os
import pyspark
from pyspark import sql
from pyspark.sql import functions as f
from pyspark.sql import Window

### s002 sf connectivity

In [0]:
%run "./utils_spark_df"

In [0]:
%run "./utils_stratified_sampling"

In [0]:
# Handle on the Snowflake connector's JVM-side Utils class, reached through the
# SparkContext gateway (`sc` is provided by the notebook runtime).
sfUtils = sc._jvm.net.snowflake.spark.snowflake.Utils

# Service-account credential, fetched from the secret scope rather than
# hard-coded. It is passed to the connector as a PEM private key below.
password = dbutils.secrets.get(
    scope="auea-kv-sbx-dxdtlprdct01",
    key="sfdbrsdskey",
)

# Connection settings reused by every Snowflake read in this notebook.
options = {
    "sfUrl": "vodafonenz_prod.australia-east.azure.snowflakecomputing.com/",
    "sfUser": "SVC_LAB_DS_DATABRICKS",
    # Turn any literal backslash-n sequences in the secret into real line breaks.
    "pem_private_key": password.replace('\\n', '\n'),
    "sfDatabase": "LAB_ML_STORE",
    "sfSchema": "SANDBOX",
    "sfWarehouse": "LAB_DS_WH",
}

In [0]:
# SQL sent to Snowflake through the connector's "query" option: every column of
# SFMC_EMAIL_PERFORMANCE for the single email name listed below, skipping
# customer ids that start with "TEST" (ILIKE is a case-insensitive match).
query_sfmc_send = """
select * from PROD_MAR_TECH.SERVING.SFMC_EMAIL_PERFORMANCE
WHERE EMAILNAME IN ('250313-SH-FIX-EML-M-CRS-P4-Broadband-Join-Credit') 
AND CUSTOMER_ID NOT ILIKE 'TEST%';
"""

In [0]:
# Load the SFMC send rows selected by query_sfmc_send as a Spark DataFrame,
# using the shared Snowflake connection options.
sfmc_reader = spark.read.format("snowflake").options(**options)
df_sfmc_base = sfmc_reader.option("query", query_sfmc_send).load()

In [0]:
# SQL sent to Snowflake through the connector's "query" option: the full
# SC_CONSUMER_BB_MARTECH_ALL_CONNECTIONS_ORDERS table from the sandbox schema
# (no row or column filtering is applied here).
query_bb_order = """
select * from LAB_ML_STORE.SANDBOX.SC_CONSUMER_BB_MARTECH_ALL_CONNECTIONS_ORDERS;
"""

In [0]:
# Load the broadband connection/order rows selected by query_bb_order as a
# Spark DataFrame, using the shared Snowflake connection options.
bb_order_reader = spark.read.format("snowflake").options(**options)
df_bb_order = bb_order_reader.option("query", query_bb_order).load()

In [0]:
# Delta table holding the 20250311_bbxsell campaign data.
df_campaign_full = (
    spark.read
    .format("delta")
    .load('/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell')
)

In [0]:
# Campaign rows for every customer present in the SFMC send data:
# de-duplicate the sent customer ids, then inner-join onto the campaign table.
df_sfmc_send = (
    df_sfmc_base
    .select(f.col('customer_id').alias('fs_cust_id'))
    .distinct()
    .join(df_campaign_full, on=['fs_cust_id'], how='inner')
)

# Sanity check on the join: distinct customers, distinct accounts, row count.
display(
    df_sfmc_send.agg(
        f.countDistinct('fs_cust_id'),
        f.countDistinct('fs_acct_id'),
        f.count('*'),
    )
)

### s003 regen control

In [0]:
# Size of the regenerated control group and the columns used as sampling strata.
vt_param_control_size = 30000
ls_param_strata = ['churn_top_ntile', 'xsell_top_ntile']

# Sent customers joined to the campaign table (distinct customer ids from the
# SFMC send data, inner-joined on fs_cust_id).
df_sfmc_cust_ids = (
    df_sfmc_base
    .select(f.col('customer_id').alias('fs_cust_id'))
    .distinct()
)
df_sfmc_send = df_sfmc_cust_ids.join(df_campaign_full, ['fs_cust_id'], 'inner')

# Reference population for the control match: sent rows in the 'a.target'
# segment whose xsell_segment is H or M.
# (A further churn_segment != "H" restriction is currently disabled.)
is_sample_target = (
    (f.col("target_segment") == 'a.target')
    & f.col("xsell_segment").isin(["H", "M"])
)
df_sample_target = create_sample_target(
    df_sfmc_send.filter(is_sample_target),
    ls_param_strata,
)



# Candidate pool for the control group: campaign rows whose fs_cust_id does NOT
# appear among the sent customers (anti join).
# (Extra xsell_segment / ch01_flag / ch02_flag / churn_segment restrictions on
# the pool are currently disabled.)
df_control_pool = df_campaign_full.join(df_sfmc_send, ['fs_cust_id'], 'anti')

# Draw the control sample from the pool, using df_sample_target as the
# reference and the configured strata and size.
df_campaign_cand_control = find_similar_sample(
    df_control_pool,
    size=vt_param_control_size,
    strata=ls_param_strata,
    df_target=df_sample_target,
)


# One summary definition applied to both groups so the two tables line up:
# distinct accounts, row count, and median/mean of the churn and xsell scores.
score_summary = [
    f.countDistinct("fs_acct_id"),
    f.count('*'),
    f.median("churn_score"),
    f.avg("churn_score"),
    f.median("xsell_score"),
    f.avg("xsell_score"),
]

# NOTE(review): the "target" side here is every sent customer (df_sfmc_send),
# while the control sample was matched to the filtered df_sample_target --
# confirm this is the intended comparison.
for group_label, df_group in (
    ("control", df_campaign_cand_control),
    ("target", df_sfmc_send),
):
    print(group_label)
    display(df_group.agg(*score_summary))

# Compare control and target on the two model scores.
evaluate_sample(
    df_campaign_cand_control,
    df_sfmc_send,
    ['xsell_score', 'churn_score'],
)

In [0]:
# Overlap checks between the sent customers and the regenerated control group:
# show the rows the two sets share on (account, customer), on account alone,
# and on customer alone.
for overlap_keys in (
    ['fs_acct_id', 'fs_cust_id'],
    ['fs_acct_id'],
    ['fs_cust_id'],
):
    display(df_sfmc_send.join(df_campaign_cand_control, overlap_keys, 'inner'))

In [0]:

# Persist the regenerated control group as Delta, replacing both the data and
# the schema already stored at this path.
# (Dynamic partition overwrite is intentionally left disabled.)
regen_control_path = '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell_regen_control'
regen_control_writer = (
    df_campaign_cand_control.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
regen_control_writer.save(regen_control_path)

In [0]:
# Re-read the saved control group so later cells work from the stored Delta
# copy instead of the in-memory sampling result.
df_campaign_cand_control = (
    spark.read
    .format('delta')
    .load('/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell_regen_control')
)

In [0]:
# Row count of the stored control group, followed by a 10-row peek at the
# broadband order data.
n_control_rows = df_campaign_cand_control.count()
display(n_control_rows)

df_bb_order_peek = df_bb_order.limit(10)
display(df_bb_order_peek)

In [0]:
# Orders with CONNECT_ORDER_DATE from 2025-03-12 to 2025-03-31, both bounds
# inclusive.
df_orders_in_window = df_bb_order.filter(
    f.col('CONNECT_ORDER_DATE').between('2025-03-12', '2025-03-31')
)

# Control-group customers with an order in the window: distinct accounts,
# distinct customers, and matched rows.
display(
    df_campaign_cand_control
    .join(
        df_orders_in_window,
        f.col('fs_cust_id') == f.col('customer_id'),
        'inner',
    )
    .agg(
        f.countDistinct('fs_acct_id'),
        f.countDistinct('fs_cust_id'),
        f.count('*'),
    )
)

# Control-group size, for reference against the counts above.
display(df_campaign_cand_control.count())

In [0]:
# Orders with CONNECT_ORDER_DATE from 2025-03-12 to 2025-03-31, both bounds
# inclusive.
df_orders_in_window = df_bb_order.filter(
    f.col('CONNECT_ORDER_DATE').between('2025-03-12', '2025-03-31')
)

# Only these plans are reported in the breakdown.
plans_of_interest = [
    'Unlimited - UltraFast HFC Max',
    'Fibre Everyday (Unlimited Broadband)',
    'Fibre Max (Unlimited Broadband)',
    'Wireless Unlimited',
]

# Control-group customers with an order in the window, counted per plan.
display(
    df_campaign_cand_control
    .join(
        df_orders_in_window,
        f.col('fs_cust_id') == f.col('customer_id'),
        'inner',
    )
    .filter(f.col('PLAN_NAME').isin(plans_of_interest))
    .groupBy('PLAN_NAME')
    .agg(
        f.countDistinct('fs_acct_id'),
        f.countDistinct('fs_cust_id'),
        f.count('*'),
    )
)

# Control-group size, for reference against the counts above.
display(df_campaign_cand_control.count())

In [0]:
# Orders with CONNECT_ORDER_DATE from 2025-03-12 to 2025-03-31, both bounds
# inclusive.
df_orders_in_window = df_bb_order.filter(
    f.col('CONNECT_ORDER_DATE').between('2025-03-12', '2025-03-31')
)

# Sent customers with an order in the window: distinct accounts, distinct
# customers, and matched rows.
display(
    df_sfmc_send
    .join(
        df_orders_in_window,
        f.col('fs_cust_id') == f.col('customer_id'),
        'inner',
    )
    .agg(
        f.countDistinct('fs_acct_id'),
        f.countDistinct('fs_cust_id'),
        f.count('*'),
    )
)

# Number of sent rows, for reference against the counts above.
display(df_sfmc_send.count())

In [0]:
# Measurement window for orders, both bounds inclusive. The end date is
# 2025-03-31 (previously 2025-04-01 in this cell) so that it matches the window
# used for the target totals and for both control-group cells; with a different
# end date the per-plan figures neither reconcile with the target total nor
# compare like-for-like with the control breakdown.
order_window_start = '2025-03-12'
order_window_end = '2025-03-31'

df_orders_in_window = (
    df_bb_order
    .filter(f.col('CONNECT_ORDER_DATE') >= order_window_start)
    .filter(f.col('CONNECT_ORDER_DATE') <= order_window_end)
)

# Sent customers with an order in the window, counted per plan: distinct
# accounts, distinct customers, and matched rows.
display(
    df_sfmc_send
    .join(
        df_orders_in_window,
        f.col('fs_cust_id') == f.col('customer_id'),
        'inner',
    )
    .groupBy('PLAN_NAME')
    .agg(
        f.countDistinct('fs_acct_id'),
        f.countDistinct('fs_cust_id'),
        f.count('*'),
    )
)


In [0]:
# Measurement window for orders, both bounds inclusive. The end date is
# 2025-03-31 (previously 2025-04-01 in this cell) so the rows listed here are
# drawn from the same window as the target totals and the control-group cells.
order_window_start = '2025-03-12'
order_window_end = '2025-03-31'

df_orders_in_window = (
    df_bb_order
    .filter(f.col('CONNECT_ORDER_DATE') >= order_window_start)
    .filter(f.col('CONNECT_ORDER_DATE') <= order_window_end)
)

# Row-level drill-down: sent customers whose in-window order is on the
# 'Fibre Everyday (Unlimited Broadband)' plan.
display(
    df_sfmc_send
    .join(
        df_orders_in_window,
        f.col('fs_cust_id') == f.col('customer_id'),
        'inner',
    )
    .filter(f.col('PLAN_NAME') == 'Fibre Everyday (Unlimited Broadband)')
)
