## s000 environment setup

### s001 libraries

In [0]:
# libraries
import os
import pyspark
from pyspark import sql
from pyspark.sql import functions as f
from pyspark.sql import Window

### s002 sf connectivity

In [0]:
%run "./utils_spark_df"

In [0]:
%run "./utils_stratified_sampling"

In [0]:
# ------------- Use snowflake utility
# Handle to the Snowflake Spark connector's JVM-side Utils class.
# NOTE(review): sfUtils is not referenced again in the visible cells - confirm it is still needed.
sfUtils = sc._jvm.net.snowflake.spark.snowflake.Utils

# ------------ login to snowflake
# Despite the variable name, this secret is handed to the connector as "pem_private_key" below.
password = dbutils.secrets.get(scope = "auea-kv-sbx-dxdtlprdct01", key = "sfdbrsdskey")

# Connection options reused by every spark.read.format("snowflake") call in the visible cells.
options = {
  "sfUrl": "vodafonenz_prod.australia-east.azure.snowflakecomputing.com/", 
  "sfUser": "SVC_LAB_DS_DATABRICKS",
  # turn literal backslash-n sequences stored in the secret into real newlines
  "pem_private_key": password.replace('\\n', '\n'),
  "sfDatabase": "LAB_ML_STORE",
  "sfSchema": "SANDBOX",
  "sfWarehouse": "LAB_DS_WH"
}

### s003 directories

In [0]:
# Parent directories that the feature-store and ML-store paths below are built from
dir_fs_data_parent = "/mnt/feature-store-prod-lab"
dir_mls_data_parent = "/mnt/ml-store-prod-lab/classification"

In [0]:
# Directory the ML-store score tables are loaded from (see the s102 loads)
dir_mls_data_score = os.path.join(dir_mls_data_parent, "d400_model_score")

In [0]:
# Feature-store layer directories, all children of dir_fs_data_parent.
# Only dir_fs_data_fea and dir_fs_data_serv are read in the visible cells.
dir_fs_data_meta = os.path.join(dir_fs_data_parent, 'd000_meta')
dir_fs_data_raw =  os.path.join(dir_fs_data_parent, 'd100_raw')
dir_fs_data_int =  os.path.join(dir_fs_data_parent, "d200_intermediate")
dir_fs_data_prm =  os.path.join(dir_fs_data_parent, "d300_primary")
dir_fs_data_fea =  os.path.join(dir_fs_data_parent, "d400_feature")
dir_fs_data_target = os.path.join(dir_fs_data_parent, "d500_movement")
dir_fs_data_serv = os.path.join(dir_fs_data_parent, "d600_serving")

## s100 data import

In [0]:
# Snapshot selectors: these values filter the feature-store and churn-score tables below
vt_param_reporting_date = "2025-03-09"
vt_param_reporting_cycle_type = "rolling cycle"

### s101 feature store

In [0]:
# Serving-layer table; it is filtered to the reporting snapshot later, when df_base_full is built
df_fs_master = spark.read.format("delta").load(os.path.join(dir_fs_data_serv, "serv_mobile_oa_consumer"))

In [0]:
# Unit-base features for the campaign snapshot: a single reporting date and cycle
# type, reduced to the id columns plus product_holding_desc.
df_fs_unit_base = (
    spark
    .read
    .format("delta")
    .load(os.path.join(dir_fs_data_fea, "d401_mobile_oa_consumer/fea_unit_base"))
    # use the notebook parameter rather than a repeated literal, so this snapshot
    # always matches the cycle type applied to the serving and score tables
    .filter(f.col("reporting_cycle_type") == vt_param_reporting_cycle_type)
    .filter(f.col("reporting_date") == vt_param_reporting_date)
    .select("reporting_date","reporting_cycle_type","fs_cust_id","fs_acct_id","fs_srvc_id","product_holding_desc")
)
display(df_fs_unit_base.limit(3))

### s102 ml store

In [0]:
# Model score tables. Only the broadband cross-sell and churn scores are loaded; the other loads are disabled.
# NOTE(review): the cross-sell scores come from a dev testing path, not dir_mls_data_score - confirm this is intended.
#df_mls_score_dr_apple = spark.read.format("delta").load(os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_device_replacement_apple_pred30d"))
# df_mls_score_upsell_plan = spark.read.format("delta").load("/mnt/feature-store-dev/dev_users/dev_rz/d999_testing/score_data_params_mobile_oa_consumer_srvc_upsell_plan_endless_pred30d_202405_exp2_v1_20240602")
df_mls_score_xsell_bb = spark.read.format("delta").load("/mnt/feature-store-dev/dev_users/dev_rz/d999_testing/score_data_params_mobile_oa_consumer_srvc_xsell_bb_pred30d_202405_exp1_v1_20250302")
# df_mls_score_ifp = spark.read.format("delta").load(os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_upsell_ifp_pred30d"))
df_mls_score_churn = spark.read.format("delta").load(os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_churn_pred30d"))

In [0]:
dir_data_sfmc = "/mnt/prod_sfmc/imports/DataAnalytics/"

# previous wallet campaign
# dir_data_sfmc already ends with "/", so build the paths with os.path.join
# instead of an f-string that would produce a doubled slash.
df_campaign_hist_01 = spark.read.options(header = True).csv(
    os.path.join(dir_data_sfmc, "250305-RM-MOBPM-Wallet-Samsung-Galaxy-SEND-S24-WASHV2.csv")
)
df_campaign_hist_02 = spark.read.options(header = True).csv(
    os.path.join(dir_data_sfmc, "250305-RM-MOBPM-Wallet-Samsung-Galaxy-SEND-S24FE-WASH.csv")
)


### s103 active bb base

In [0]:
# Snowflake SQL for the broadband base used to detect converged customers:
# Consumer-segment services that are either of type Broadband or on one of the
# listed propositions, excluding SuperWifi propositions and the LLR access type,
# with status Active, Transferred or Suspended.
query_bb_active = """
select 
f_serv.service_id, 
f_serv.billing_account_number,
f_serv.customer_id, 
f_serv.record_start_date_time,
f_serv.record_end_date_time, 
f_serv.current_record_flag, 
service_type_name,
service_access_type_name, 
service_status_name, 
connection_activation_date, 
connection_deactivation_date, 
connect_order_date, 
plan_name, 
-- pt."L1 GROUP" as plan_group_L1 ,
-- pt."L2 GROUP" as plan_group_L2 ,
-- pm."RECURRING MARGIN" as recurring_margin,
proposition_name, 
customer_mkt_segment
from prod_mar_tech.modelled.f_service_relationship_curr f_serv 
inner join prod_mar_tech.modelled.d_service_curr d_serv 
on f_serv.d_service_key = d_serv.d_service_key
inner join prod_mar_tech.modelled.d_billing_account_curr d_billing 
on d_billing.d_billing_account_key = f_serv.d_billing_account_key
inner join prod_mar_tech.modelled.d_customer_curr d_cust 
on d_cust.d_customer_key = f_serv.d_customer_key
-- left join lab_ml_store.sandbox.sc_bb_plan_type_martech pt on pt."PLAN NAME" = d_serv.plan_name
-- left join lab_ml_store.sandbox.sc_bb_plan_margin pm on pm."BROADBAND  PLAN TYPE" = pt."L1 GROUP"
where 
1=1 
and customer_mkt_segment in ('Consumer')
and ( 
    SERVICE_TYPE_NAME in ('Broadband') 
    or proposition_name in (
    'home phone wireless'
    , 'Home Wireless Broadband'
    , 'Fixed Consumer'
    , 'Home Wireless Broadband plus Calling'
    )
)
and proposition_name not in ('Consumer SuperWifi', 'Business SuperWifi')
and service_access_type_name not in ('LLR')
and service_status_name in ( 'Active', 'Transferred', 'Suspended');
"""

In [0]:
# Run query_bb_active against Snowflake with the shared connection options
df_bb_base = (
    spark.read.format("snowflake")
    .options(**options)
    .option("query", query_bb_active)
    .load()
)

### s104 non converged customer

In [0]:
# Mobile units whose customer has no row in the broadband base
# (left anti join on customer id), reduced to the id columns.
df_non_converged = (
    df_fs_unit_base
    .where(f.col('reporting_date') == vt_param_reporting_date)
    .join(df_bb_base, on=f.col('fs_cust_id') == f.col('customer_id'), how='anti')
    .select('reporting_date', 'reporting_cycle_type', 'fs_cust_id', 'fs_acct_id', 'fs_srvc_id')
)

In [0]:
display(df_non_converged.count())

display(df_non_converged.limit(10))

# df_non_converged keeps only the id columns, so product_holding_desc has to be
# read from df_fs_unit_base. The left_semi join restricts df_fs_unit_base to the
# units present in df_non_converged without adding or duplicating columns.
display(
    df_fs_unit_base
    .join(
        df_non_converged
        , ['reporting_date', 'reporting_cycle_type', 'fs_cust_id', 'fs_acct_id', 'fs_srvc_id']
        , 'left_semi'
    )
    .filter(f.lower(f.col('product_holding_desc')).contains('broadband'))
)  # inactive

### s105 global control

In [0]:
# Distinct customer ids from the global-control table
df_gc_curr = (
    spark.read.format("delta")
    .load("/mnt/ml-store-dev/dev_users/dev_el/marketing_programs/global_control/mobile_oa_consumer")
    .select("fs_cust_id")
    .distinct()
)

print(df_gc_curr.count())

display(df_gc_curr.limit(10))

### s106 email address filter

In [0]:
# Customer export from Snowflake, with customer_id duplicated as fs_cust_id for joins
df_martech_cust = (
     spark.read
    .format("snowflake")
    .options(**options)
    .option(
        "query"
        , """
            select *
            , customer_id as fs_cust_id
            from PROD_MAR_TECH.SERVING.EXPORT_CUSTOMER
        """
    )
    .load()
)

# lower_col_names is defined outside this notebook - presumably lower-cases the column names; confirm
df_martech_cust = lower_col_names(df_martech_cust)

display(df_martech_cust.limit(10))

# uniqueness check: any customer_id appearing on more than one row
display(df_martech_cust.groupBy("customer_id").count().filter(f.col("count")>1))

In [0]:
# SFMC master preference table from Snowflake, with contactkey duplicated as fs_cust_id for joins
df_martech_unsub = (
    spark
    .read
    .format("snowflake")
    .options(**options)
    .option(
        "query"
        , """
            select *
            , contactkey as fs_cust_id
            from PROD_MAR_TECH.SERVING.SFMC_MASTER_PREFERENCE
        """
    )
    .load()
)

# lower_col_names is defined outside this notebook - presumably lower-cases the column names; confirm
df_martech_unsub = lower_col_names(df_martech_unsub)

display(df_martech_unsub.limit(10))

# uniqueness check: any fs_cust_id appearing on more than one row
display(df_martech_unsub.groupBy("fs_cust_id").count().filter(f.col("count")>1))

In [0]:
# Spot check of one customer id in the preference and customer exports
display(df_martech_unsub.where(f.col("fs_cust_id") == "1-MO838RP"))

display(df_martech_cust.where(f.col("customer_id") == "1-MO838RP"))

In [0]:
# Bill-to export from Snowflake, with customer_id / billing_account_number
# duplicated as fs_cust_id / fs_acct_id for joins
df_martech_bill = (
     spark.read
    .format("snowflake")
    .options(**options)
    .option(
        "query"
        , """
            select *
            , customer_id as fs_cust_id
            , billing_account_number as fs_acct_id
            from PROD_MAR_TECH.SERVING.EXPORT_BILL_TO
        """
    )
    .load()
)

# lower_col_names is defined outside this notebook - presumably lower-cases the column names; confirm
df_martech_bill = lower_col_names(df_martech_bill)

# spot check of a single billing account
display(
    df_martech_bill
    .filter(f.col("fs_acct_id") == "413426010")
    .limit(10)
)

# accounts that appear on more than one row, with the customer ids attached to them
display(
    df_martech_bill
    .groupBy("fs_acct_id")
    .agg(
        f.count('*').alias("count")
        , f.collect_list("customer_id").alias("customer_ids")
    )
    .filter(f.col("count")>1)
)

In [0]:
# One row per billing account with a resolved email_address:
# the primary contact email, unless it is the literal "Unknown" or NULL,
# in which case the billing profile email is used.
# NOTE(review): fs_cust_id and billing_profile_email are each taken with max()
# independently, so for accounts with several rows they may come from different rows.
df_martech = (
    df_martech_bill
    .groupBy("fs_acct_id")
    .agg(
        f.max("fs_cust_id").alias("fs_cust_id"),
        f.max("billing_profile_email").alias("billing_profile_email"),
    )
    .distinct()
    .join(
        df_martech_cust
        .groupBy("fs_cust_id")
        .agg(f.max("customer_primary_contact_email").alias("customer_primary_contact_email")),
        on=["fs_cust_id"],
        how="left",
    )
    .where(f.col("fs_acct_id").isNotNull())
    .withColumn(
        "email_address",
        f.when(
            f.col("customer_primary_contact_email") == "Unknown",
            f.col("billing_profile_email"),
        ).otherwise(
            f.coalesce(f.col("customer_primary_contact_email"), f.col("billing_profile_email"))
        ),
    )
)

display(df_martech.limit(100))

# number of accounts left without any email address
display(df_martech.where(f.col("email_address").isNull()).count())

# uniqueness check on fs_acct_id
display(df_martech.groupBy("fs_acct_id").count().where(f.col("count") > 1))

## s200 Data processing

### s201 base candidate

In [0]:
# Campaign base: serving-layer rows for the reporting snapshot, restricted to
# non-converged units and to units whose product holding does not mention broadband.
df_base_full = (
    df_fs_master
    .filter(f.col("reporting_date") == vt_param_reporting_date)
    .filter(f.col("reporting_cycle_type") == vt_param_reporting_cycle_type)
    # keep only units whose customer has no row in the broadband base
    .join(
        df_non_converged
        , ['reporting_date','reporting_cycle_type','fs_acct_id','fs_cust_id','fs_srvc_id']
        , 'inner'
    )
    # joined again to the unit base to bring in product_holding_desc
    # NOTE(review): if df_fs_master also carries product_holding_desc the filter below would be ambiguous - confirm
    .join(
        df_fs_unit_base
        , ['reporting_date','reporting_cycle_type','fs_acct_id','fs_cust_id','fs_srvc_id']
        , 'inner'
    )
    # case-insensitive match; rows with a NULL product_holding_desc are dropped too,
    # because the negated predicate evaluates to NULL
    .filter(~f.col("product_holding_desc").ilike("%broadband%"))
)

In [0]:
# Preview and row count of the campaign base
display(df_base_full.limit(10))
display(df_base_full.count())

### s202 exclusion flag

In [0]:
# current global control: one 'Y' flag row per customer
df_tmp_excl_01 = (
    df_gc_curr.select("fs_cust_id").distinct()
    .withColumn("gc_curr_flag", f.lit('Y'))
)

# current active campaign (first wallet campaign file): Contact_Key is the customer id
df_tmp_excl_02_01 = (
    df_campaign_hist_01.select(f.col("Contact_Key").alias('fs_cust_id')).distinct()
    .withColumn("ch01_flag", f.lit('Y'))
)

# current active campaign (second wallet campaign file)
df_tmp_excl_02_02 = (
    df_campaign_hist_02.select(f.col("Contact_Key").alias('fs_cust_id')).distinct()
    .withColumn("ch02_flag", f.lit('Y'))
)

# customer without emails
# email_address is NULL when neither the primary contact email nor the billing
# profile email is populated, so NULL is treated as "no email" alongside the
# literal "Unknown". Previously only "Unknown" was flagged and NULL-email accounts
# fell through to the target segment.
# NOTE(review): accounts missing from df_martech altogether still end up with
# no_email_flag = 'N' after the left join and fillna downstream - confirm that is acceptable.
df_tmp_excl_06 = (
    df_martech
    .filter(
        f.col("email_address").isNull()
        | (f.col("email_address") == "Unknown")
    )
    .select("fs_acct_id")
    .distinct()
    .withColumn(
        "no_email_flag"
        , f.lit('Y')
    )
)

# customer unsub email: customers whose unsubscribedemail value equals 'true'
df_tmp_excl_09 = (
    df_martech_unsub
    .where(f.col("unsubscribedemail") == 'true')
    .select("fs_cust_id")
    .distinct()
    .withColumn("unsub_email_flag", f.lit('Y'))
)


### s203 ML score

In [0]:
# Broadband cross-sell score per (customer, account): keep the single row ranked
# first by descending propensity_top_ntile.
# NOTE(review): the score date is the literal '2025-03-02', not vt_param_reporting_date - confirm.
# NOTE(review): rows tied on propensity_top_ntile are ranked arbitrarily by row_number.
df_base_score_xsell_bb = (
    df_mls_score_xsell_bb
    .where(f.col("reporting_date") == '2025-03-02')
    .where(f.col("reporting_cycle_type") == vt_param_reporting_cycle_type)
    .withColumn(
        'rank_acct',
        f.row_number().over(
            Window.partitionBy('fs_cust_id', 'fs_acct_id').orderBy(f.desc('propensity_top_ntile'))
        ),
    )
    .where(f.col('rank_acct') == 1)
    .select(
        "fs_cust_id",
        "fs_acct_id",
        f.col("reporting_date").alias("xsell_predict_date"),
        f.col("propensity_score").alias("xsell_score"),
        f.col("propensity_segment_qt").alias("xsell_segment"),
        f.col("propensity_top_ntile").alias("xsell_top_ntile"),
    )
    .distinct()
)

# Churn score per (customer, account) for the reporting snapshot: keep the single
# row ranked first by descending propensity_top_ntile.
# NOTE(review): rows tied on propensity_top_ntile are ranked arbitrarily by row_number.
df_base_score_churn = (
    df_mls_score_churn
    .where(f.col("reporting_date") == vt_param_reporting_date)
    .where(f.col("reporting_cycle_type") == vt_param_reporting_cycle_type)
    .withColumn(
        'rank_acct',
        f.row_number().over(
            Window.partitionBy('fs_cust_id', 'fs_acct_id').orderBy(f.desc('propensity_top_ntile'))
        ),
    )
    .where(f.col('rank_acct') == 1)
    .select(
        "fs_cust_id",
        "fs_acct_id",
        f.col("predict_date").alias("churn_predict_date"),
        f.col("propensity_score").alias("churn_score"),
        f.col("propensity_segment_qt").alias("churn_segment"),
        f.col("propensity_top_ntile").alias("churn_top_ntile"),
    )
    .distinct()
)

In [0]:
# Row count vs distinct accounts for each score frame (churn first, then cross-sell)
for df_score in (df_base_score_churn, df_base_score_xsell_bb):
    display(df_score.agg(f.count('*'), f.countDistinct('fs_acct_id')))

### s204 proc candidate

In [0]:
# Candidate table at (customer, account) level:
#   1. distinct ids from the campaign base
#   2. left-join cross-sell and churn scores
#   3. left-join the exclusion flags and default missing flags to 'N'
#   4. derive target_segment, with the first matching exclusion winning
#   5. keep rows that have a cross-sell segment and recompute both ntiles (30 buckets,
#      highest score first) over the remaining rows
df_proc_full = (
    df_base_full
    .select("reporting_date", "reporting_cycle_type", "fs_cust_id", "fs_acct_id")
    .distinct()
    .join(df_base_score_xsell_bb, on=['fs_cust_id', 'fs_acct_id'], how="left")
    .join(df_base_score_churn, on=['fs_cust_id', 'fs_acct_id'], how="left")
    .join(df_tmp_excl_01, on=["fs_cust_id"], how="left")
    .join(df_tmp_excl_02_01, on=["fs_cust_id"], how="left")
    .join(df_tmp_excl_02_02, on=["fs_cust_id"], how="left")
    .join(df_tmp_excl_06, on=["fs_acct_id"], how="left")
    .join(df_tmp_excl_09, on=["fs_cust_id"], how="left")
    .fillna(
        value='N',
        subset=['gc_curr_flag', 'ch01_flag', 'ch02_flag', "no_email_flag", "unsub_email_flag"],
    )
    .withColumn(
        "target_segment",
        f.when(f.col("gc_curr_flag") == 'Y', f.lit("z2.global control - curr"))
        .when(
            (f.col("ch01_flag") == 'Y') | (f.col("ch02_flag") == 'Y'),
            f.lit("z4.wallet campaign"),
        )
        .when(f.col("no_email_flag") == 'Y', f.lit("z6.no primary email"))
        .when(f.col("unsub_email_flag") == 'Y', f.lit("z9.unsubscribed from email"))
        .otherwise(f.lit("a.target")),
    )
    .where(f.col("xsell_segment").isNotNull())
    # un-partitioned windows: ntiles are computed across the whole candidate table
    .withColumn("churn_top_ntile", f.ntile(30).over(Window.orderBy(f.desc("churn_score"))))
    .withColumn("xsell_top_ntile", f.ntile(30).over(Window.orderBy(f.desc("xsell_score"))))
    .distinct()
)

In [0]:
# Size of each target segment: rows, distinct customers, distinct accounts and
# distinct (account, customer) pairs.
display(
    df_proc_full
    .groupBy("target_segment")
    .agg(
        f.count("*")
        #, f.countDistinct("fs_srvc_id")
        , f.countDistinct('fs_cust_id')
        , f.countDistinct('fs_acct_id')
        # count distinct pairs directly: concatenating the two ids without a
        # separator could map different pairs to the same string
        , f.countDistinct('fs_acct_id', 'fs_cust_id')
    )
    .orderBy("target_segment")
)

display(df_proc_full.limit(100))

### s205 campaign base

In [0]:
# Add global ranks (1 = highest score) for churn and cross-sell propensity.
# partitionBy(f.lit(1)) places every row in a single window partition.
df_campaign_full = (
    df_proc_full
    .withColumn(
        "churn_rank",
        f.row_number().over(Window.partitionBy(f.lit(1)).orderBy(f.desc("churn_score"))),
    )
    .withColumn(
        "xsell_rank",
        f.row_number().over(Window.partitionBy(f.lit(1)).orderBy(f.desc("xsell_score"))),
    )
)

In [0]:
# Share of a.target accounts per propensity segment: churn first, then cross-sell
for vt_segment_col in ["churn_segment", "xsell_segment"]:
    display(
        df_campaign_full
        .where(f.col("target_segment") == 'a.target')
        .groupBy(vt_segment_col)
        .agg(f.countDistinct("fs_acct_id").alias("conn"))
        .withColumn("conn_tot", f.sum("conn").over(Window.partitionBy(f.lit(1))))
        .withColumn("pct", f.col("conn") / f.col("conn_tot"))
    )


In [0]:
# Persist the ranked candidate table as Delta, replacing data and schema at the target path
(
    df_campaign_full.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save('/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell')
)

In [0]:
# Reload the persisted candidate table so later cells read the saved copy instead of recomputing it
df_campaign_full = spark.read.format("delta").load('/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell')

In [0]:
# Spot check of a single billing account in the candidate table
display(df_campaign_full.where(f.col("fs_acct_id") == "408820791"))

In [0]:
# Distinct a.target accounts in the H and then the M cross-sell segment
for vt_segment in ["H", "M"]:
    display(
        df_campaign_full
        .where(f.col("xsell_segment") == vt_segment)
        .where(f.col("target_segment") == 'a.target')
        .agg(f.countDistinct("fs_acct_id"))
    )

# Distinct excluded accounts that are not flagged by either wallet campaign file
display(
    df_campaign_full
    .where(f.col("target_segment") != 'a.target')
    .where(f.col("ch02_flag") != 'Y')
    .where(f.col("ch01_flag") != 'Y')
    .agg(f.countDistinct("fs_acct_id"))
)



### s206 local control - H & M

In [0]:
# organic conversion around 0.35% 
# last campaign conversion H ~ 0.69% 
# last campaign conversion M ~ 0.57% 

vt_param_target_size = 75000
vt_param_control_size = 10000
ls_param_strata = ['churn_top_ntile', 'xsell_top_ntile']

# create_sample_target / find_similar_sample / evaluate_sample are defined outside
# this notebook - presumably stratified-sampling helpers; confirm their contract.

# Reference frame built from the a.target accounts in the H/M cross-sell segments
df_sample_target = create_sample_target(
    df_campaign_full
    .where(f.col("target_segment") == 'a.target')
    .where(f.col("xsell_segment").isin(["H", "M"])),
    ls_param_strata,
)

# Control candidates are drawn from the excluded (non a.target) H/M accounts
df_campaign_cand_control = find_similar_sample(
    df_campaign_full
    .where(f.col("target_segment") != 'a.target')
    .where(f.col("xsell_segment").isin(["H", "M"])),
    size=vt_param_control_size,
    strata=ls_param_strata,
    df_target=df_sample_target,
)

# Target candidates are drawn from the a.target H/M accounts
df_campaign_cand_target = find_similar_sample(
    df_campaign_full
    .where(f.col("target_segment") == 'a.target')
    .where(f.col("xsell_segment").isin(["H", "M"])),
    size=vt_param_target_size,
    strata=ls_param_strata,
    df_target=df_sample_target,
)

# Size and score summary for each cohort, control first
for vt_label, df_cand in (("control", df_campaign_cand_control), ("target", df_campaign_cand_target)):
    print(vt_label)
    display(
        df_cand.agg(
            f.countDistinct("fs_acct_id"),
            f.count('*'),
            f.median("churn_score"),
            f.avg("churn_score"),
            f.median("xsell_score"),
            f.avg("xsell_score"),
        )
    )

evaluate_sample(df_campaign_cand_control, df_campaign_cand_target, ['xsell_score', 'churn_score'])

In [0]:
# Overlap check: (account, customer) pairs present in both target and control
display(
    df_campaign_cand_target.join(
        df_campaign_cand_control, on=['fs_acct_id', 'fs_cust_id'], how='inner'
    )
)

In [0]:
# Persist both cohorts as Delta (target first, then control), replacing data and schema
for df_cand, vt_path in (
    (df_campaign_cand_target, '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell_target'),
    (df_campaign_cand_control, '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell_control'),
):
    (
        df_cand.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(vt_path)
    )

In [0]:
# Reload the persisted cohorts so later cells read the saved samples instead of re-running the sampling
df_campaign_cand_target = spark.read.format('delta').load('/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell_target')
df_campaign_cand_control = spark.read.format('delta').load('/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/bbxsell/20250311_bbxsell_control')

In [0]:
# Size and score summary for each reloaded cohort, control first
for vt_label, df_cand in (("control", df_campaign_cand_control), ("target", df_campaign_cand_target)):
    print(vt_label)
    display(
        df_cand.agg(
            f.countDistinct("fs_acct_id"),
            f.countDistinct('fs_cust_id'),
            f.count('*'),
            f.median("churn_score"),
            f.avg("churn_score"),
            f.median("xsell_score"),
            f.avg("xsell_score"),
        )
    )

evaluate_sample(df_campaign_cand_control, df_campaign_cand_target, ['xsell_score', 'churn_score'])

In [0]:
# Target/control overlap on the (account, customer) pair, then account only, then customer only
for ls_keys in (['fs_acct_id', 'fs_cust_id'], ['fs_acct_id'], ['fs_cust_id']):
    display(df_campaign_cand_target.join(df_campaign_cand_control, on=ls_keys, how='inner'))

In [0]:
# Target rows that match the broadband base by customer id
display(
    df_campaign_cand_target.join(
        df_bb_base, on=f.col('fs_cust_id') == f.col('customer_id'), how='inner'
    )
)

# Target rows that match the broadband base by billing account
display(
    df_campaign_cand_target.join(
        df_bb_base, on=f.col('fs_acct_id') == f.col('billing_account_number'), how='inner'
    )
)

In [0]:
# Distinct customers and row count per cross-sell segment, target first then control
for df_cand in (df_campaign_cand_target, df_campaign_cand_control):
    display(
        df_cand
        .groupBy('xsell_segment')
        .agg(f.countDistinct('fs_cust_id'), f.count('*'))
    )


In [0]:
# Stack both cohorts into one frame, labelled by target_cohort.
df_output_campaign = (
    df_campaign_cand_control
    .withColumn('target_cohort', f.lit('Local Control'))
    # unionByName aligns columns by name; a positional union would silently
    # misalign values if the two frames ever had a different column order
    .unionByName(
        df_campaign_cand_target
        .withColumn('target_cohort', f.lit('Target'))
    )
)

In [0]:
# Per-cohort size and median scores
display(
    df_output_campaign
    .groupBy('target_cohort')
    .agg(
        f.count('*'),
        f.countDistinct('fs_cust_id'),
        f.countDistinct('fs_acct_id'),
        f.median('churn_score'),
        f.median('xsell_score'),
    )
)

In [0]:
# Export frame: distinct customer ids of every non-control row, stamped with today's date
df_sfmc_export = (
    df_output_campaign
    .where(f.col('target_cohort') != 'Local Control')
    .select(
        f.col('fs_cust_id').alias('contact_key'),
        f.current_date().alias('data_update_date'),
    )
    .distinct()
)

display(df_sfmc_export.count())
display(df_sfmc_export.limit(10))

# row count should equal the distinct contact_key count
display(df_sfmc_export.agg(f.count('*'), f.countDistinct('contact_key')))


In [0]:
dir_data_sfmc = "/mnt/prod_sfmc/imports/DataAnalytics/"

# Write the export as a single CSV through the /dbfs local path.
# dir_data_sfmc already ends with "/", so the file name is appended with
# os.path.join instead of an f-string that would produce a doubled slash.
(
    df_sfmc_export
    .toPandas()
    .to_csv(os.path.join(f"/dbfs{dir_data_sfmc}", "250312-SH-FixBB-Join-Credit.csv"), index=False)
)

In [0]:
# Echo the local (/dbfs) path of the export file; os.path.join avoids a doubled
# slash because dir_data_sfmc already ends with "/"
os.path.join(f"/dbfs{dir_data_sfmc}", "250312-SH-FixBB-Join-Credit.csv")

In [0]:
# Read the exported CSV back through Spark to verify it
df_check = spark.read.options(header=True).csv(f"{dir_data_sfmc}250312-SH-FixBB-Join-Credit.csv")

In [0]:
# Preview of the CSV that was read back
display(df_check.limit(10))

In [0]:
dir_data_sfmc = "/mnt/prod_sfmc/imports/DataAnalytics/"

# NOTE: df_check is rebound here to a different file (the SendDE CSV), replacing the export read above
df_check = spark.read.options(header=True).csv(f"{dir_data_sfmc}3-14-2025-250313-SH-FIX-Broadband-Join-Credit-SendDE.csv")

In [0]:
# Row count vs distinct contact keys in the file, followed by a preview
display(df_check.agg(f.count('*'), f.countDistinct('contact_key')))

display(df_check.limit(10))

## s300 Recheck Wash list

In [0]:
dir_data_sfmc = "/mnt/prod_sfmc/imports/DataAnalytics/"

# SendDE file, keeping rows with a non-null contact_key that does not contain 'Consumer'
df_bb_test = (
    spark.read.options(header=True)
    .csv(f"{dir_data_sfmc}3-14-2025-250313-SH-FIX-Broadband-Join-Credit-SendDE.csv")
    .where(f.col('contact_key').isNotNull())
    .where(~f.col('contact_key').contains('Consumer'))
)


In [0]:
# Distinct campaign customers that also appear in the SendDE file
display(
    df_output_campaign
    .join(df_bb_test, on=f.col('fs_cust_id') == f.col('contact_key'), how='inner')
    .agg(f.countDistinct('fs_cust_id'))
)

In [0]:
display(df_bb_test.limit(10))

# Distinct contact keys vs row count, after re-applying the same contact_key filters
display(
    df_bb_test
    .where(f.col('contact_key').isNotNull())
    .where(~f.col('contact_key').contains('Consumer'))
    .agg(f.countDistinct('contact_key'), f.count('*'))
)

In [0]:
# Matched rows and distinct matched customers between the campaign output and the SendDE file
display(
    df_output_campaign
    .join(df_bb_test, on=f.col('fs_cust_id') == f.col('contact_key'), how='inner')
    .agg(f.count('*'), f.countDistinct('fs_cust_id'))
)

In [0]:
# Compare the control cohort with the campaign rows that were found in the SendDE file
for vt_label, df_cohort in (
    ("control", df_campaign_cand_control),
    (
        "test",
        df_output_campaign.join(
            df_bb_test, on=f.col('fs_cust_id') == f.col('contact_key'), how='inner'
        ),
    ),
):
    print(vt_label)
    display(
        df_cohort.agg(
            f.countDistinct("fs_acct_id"),
            f.countDistinct('fs_cust_id'),
            f.count('*'),
            f.median("churn_score"),
            f.avg("churn_score"),
            f.median("xsell_score"),
            f.avg("xsell_score"),
        )
    )