### s000 environment setup

In [0]:
# libraries
import os

import pyspark
from pyspark import sql
from pyspark.sql import functions as f
from pyspark.sql import Window

In [0]:
%run "./utils_spark_df"

In [0]:
%run "./utils_stratified_sampling"

In [0]:
%run "./sc_utils_stratified_sampling"

In [0]:
# JVM-side handle to the Snowflake Spark connector's utility class.
sfUtils = sc._jvm.net.snowflake.spark.snowflake.Utils

# The Snowflake service account's PEM private key is read from a Databricks
# secret scope rather than being written into the notebook.
password = dbutils.secrets.get(
    scope="auea-kv-sbx-dxdtlprdct01",
    key="sfdbrsdskey",
)

# Connector options shared by every Snowflake read in this notebook.
options = dict(
    sfUrl="vodafonenz_prod.australia-east.azure.snowflakecomputing.com/",
    sfUser="SVC_LAB_DS_DATABRICKS",
    # Literal backslash-n sequences in the secret are turned into real newlines.
    pem_private_key=password.replace('\\n', '\n'),
    sfDatabase="LAB_ML_STORE",
    sfSchema="SANDBOX",
    sfWarehouse="LAB_DS_WH",
)

In [0]:
# Root mount points: feature store data and ML store (classification) data.
dir_fs_data_parent = "/mnt/feature-store-prod-lab"
dir_mls_data_parent = "/mnt/ml-store-prod-lab/classification"

In [0]:
# Folder under the ML store root from which the model score tables are loaded.
dir_mls_data_score = os.path.join(dir_mls_data_parent, "d400_model_score")

In [0]:
# Feature store layer folders, all under dir_fs_data_parent.
# Only the d400 (feature), d500 (movement) and d600 (serving) layers are read
# in the cells visible in this notebook.
dir_fs_data_meta = os.path.join(dir_fs_data_parent, 'd000_meta')
dir_fs_data_raw =  os.path.join(dir_fs_data_parent, 'd100_raw')
dir_fs_data_int =  os.path.join(dir_fs_data_parent, "d200_intermediate")
dir_fs_data_prm =  os.path.join(dir_fs_data_parent, "d300_primary")
dir_fs_data_fea =  os.path.join(dir_fs_data_parent, "d400_feature")
dir_fs_data_target = os.path.join(dir_fs_data_parent, "d500_movement")
dir_fs_data_serv = os.path.join(dir_fs_data_parent, "d600_serving")

### s100 data import

In [0]:
# Snapshot parameters: every reporting_date / reporting_cycle_type filter
# below uses these two values.
# Previously used snapshot, kept for reference:
# vt_param_reporting_date = "2024-09-29"
vt_param_reporting_date = "2025-01-19"
vt_param_reporting_cycle_type = "rolling cycle"

In [0]:
# Feature store inputs (Delta): serving master, IFP device features and
# IFP upsell movements, each at service and bill level.
df_fs_master = spark.read.load(
    os.path.join(dir_fs_data_serv, "serv_mobile_oa_consumer"),
    format="delta",
)
df_fs_ifp_srvc = spark.read.load(
    os.path.join(dir_fs_data_fea, "d401_mobile_oa_consumer/fea_ifp_device_on_service"),
    format="delta",
)
df_fs_ifp_bill = spark.read.load(
    os.path.join(dir_fs_data_fea, "d401_mobile_oa_consumer/fea_ifp_device_on_bill"),
    format="delta",
)
df_mvnt_ifp_srvc = spark.read.load(
    os.path.join(dir_fs_data_target, "d501_mobile_oa_consumer/mvmt_ifp_upsell_on_service"),
    format="delta",
)
df_mvnt_ifp_bill = spark.read.load(
    os.path.join(dir_fs_data_target, "d501_mobile_oa_consumer/mvmt_ifp_upsell_on_bill"),
    format="delta",
)

In [0]:
# Model score tables (Delta): device replacement, IFP upsell, churn and write-off.
# Not loaded: the Apple-specific device replacement table
# "mobile_oa_consumer_srvc_device_replacement_apple_pred30d".
df_mls_score_dr = spark.read.load(
    os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_device_replacement_pred30d"),
    format="delta",
)
df_mls_score_ifp = spark.read.load(
    os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_upsell_ifp_pred30d"),
    format="delta",
)
df_mls_score_churn = spark.read.load(
    os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_churn_pred30d"),
    format="delta",
)
df_mls_score_ar = spark.read.load(
    os.path.join(dir_mls_data_score, "mobile_oa_consumer_srvc_writeoff_pred120d"),
    format="delta",
)

In [0]:
# Campaign lead lists previously written for the email and SMS channels.
df_output_campaign_email = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/250123-JS-MOBPM-Samsung-NPI-Consumer_Email',
    format="delta",
)
df_output_campaign_sms = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/250123-JS-MOBPM-Samsung-NPI-Consumer_SMS',
    format="delta",
)

In [0]:
# Rows of export_bill_ifp whose IFP_PHONE_UPGRADE_FLAG is 'Y', pulled from Snowflake.
query_one_upgrade = "select * from PROD_MAR_TECH.SERVING.export_bill_ifp where IFP_PHONE_UPGRADE_FLAG = 'Y';"

df_one_upgrade = (
    spark.read.format("snowflake")
    .options(**options)
    .option("query", query_one_upgrade)
    .load()
)



In [0]:
# SFMC email performance rows for this campaign's email send.
query_sfmc = (
    " \n"
    "select  * from PROD_MAR_TECH.SERVING.SFMC_EMAIL_PERFORMANCE\n"
    "where campaignname in ('250123-JS-MOBPM-EML-M-DEV-P4-Samsung-NPI-Consumer_Email')\n"
)

df_sfmc_email = (
    spark.read.format("snowflake")
    .options(**options)
    .option("query", query_sfmc)
    .load()
)

# SFMC on-net SMS message rows for this campaign's SMS send.
query_sfmc_sms = (
    "\n"
    "select * from PROD_MAR_TECH.SERVING.SFMC_ON_NET_SMS_MESSAGE\n"
    "where sms_name in ('250123-JS-MOBPM-SMS-M-DEV-P4-Samsung-NPI-Consumer');\n"
)

df_sfmc_sms = (
    spark.read.format("snowflake")
    .options(**options)
    .option("query", query_sfmc_sms)
    .load()
)

In [0]:
# Persist both Snowflake extracts as Delta so later cells can reload them
# without querying Snowflake again. Existing data and schema are replaced.
df_sfmc_email.write.save(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/output_leads/250123_Samsung_NPI_Consumer_Email',
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

df_sfmc_sms.write.save(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/output_leads/250123_Samsung_NPI_Consumer_SMS',
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)



In [0]:
# Reload the persisted SFMC extracts; from here on both frames are backed by Delta.
df_sfmc_sms = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/output_leads/250123_Samsung_NPI_Consumer_SMS',
    format='delta',
)

df_sfmc_email = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/output_leads/250123_Samsung_NPI_Consumer_Email',
    format='delta',
)

In [0]:
def _top_score_per_customer(df_score, vt_prefix, vt_reporting_date, vt_reporting_cycle_type):
    """Reduce a model score table to one row per customer for one snapshot.

    The table is filtered to the given reporting date and cycle type. The
    best-ranked row is kept first per (fs_cust_id, fs_acct_id) and then per
    fs_cust_id, where "best" means the highest propensity_top_ntile.

    Args:
        df_score: Score DataFrame with reporting_date, reporting_cycle_type,
            fs_cust_id, fs_acct_id, propensity_score, propensity_segment_qt
            and propensity_top_ntile columns.
        vt_prefix: Prefix for the output columns, e.g. "ifp" gives
            ifp_score, ifp_segment and ifp_top_ntile.
        vt_reporting_date: Value matched against reporting_date.
        vt_reporting_cycle_type: Value matched against reporting_cycle_type.

    Returns:
        DataFrame with fs_cust_id, <prefix>_score, <prefix>_segment and
        <prefix>_top_ntile, one row per fs_cust_id.
    """
    # Ordering on the ntile alone leaves ties unresolved, so row_number() could
    # keep a different row (and score) on every run. propensity_score is added
    # as a tie-break to make the selection repeatable.
    # NOTE(review): assumes a higher score is the preferred row within an ntile - confirm.
    ls_order = [f.desc('propensity_top_ntile'), f.desc('propensity_score')]

    return (
        df_score
        .filter(f.col("reporting_date") == vt_reporting_date)
        .filter(f.col("reporting_cycle_type") == vt_reporting_cycle_type)
        # best row per customer + account
        .withColumn(
            'rank_acct'
            , f.row_number().over(
                Window
                .partitionBy('fs_cust_id', 'fs_acct_id')
                .orderBy(*ls_order)
            )
        )
        .filter(f.col('rank_acct') == 1)
        # best row per customer across its accounts
        .withColumn(
            'rank_cust'
            , f.row_number().over(
                Window
                .partitionBy('fs_cust_id')
                .orderBy(*ls_order)
            )
        )
        .filter(f.col('rank_cust') == 1)
        .select(
            "fs_cust_id"
            , f.col("propensity_score").alias(f"{vt_prefix}_score")
            , f.col("propensity_segment_qt").alias(f"{vt_prefix}_segment")
            , f.col("propensity_top_ntile").alias(f"{vt_prefix}_top_ntile")
        )
    )


# Customer-level scores for the reporting snapshot, one frame per model.
df_base_score_ifp = _top_score_per_customer(
    df_mls_score_ifp, "ifp", vt_param_reporting_date, vt_param_reporting_cycle_type
)

df_base_score_dr = _top_score_per_customer(
    df_mls_score_dr, "dr", vt_param_reporting_date, vt_param_reporting_cycle_type
)

df_base_score_churn = _top_score_per_customer(
    df_mls_score_churn, "churn", vt_param_reporting_date, vt_param_reporting_cycle_type
)

# The write-off model is exposed under the "risk" prefix.
df_base_score_ar = _top_score_per_customer(
    df_mls_score_ar, "risk", vt_param_reporting_date, vt_param_reporting_cycle_type
)

In [0]:
# Attach the customer-level IFP, device replacement and churn scores to the
# SFMC email rows. All three joins are inner joins, so rows whose customer is
# missing any of the three scores are dropped.
df_email_output = (
    df_sfmc_email.alias('a')
    .join(
        df_base_score_ifp.alias('c'),
        on=f.col('a.CUSTOMER_ID') == f.col('c.fs_cust_id'),
        how='inner',
    )
    .join(
        df_base_score_dr.alias('d'),
        on=f.col('a.CUSTOMER_ID') == f.col('d.fs_cust_id'),
        how='inner',
    )
    .join(
        df_base_score_churn.alias('e'),
        on=f.col('a.CUSTOMER_ID') == f.col('e.fs_cust_id'),
        how='inner',
    )
    # Disabled check for customers with more than two rows:
    #.withColumn('index', f.count('*').over(Window.partitionBy('a.CUSTOMER_ID')))
    #.filter(f.col('index') >2)
    .select(*[
        'CUSTOMER_ID',
        'c.fs_cust_id',
        'ifp_score',
        'ifp_segment',
        'ifp_top_ntile',
        'dr_score',
        'dr_segment',
        'dr_top_ntile',
        'churn_score',
        'churn_segment',
        'churn_top_ntile',
    ])
)



In [0]:
# Share of distinct emailed customers per propensity segment, one table per
# model. The three tables were copy-pasted; a loop keeps them identical.
for vt_segment_col in ["ifp_segment", "dr_segment", "churn_segment"]:
    display(
        df_email_output
        .groupBy(vt_segment_col)
        .agg(f.countDistinct("fs_cust_id").alias("acct"))
        # total over all segments, via a single-partition window
        .withColumn(
            "acct_tot"
            , f.sum("acct").over(Window.partitionBy(f.lit(1)))
        )
        .withColumn(
            "pct"
            , f.col("acct") / f.col("acct_tot")
        )
    )

# Overall score profile of the emailed customers.
# NOTE(review): df_email_output is not de-duplicated per customer here. If the
# SFMC extract holds several rows per customer, these medians and averages are
# row-weighted rather than customer-weighted - confirm.
display(
    df_email_output
    .agg(
        f.countDistinct("fs_cust_id")
        , f.median("churn_score")
        , f.avg("churn_score")
        , f.median("dr_score")
        , f.avg("dr_score")
        , f.median("ifp_score")
        , f.avg("ifp_score")
    )
)

### s102 global control

In [0]:
# Current global control group, reduced to its distinct customer ids.
df_gc_curr = (
    spark.read
    .load(
        "/mnt/ml-store-dev/dev_users/dev_el/marketing_programs/global_control/mobile_oa_consumer",
        format="delta",
    )
    .select("fs_cust_id")
    .distinct()
)

# Size of the control group, then a small preview.
print(df_gc_curr.count())

display(df_gc_curr.limit(10))

## s200 Data processing

### s200 base candidate

In [0]:
# Serving master restricted to the reporting snapshot (date + cycle type).
df_base_full = df_fs_master.filter(
    (f.col("reporting_date") == vt_param_reporting_date)
    & (f.col("reporting_cycle_type") == vt_param_reporting_cycle_type)
)

# Preview and row count of the snapshot.
display(df_base_full.limit(100))

display(df_base_full.count())

### s202 exclusion flag

In [0]:
# Exclusion 01: customers in the current global control group.
df_tmp_excl_01 = (
    df_gc_curr
    .select("fs_cust_id")
    .distinct()
    .withColumn("gc_curr_flag", f.lit('Y'))
)

# Exclusion 03 is not applied in this notebook. It was sketched as a "wpc_flag"
# built from df_wallet_program_control, keyed on fs_srvc_id_sha2.

# Exclusion 04: customers present in the One Upgrade extract.
df_tmp_excl_04 = (
    df_one_upgrade
    .select(f.col('CUSTOMER_ID').alias('fs_cust_id'))
    .distinct()
    .withColumn('one_upgrade_flag', f.lit('Y'))
)


In [0]:
# Distinct customers and services per network device brand in the snapshot.
display(
    df_base_full
    .groupBy('network_dvc_brand')
    .agg(f.countDistinct('fs_cust_id'), f.countDistinct('fs_srvc_id'))
)

In [0]:
# Per customer: number of distinct services, and whether any service has an
# Apple device (network_dvc_brand compared case-insensitively).
# A missing brand counts as non-Apple.
df_apple_cust = (
    df_base_full
    .groupBy('fs_cust_id')
    .agg(
        f.countDistinct('fs_srvc_id').alias('distinct_services'),
        f.max(
            f.when(f.lower(f.col("network_dvc_brand")) == 'apple', 1).otherwise(0)
        ).alias('has_apple'),
    )
)


In [0]:
# Customer-level processing frame: one row per customer in the reporting
# snapshot, enriched with model scores, exclusion flags and the Apple-device
# indicator, then assigned to exactly one target_segment.
df_proc_full = (
    df_base_full
    .select(
        "reporting_date"
        , "reporting_cycle_type"
        , "fs_cust_id"
    )
    .distinct()
    .join(df_base_score_ifp, ["fs_cust_id"], "left")
    .join(df_base_score_dr, ["fs_cust_id"], "left")
    .join(df_base_score_churn, ["fs_cust_id"], "left")
    .join(df_base_score_ar, ["fs_cust_id"], "left")
    .join(df_tmp_excl_01, ["fs_cust_id"], "left")
    .join(df_tmp_excl_04, ["fs_cust_id"], "left")
    .join(df_apple_cust, ['fs_cust_id'], 'left')
    .join(
        # Emailed customers carry an explicit flag. Joining on a column name
        # keeps a single fs_cust_id in the result, so testing the right-hand
        # key ("a.fs_cust_id") after the join is not a reliable match check.
        df_email_output
        .select('fs_cust_id')
        .distinct()
        .withColumn('email_target_flag', f.lit('Y'))
        , ['fs_cust_id']
        , 'left'
    )
    .fillna(
        value='N'
        , subset=['gc_curr_flag', 'one_upgrade_flag']
    )
    .withColumn(
        "target_segment"
        # Order matters: the first matching rule wins.
        , f.when(
            f.col('email_target_flag').isNotNull()
            , f.lit('a.target')
        )
        .when(
            f.col("gc_curr_flag") == 'Y'
            , f.lit("z2.global control - curr")
        )
        .when(
            f.col("one_upgrade_flag") == 'Y'
            , f.lit("z4.one_upgrade")
        )
        .when(
            f.col("risk_top_ntile") >= 98
            , f.lit("z6.High Bad Debt Risk")
        )
        .when(
            f.col('has_apple') == 1
            , f.lit('z7.Apple Device')
        )
        .otherwise(f.lit("a.misc"))
    )
    # The helper flag is only needed for the segment rule; dropping it keeps
    # the output schema unchanged.
    .drop('email_target_flag')
    # Re-bucket each score into 30 tiles over the whole base. This overwrites
    # the *_top_ntile values that came from the score tables.
    # These windows have no partition key, so Spark sorts on a single partition.
    .withColumn("churn_top_ntile", f.ntile(30).over(Window.orderBy(f.desc("churn_score"))))
    .withColumn("dr_top_ntile", f.ntile(30).over(Window.orderBy(f.desc("dr_score"))))
    .withColumn("ifp_top_ntile", f.ntile(30).over(Window.orderBy(f.desc("ifp_score"))))
)

In [0]:
# Rows and distinct customers per target_segment, sorted by segment label.
display(
    df_proc_full
    .groupBy("target_segment")
    .agg(f.count("*"), f.countDistinct("fs_cust_id"))
    .orderBy("target_segment")
)

# Preview and total row count of the processing frame.
display(df_proc_full.limit(100))
display(df_proc_full.count())

In [0]:
# Add a global rank per model (1 = highest score) across the whole base.
# Columns are appended in the order ifp_rank, dr_rank, churn_rank.
df_campaign_full = df_proc_full
for vt_model in ("ifp", "dr", "churn"):
    df_campaign_full = df_campaign_full.withColumn(
        f"{vt_model}_rank",
        f.row_number().over(
            Window
            .partitionBy(f.lit(1))
            .orderBy(f.desc(f"{vt_model}_score"))
        ),
    )

# Campaign requirements (1. Samsung/Android phone, 2. not in One Upgrade) are
# currently NOT applied here; the filters are kept for reference:
#   .filter(f.col("one_upgrade_flag") != "Y")
#   .filter(f.col('network_dvc_os') == 'Android')


In [0]:
# Share of distinct target customers per propensity segment, one table per
# model. The three tables were copy-pasted; a loop keeps them identical.
for vt_segment_col in ["churn_segment", "dr_segment", "ifp_segment"]:
    display(
        df_campaign_full
        .filter(f.col("target_segment") == 'a.target')
        .groupBy(vt_segment_col)
        .agg(f.countDistinct("fs_cust_id").alias("conn"))
        # total over all segments, via a single-partition window
        .withColumn(
            "conn_tot"
            , f.sum("conn").over(Window.partitionBy(f.lit(1)))
        )
        .withColumn(
            "pct"
            , f.col("conn") / f.col("conn_tot")
        )
    )

In [0]:
# Persist the full campaign frame as Delta, replacing existing data and schema.
df_campaign_full.write.save(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/regen_control/250123_Samsung_NPI_Consumer_email',
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

In [0]:
# Reload the persisted campaign frame so later cells read from Delta.
df_campaign_full = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/regen_control/250123_Samsung_NPI_Consumer_email',
    format='delta',
)

### s206 local control - H

In [0]:
# Regenerate a local control group that resembles the emailed target group.
# create_sample_target / find_similar_sample are not defined in this notebook;
# presumably they come from the %run utility notebooks - confirm their contracts there.

# Requested control size, and the columns used as sampling strata.
vt_param_control_size = 35000
ls_param_strata = ["dr_top_ntile", "ifp_top_ntile", "churn_top_ntile"]

# Reference profile built from the target customers over the strata columns.
df_sample_target = create_sample_target(
     df_campaign_full
    .filter(f.col("target_segment") == 'a.target')
    #.filter(f.col("wallet_eligibility_flag") == 'Y')
    , ls_param_strata
)

# Candidate pool for the control: everyone except the target group,
# Apple-device customers and One Upgrade customers.
# NOTE(review): "z2.global control - curr" and "z6.High Bad Debt Risk" remain
# in the pool - confirm this is intended.
df_campaign_cand_control = find_similar_sample(
    df_campaign_full
    .filter(f.col("target_segment") != 'a.target')
    .filter(f.col("target_segment") != 'z7.Apple Device')
    .filter(f.col('target_segment')!= 'z4.one_upgrade')
    # .filter(f.col("ch01_flag") != 'Y')
    # .filter(f.col("ch02_flag") != 'Y')
    # .filter(f.col("ch03_flag") != 'Y')
    , size = vt_param_control_size
    , strata = ls_param_strata
    , df_target = df_sample_target
)


# Score profile of the sampled control, to compare with the target below.
print("control")
display(
    df_campaign_cand_control
    .agg(
        f.countDistinct("fs_cust_id")
        , f.median("churn_score")
        , f.avg("churn_score")
        , f.median("dr_score")
        , f.avg("dr_score")
        , f.median("ifp_score")
        , f.avg("ifp_score")
    )
)


# Same profile for the target group.
print("target")
display(
    df_campaign_full
    .filter(f.col('target_segment') == 'a.target')
    .agg(
        f.countDistinct("fs_cust_id")
        , f.median("churn_score")
        , f.avg("churn_score")
        , f.median("dr_score")
        , f.avg("dr_score")
        , f.median("ifp_score")
        , f.avg("ifp_score")
    )
)

# Target candidates: the rows of the campaign frame in the 'a.target' segment.
df_campaign_cand_target = (
    df_campaign_full
    .filter(f.col('target_segment') == 'a.target')
)

In [0]:
# Persist the regenerated control group as Delta, replacing existing data and schema.
df_campaign_cand_control.write.save(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/regen_control/250123_Samsung_NPI_Consumer_email_regen_control',
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

In [0]:
# Persist the target group as Delta, replacing existing data and schema.
df_campaign_cand_target.write.save(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/regen_control/250123_Samsung_NPI_Consumer_email_regen_target',
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

In [0]:
# Reload the persisted control and target groups from Delta.
df_campaign_cand_control = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/regen_control/250123_Samsung_NPI_Consumer_email_regen_control',
    format='delta',
)
df_campaign_cand_target = spark.read.load(
    '/mnt/ml-lab/dev_users/dev_sc/ml_campaigns/regen_control/250123_Samsung_NPI_Consumer_email_regen_target',
    format='delta',
)

In [0]:
# Overlap check: customers present in both the control and the target group.
# NOTE(review): presumably this is expected to return no rows - confirm.
display(
    df_campaign_cand_control.join(
        df_campaign_cand_target,
        on="fs_cust_id",
        how="inner",
    )
)