In [0]:
# libraries
import os

import pyspark
from pyspark import sql
from pyspark.sql import functions as f
from pyspark.sql import Window

In [0]:
# Handle to the Snowflake Spark connector's JVM-side Utils class.
sfUtils = sc._jvm.net.snowflake.spark.snowflake.Utils

# Snowflake key-pair credential, read from the Databricks secret scope
# rather than being hard-coded in the notebook.
password = dbutils.secrets.get(
    scope="auea-kv-sbx-dxdtlprdct01",
    key="sfdbrsdskey",
)

# Connector options for Snowflake reads (used via `.options(**options)`).
# Literal backslash-n sequences in the secret are converted to real newlines
# before the value is handed over as the PEM private key.
options = dict(
    sfUrl="vodafonenz_prod.australia-east.azure.snowflakecomputing.com/",
    sfUser="SVC_LAB_DS_DATABRICKS",
    pem_private_key=password.replace('\\n', '\n'),
    sfDatabase="LAB_ML_STORE",
    sfSchema="SANDBOX",
    sfWarehouse="LAB_DS_WH",
)

In [0]:
# Mount-point roots: the feature store and the ML store (classification).
dir_fs_data_parent, dir_mls_data_parent = (
    "/mnt/feature-store-prod-lab",
    "/mnt/ml-store-prod-lab/classification",
)

In [0]:
# Folder under the ML-store root from which the model-score tables are read.
dir_mls_data_score = os.path.join(
    dir_mls_data_parent,
    "d400_model_score",
)

In [0]:
# One directory per feature-store layer, each a direct child of the
# feature-store root. The targets and folder names are listed in the same order.
(
    dir_fs_data_meta,
    dir_fs_data_raw,
    dir_fs_data_int,
    dir_fs_data_prm,
    dir_fs_data_fea,
    dir_fs_data_target,
    dir_fs_data_serv,
) = (
    os.path.join(dir_fs_data_parent, layer_folder)
    for layer_folder in (
        'd000_meta',
        'd100_raw',
        "d200_intermediate",
        "d300_primary",
        "d400_feature",
        "d500_movement",
        "d600_serving",
    )
)

In [0]:
# Snapshot selectors: the model-score tables are filtered on these two
# values (reporting_cycle_type and reporting_date) further down.
vt_param_reporting_cycle_type = "rolling cycle"
vt_param_reporting_date = "2024-06-30"

In [0]:
def _read_delta(*path_parts):
    """Load the Delta table stored at the path formed by joining `path_parts`."""
    return spark.read.format("delta").load(os.path.join(*path_parts))


# Model-score tables, one per propensity model.
# Apple-specific device-replacement variant (disabled):
# df_mls_score_dr_apple = _read_delta(dir_mls_data_score, "mobile_oa_consumer_srvc_device_replacement_apple_pred30d")
df_mls_score_dr = _read_delta(dir_mls_data_score, "mobile_oa_consumer_srvc_device_replacement_pred30d")
df_mls_score_ifp = _read_delta(dir_mls_data_score, "mobile_oa_consumer_srvc_upsell_ifp_pred30d")
df_mls_score_churn = _read_delta(dir_mls_data_score, "mobile_oa_consumer_srvc_churn_pred30d")
df_mls_score_ar = _read_delta(dir_mls_data_score, "mobile_oa_consumer_srvc_writeoff_pred120d")

# IFP movement tables. The location is derived from dir_fs_data_target
# (<feature-store root>/d500_movement) instead of repeating the absolute
# mount path, so a change to the root is picked up here as well.
dir_mvmt_mobile_oa_consumer = os.path.join(dir_fs_data_target, "d501_mobile_oa_consumer")
df_ifp_srvc_mvnt = _read_delta(dir_mvmt_mobile_oa_consumer, "mvmt_ifp_upsell_on_service")
df_ifp_bill_mvnt = _read_delta(dir_mvmt_mobile_oa_consumer, "mvmt_ifp_upsell_on_bill")


In [0]:
# Uploaded CSV for the Samsung S23 experiment (the file name suggests it is
# the campaign send list -- TODO confirm). The first row holds column names.
path_samsung_send_csv = 'dbfs:/FileStore/mnt/ml-lab/dev_users/dev_sc/240703_RM_MOBPM_Samsung_S23_Experiment_SEND.csv'

df_samsung_clearance = (
    spark.read
    .format('csv')
    .options(header='true')
    .load(path_samsung_send_csv)
)


In [0]:
# Quick look at the first ten rows of the uploaded list.
df_samsung_preview = df_samsung_clearance.limit(10)
display(df_samsung_preview)

# Distinct Service_ID values next to the total row count; a gap between the
# two numbers means some Service_IDs occur more than once.
samsung_count_exprs = [
    f.countDistinct('Service_ID'),
    f.count('*'),
]
display(df_samsung_clearance.agg(*samsung_count_exprs))

In [0]:
# Predicates and aggregate shared by both checks: device IFPs with a
# movement date from 2024-07-03 to 2024-07-31, counted as distinct fs_ifp_id.
is_device_ifp = f.col('ifp_type') == 'device'
in_movement_window = f.col('movement_date').between('2024-07-03', '2024-07-31')
distinct_ifp_count = f.countDistinct('fs_ifp_id')

# On-bill movements: the model name is matched exactly.
df_bill_uptake = (
    df_ifp_bill_mvnt
    .filter(is_device_ifp)
    .filter(in_movement_window)
    .filter(f.col('ifp_model') == 'Samsung Galaxy S23 FE')
    .groupBy('ifp_type', 'ifp_model')
    .agg(distinct_ifp_count)
)
display(df_bill_uptake)  # Samsung Galaxy S23 FE

# On-service movements: case-insensitive regex -- any model name containing
# "samsung" followed later by "fe" is kept; the groupBy lists what matched.
df_srvc_uptake = (
    df_ifp_srvc_mvnt
    .filter(is_device_ifp)
    .filter(in_movement_window)
    .filter(f.col("ifp_model").rlike("(?i)samsung.*fe"))
    .groupBy('ifp_type', 'ifp_model')
    .agg(distinct_ifp_count)
)
display(df_srvc_uptake)  # Samsung Galaxy S23 FE

In [0]:
# SFMC email-performance rows for the Samsung S23 experiment campaign, read
# from Snowflake with the connector options defined at the top of the notebook.
# This read has to stay enabled: `df_sfmc` is consumed when `df_output` is
# built, and leaving it commented out makes a fresh-kernel run fail with
# NameError.
query_sfmc = """
           select * from PROD_MAR_TECH.SERVING.SFMC_EMAIL_PERFORMANCE
            where campaignname in ('240703-RM-MOBPM-Samsung-S23-Experiment')
        """

df_sfmc = (
    spark.read
    .format("snowflake")
    .options(**options)
    .option("query", query_sfmc)
    .load()
)

In [0]:
# Customers with a matching Samsung FE device IFP movement between
# 2024-07-03 and 2024-07-31, taken from the on-bill and on-service tables.
#
# Both sides of the union select the same single column: DataFrame.union
# matches columns by position and fails when the column counts differ.
# union() keeps duplicates, so distinct() is applied once after it to get
# exactly one row per fs_cust_id.
ifp_campaign_filter = (
    f.col('movement_date').between('2024-07-03', '2024-07-31')
    & (f.col('ifp_type') == 'device')
)

df_ifp_cust = (
    df_ifp_bill_mvnt
    .filter(ifp_campaign_filter)
    .filter(f.col('ifp_model') == 'Samsung Galaxy S23 FE')
    .select('fs_cust_id')
    .union(
        df_ifp_srvc_mvnt
        .filter(ifp_campaign_filter)
        # NOTE(review): this regex keeps every model containing "samsung"
        # followed by "fe" (case-insensitive), which is broader than the
        # exact 'Samsung Galaxy S23 FE' match used for the on-bill table --
        # confirm that other FE models should count as a redemption.
        .filter(f.col("ifp_model").rlike("(?i)samsung.*fe"))
        .select('fs_cust_id')
    )
    .distinct()
)

In [0]:
def _select_base_scores(df_score, prefix):
    """Return the score rows of one model for the configured reporting snapshot.

    Rows are filtered to `vt_param_reporting_date` and
    `vt_param_reporting_cycle_type`. The three propensity columns are renamed
    with `prefix` so that several models can be joined side by side.

    Args:
        df_score: model-score DataFrame with reporting_date,
            reporting_cycle_type, the fs_cust_id / fs_acct_id / fs_srvc_id
            keys and the propensity_* columns.
        prefix: column prefix for the output, e.g. "ifp" gives ifp_score,
            ifp_segment and ifp_top_ntile.

    Returns:
        DataFrame with fs_cust_id, fs_acct_id, fs_srvc_id, <prefix>_score,
        <prefix>_segment and <prefix>_top_ntile.

    No de-duplication is done: every row passing the filters is kept, so a
    customer can have several rows (presumably one per service -- TODO
    confirm). Joining the result on fs_cust_id alone can therefore multiply
    rows.
    """
    return (
        df_score
        .filter(f.col("reporting_date") == vt_param_reporting_date)
        .filter(f.col("reporting_cycle_type") == vt_param_reporting_cycle_type)
        .select(
            "fs_cust_id",
            "fs_acct_id",
            "fs_srvc_id",
            f.col("propensity_score").alias(f"{prefix}_score"),
            f.col("propensity_segment_qt").alias(f"{prefix}_segment"),
            f.col("propensity_top_ntile").alias(f"{prefix}_top_ntile"),
        )
    )


# The former `rank_acct` window column was never filtered on and was dropped
# by the select, so it is no longer computed; the resulting frames are the same.
df_base_score_ifp = _select_base_scores(df_mls_score_ifp, "ifp")
df_base_score_dr = _select_base_scores(df_mls_score_dr, "dr")
df_base_score_churn = _select_base_scores(df_mls_score_churn, "churn")
# The write-off model's columns are exposed under the "risk" prefix.
df_base_score_ar = _select_base_scores(df_mls_score_ar, "risk")

In [0]:
# Overall redeem rate: share of listed customers who also appear in
# df_ifp_cust.
#
# `df_samsung_send` was never defined in this notebook; the CSV whose name
# ends in "_SEND" is loaded as `df_samsung_clearance`, so that frame is used.
# NOTE(review): assumes the CSV has a CUSTOMER_ID column -- confirm.
df_redeemed_cust = df_ifp_cust.select('fs_cust_id').distinct()  # one row per customer, so the join cannot fan out

df_redeem_rate = (
    df_samsung_clearance.alias('a')
    .join(
        df_redeemed_cust.alias('b'),
        f.col('a.CUSTOMER_ID') == f.col('b.fs_cust_id'),
        'left',
    )
    # redeem = 1 when the left join found the customer in df_ifp_cust.
    .withColumn(
        'redeem',
        f.when(f.col('b.fs_cust_id').isNotNull(), 1)
        .otherwise(0)
    )
    .groupBy('redeem')
    .agg(
        f.countDistinct('CUSTOMER_ID').alias('cnt'),
        f.count('*'),
    )
    # Grand total over both redeem groups (empty partitionBy = whole frame).
    .withColumn(
        'sum', f.sum('cnt').over(Window.partitionBy())
    )
    .withColumn(
        'pct',
        f.col('cnt') / f.col('sum') * 100
    )
)

display(df_redeem_rate)


#72? 
# ~ 0.51% overall 

In [0]:
def _top_score_per_customer(df_score, ntile_col):
    """Keep a single score row per fs_cust_id.

    Rows are ranked within each customer by `ntile_col` descending
    (NOTE(review): assumes a larger ntile means higher propensity -- confirm).
    fs_acct_id and fs_srvc_id break ties so the chosen row is reproducible.
    """
    per_customer = (
        Window
        .partitionBy('fs_cust_id')
        .orderBy(f.desc(ntile_col), 'fs_acct_id', 'fs_srvc_id')
    )
    return (
        df_score
        .withColumn('_rank_cust', f.row_number().over(per_customer))
        .filter(f.col('_rank_cust') == 1)
        .drop('_rank_cust')
    )


# Redeem counts per (dr_segment, ifp_segment), measured per customer.
#
# Every join key is made unique on the right-hand side, and the campaign
# audience is reduced to distinct customers. Joining the raw score frames on
# fs_cust_id alone would pair every ifp row of a customer with every dr row
# of that customer and inflate both `cnt` and `redeem_sum`.
df_output = (
    df_sfmc.select('CUSTOMER_ID').distinct().alias('a')
    .join(
        df_ifp_cust.select('fs_cust_id').distinct().alias('b'),
        f.col('a.CUSTOMER_ID') == f.col('b.fs_cust_id'),
        'left',
    )
    # redeem = 1 when the customer appears in df_ifp_cust.
    .withColumn(
        'redeem',
        f.when(f.col('b.fs_cust_id').isNotNull(), 1)
        .otherwise(0)
    )
    .join(
        _top_score_per_customer(df_base_score_ifp, 'ifp_top_ntile').alias('c'),
        f.col('a.CUSTOMER_ID') == f.col('c.fs_cust_id'),
        'left',
    )
    .join(
        _top_score_per_customer(df_base_score_dr, 'dr_top_ntile').alias('d'),
        f.col('a.CUSTOMER_ID') == f.col('d.fs_cust_id'),
        'left',
    )
    .groupBy('dr_segment', 'ifp_segment')
    .agg(
        f.count('*').alias('cnt'),
        # Sanity check: equals `cnt` now that there is one row per customer.
        f.countDistinct('a.CUSTOMER_ID'),
        f.sum('redeem').alias('redeem_sum'),
    )
)

display(df_output)

#72? 
# ~ 0.51% overall 

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# 'H/M' when at least one of the two segments is known to differ from 'L'.
# Everything else -- both segments 'L', or a comparison left undecided by
# nulls -- falls through to 'L'.
is_high_or_mid = (f.col('ifp_segment') != 'L') | (f.col('dr_segment') != 'L')

df_propensity_combined = (
    df_output
    .withColumn(
        'proensity_combine',
        f.when(is_high_or_mid, f.lit('H/M')).otherwise('L'),
    )
    .groupBy('proensity_combine')
    .agg(
        f.sum('cnt').alias('sum_cnt'),
        f.sum('redeem_sum').alias('sum'),
    )
    # Grand total of sum_cnt across both groups (empty partitionBy).
    .withColumn('ttl_cnt', f.sum('sum_cnt').over(Window.partitionBy()))
    # Redeem rate within each group, as a percentage.
    .withColumn('pct', f.col('sum') / f.col('sum_cnt') * 100)
)

display(df_propensity_combined)