In [0]:
from pyspark.sql import functions as f

In [0]:
# Pull the Snowflake service-account credential from the Databricks secret scope.
# NOTE(review): the variable is named `password`, but it is handed to the
# connector as `pem_private_key` below -- confirm the secret is a PEM key.
password = dbutils.secrets.get(
    scope="auea-kv-sbx-dxdtlprdct01",
    key="sfdbrsdskey",
)

# Connector settings reused by every Snowflake read in this notebook.
options = {
    "sfUrl": "vodafonenz_prod.australia-east.azure.snowflakecomputing.com",
    "sfUser": "SVC_LAB_DS_DATABRICKS",
    # Turn any literal backslash-n sequences in the secret into real line breaks.
    "pem_private_key": password.replace("\\n", "\n"),
    "sfDatabase": "prod_pdb_masked",
    "sfSchema": "modelled",
    "sfWarehouse": "LAB_DS_WH",
}

In [0]:
# Feature-store unit base, read from the Delta path for the
# "rolling cycle" reporting_cycle_type directory.
df_fea_unit_base = (
    spark
    .read
    .format('delta')
    .load('/mnt/feature-store-prod-lab/d400_feature/d401_mobile_oa_consumer/fea_unit_base/reporting_cycle_type=rolling cycle')
)

In [0]:
# Broadband base table, pulled in full from Snowflake via the shared `options`.
df_bb_base = (
    spark.read
    .format("snowflake")
    .options(**options)
    .option("query", """
        select * from lab_ml_store.sandbox.sc_consumer_bb_martech_base
        """)
    .load()
)

In [0]:
# Broadband connection orders, pulled in full from Snowflake via the shared `options`.
df_bb_order_activation = (
    spark.read
    .format("snowflake")
    .options(**options)
    .option("query", """
        select * from LAB_ML_STORE.SANDBOX.SC_CONSUMER_BB_MARTECH_ALL_CONNECTIONS_ORDERS
        """)
    .load()
)

In [0]:
# Per cal_date: row count plus distinct services, customers and billing accounts.
display(
    df_bb_base
    .groupBy('cal_date')
    .agg(
        f.count('*'),
        *[
            f.countDistinct(key_col)
            for key_col in ('service_id', 'customer_id', 'billing_account_number')
        ]
    )
)

In [0]:
# Feature-store rows (reporting_date >= 2024-05-01) that have NO matching
# broadband row for the same customer on the same reporting date, where the
# broadband reporting date is cal_date + 1 day. Counted per reporting_date.
display(
    df_fea_unit_base
    .where(f.col('reporting_date') >= '2024-05-01')
    .join(
        df_bb_base
        .withColumn('bb_reporting_date', f.date_add(f.col('cal_date'), 1))
        .where(f.col('bb_reporting_date') >= '2024-05-01'),
        on=(
            (f.col('fs_cust_id') == f.col('customer_id'))
            & (f.col('reporting_date') == f.col('bb_reporting_date'))
        ),
        how='anti'
    )
    .groupBy('reporting_date')
    .agg(
        f.count('*'),
        *[
            f.countDistinct(id_col)
            for id_col in ('fs_srvc_id', 'fs_cust_id', 'fs_acct_id')
        ]
    )
)


In [0]:
# Same per-reporting_date counts on the full feature-store base (no broadband
# exclusion), for comparison against the anti-joined counts.
display(
    df_fea_unit_base
    .where(f.col('reporting_date') >= '2024-05-01')
    .groupBy('reporting_date')
    .agg(
        f.count('*'),
        *[
            f.countDistinct(id_col)
            for id_col in ('fs_srvc_id', 'fs_cust_id', 'fs_acct_id')
        ]
    )
)

In [0]:
# Feature-store rows from 2024-05-01 onward whose customer has no broadband-base
# row on the matching reporting date (broadband cal_date shifted forward one day).
df_non_converged_cust = (
    df_fea_unit_base
    .where(f.col('reporting_date') >= '2024-05-01')
    .join(
        df_bb_base
        .withColumn('bb_reporting_date', f.date_add(f.col('cal_date'), 1))
        .where(f.col('bb_reporting_date') >= '2024-05-01'),
        on=(
            (f.col('fs_cust_id') == f.col('customer_id'))
            & (f.col('reporting_date') == f.col('bb_reporting_date'))
        ),
        how='anti'
    )
)

In [0]:
# Persist the non-converged base to the dev scratch area as a Delta table,
# replacing both data and schema on every run.
# The output is written unpartitioned (partitioning by reporting_date is disabled).
(
    df_non_converged_cust
    .write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "True")
    .save('/mnt/feature-store-dev/dev_users/dev_dw/d999_tmp/xsell_bb_base')
)

In [0]:
# Per connect_order_date (from 2024-05-01): distinct services, row count,
# and distinct customers in the broadband order data.
display(
    df_bb_order_activation
    .where(f.col('connect_order_date') >= '2024-05-01')
    .groupBy('connect_order_date')
    .agg(
        f.countDistinct('service_id'),
        f.count('*'),
        f.countDistinct('customer_id')
    )
)

In [0]:
# Earliest connect_order_date per customer across all broadband orders.
df_activation_agg = (
    df_bb_order_activation
    .groupBy('customer_id')
    .agg(
        f.min('connect_order_date').alias('earliest_activation_date')
    )
)

# One row per distinct (customer, reporting_date) from the non-converged base,
# kept only when the customer's earliest activation is strictly after that
# reporting date.
df_non_converged_activation = (
    df_non_converged_cust
    .select(f.col('fs_cust_id').alias('customer_id'), 'reporting_date')
    .distinct()
    .alias('cust')
    .join(df_activation_agg.alias('act'), on='customer_id', how='inner')
    .where(f.col('act.earliest_activation_date') > f.col('cust.reporting_date'))
    .select('customer_id', 'reporting_date', 'earliest_activation_date')
    .sort('reporting_date', 'customer_id')
)

display(df_non_converged_activation)

In [0]:
# Per reporting_date: how many customers in df_non_converged_activation there
# are, and how many of them activated within 30 days of the reporting date.
# Note: df_non_converged_activation only holds customers whose earliest
# activation is after the reporting date, so `total_customers` counts those
# customers only, not the whole non-converged base.
df_activation_summary = (
    df_non_converged_activation
    .groupBy('reporting_date')
    .agg(
        f.countDistinct('customer_id').alias('total_customers'),
        f.sum(
            # 1 when the gap (in days) from reporting_date to activation is <= 30, else 0.
            f.when(
                f.datediff(
                    f.col('earliest_activation_date'),
                    f.col('reporting_date')
                ) <= 30,
                1
            )
            .otherwise(0)
            .cast('integer')
        ).alias('activated_within_1_month')
    )
)

In [0]:
display(df_activation_summary)

# Author's note: roughly 1.5K activations out of ~423K customers ~= 0.35%.
# Open question: is this the organic activation rate per month? (TODO confirm)

In [0]:
from pyspark.sql import functions as f

# Step 1: Build one 7-day window per distinct (customer, reporting_date) in the
# non-converged base: week_end is the reporting date, week_start is 6 days earlier.
df_non_converged_cust_week = (
    df_non_converged_cust
    .select(
        f.date_add(f.col("reporting_date"), -6).alias("week_start"),
        f.col("reporting_date").alias("week_end"),
        f.col("fs_cust_id").alias('customer_id')
    )
    .distinct()
)

# Step 2: Match each customer's earliest activation date to the windows that
# contain it (bounds inclusive) and count matches per window.
# Reuses df_activation_agg (earliest connect_order_date per customer, defined
# in an earlier cell) instead of recomputing the same aggregation here.
df_weekly_activations = (
    df_activation_agg
    .alias('act')
    .join(
        df_non_converged_cust_week.alias('cust_week'),
        on=(
            f.col('act.earliest_activation_date').between(
                f.col('cust_week.week_start'),
                f.col('cust_week.week_end')
            )
            & (f.col('act.customer_id') == f.col('cust_week.customer_id'))
        ),
        how='inner'
    )
    .groupBy('cust_week.week_start', 'cust_week.week_end')
    .agg(f.count('*').alias('activation_count'))
    .orderBy('cust_week.week_end')
)

# Display results
display(df_weekly_activations)
