### Environment setup

In [0]:
import pyspark.sql.functions as f 
import pyspark
import os

In [0]:
# Inspect the contents of the shared dev feature-store mount.
dbutils.fs.ls("/mnt/feature-store-dev/dev_shared/")

### utility functions


In [0]:
%run "../../utility_functions/misc"

In [0]:
%run "../../utility_functions/qa_utils"

In [0]:
%run "../../utility_functions/spkdf_utils"

In [0]:
def create_bucket_analysis(df, field, num_buckets=10):
    """Summarise ``field`` per reporting date across equal-width value buckets.

    The observed [min, max] range of ``field`` is split into ``num_buckets``
    equal-width, half-open intervals ``[lower, upper)``. Each interval is
    labelled by its position within the range, e.g. ``'0%-10%'`` for the lowest
    tenth when ``num_buckets`` is 10.

    Bucket assignment:
        * rows where ``field`` is null are labelled ``'null'``;
        * rows inside an interval get that interval's ``'<lo>%-<hi>%'`` label;
        * everything else -- in practice rows equal to the maximum, because
          the intervals exclude their upper edge -- is labelled ``'100%+'``.

    Args:
        df: Spark DataFrame that contains ``field`` plus the ``reporting_date``
            and ``fs_srvc_id`` columns used for grouping and counting.
        field: Name of the numeric column to bucket.
        num_buckets: Number of equal-width buckets; must be at least 1.

    Returns:
        Spark DataFrame with one row per (``reporting_date``, ``bucket``) and
        the aggregates ``sum``, ``mean``, ``total_count`` and
        ``distinct_fs_srvc_id``.

    Raises:
        ValueError: If ``num_buckets`` is below 1, or if ``field`` has no
            non-null values (so no range can be derived).
    """
    if num_buckets < 1:
        raise ValueError(f"num_buckets must be >= 1, got {num_buckets}")

    # Single aggregation pass to find the value range that the buckets divide.
    bounds = df.agg(f.min(field).alias('min'), f.max(field).alias('max')).collect()[0]
    if bounds['min'] is None or bounds['max'] is None:
        raise ValueError(f"Column '{field}' has no non-null values to bucket")

    # Cast to float so Decimal columns work too (Decimal * float raises TypeError).
    min_val, max_val = float(bounds['min']), float(bounds['max'])
    span = max_val - min_val

    # num_buckets + 1 edges; percentage labels scale with num_buckets rather than
    # assuming 10% steps (':g' keeps whole numbers free of a trailing '.0').
    edges = [min_val + span * (i / num_buckets) for i in range(num_buckets + 1)]
    pct = [f"{100 * i / num_buckets:g}" for i in range(num_buckets + 1)]

    # Build the CASE expression with the Column API instead of interpolating the
    # column name and boundaries into a SQL string.
    value = f.col(field)
    bucket_expr = f.when(value.isNull(), f.lit('null'))
    for i in range(num_buckets):
        bucket_expr = bucket_expr.when(
            (value >= edges[i]) & (value < edges[i + 1]),
            f"{pct[i]}%-{pct[i + 1]}%",
        )
    bucket_expr = bucket_expr.otherwise(f"{pct[-1]}%+")

    # Aggregate the field within every (reporting_date, bucket) group.
    df_result = (
        df
        .withColumn('bucket', bucket_expr)
        .groupBy('reporting_date', 'bucket')
        .agg(
            f.sum(field).alias('sum'),
            f.mean(field).alias('mean'),
            f.count('*').alias('total_count'),
            f.countDistinct('fs_srvc_id').alias('distinct_fs_srvc_id')
        )
    )

    return df_result

### directories

In [0]:
# Root of the dev workspace; each data-layer directory is derived from it.
dir_danny_path = "dbfs:/mnt/feature-store-dev/dev_users/dev_dw/24q4_fs_fundation/"

# feature / movement / staging sub-directories, in that order.
dir_data_parent_shared, dir_data_parent_mvmt, dir_data_parent_stag = (
    os.path.join(dir_danny_path, sub_dir)
    for sub_dir in (
        "d400_feature/d401_mobile_oa_consumer",
        "d500_movement/d501_mobile_oa_consumer",
        "d200_staging/d299_src",
    )
)

In [0]:
# Dev Delta tables read from the feature directory.
df_fea_coll_action = (
    spark.read.format("delta")
    .load(os.path.join(dir_data_parent_shared, "fea_coll_action_cycle_12mnth"))
)
df_fea_product_acq = (
    spark.read.format("delta")
    .load(os.path.join(dir_data_parent_shared, "fea_product_acquisition_cycle_billing_12"))
)
df_fea_late_pay = (
    spark.read.format("delta")
    .load(os.path.join(dir_data_parent_shared, "fea_late_pay_cycle_billing_6"))
)

# Dev Delta table read from the movement directory.
df_mvmt_aod = (
    spark.read.format("delta")
    .load(os.path.join(dir_data_parent_mvmt, "mvmt_aod30d"))
)


In [0]:
# Delta table that the cells below compare against the dev feature table.
df_qa_prod_acq = (
    spark.read.format("delta")
    .load("/mnt/ml-lab/dev_users/dev_sc/99_misc/cohort_seg/fea_product_acq_hist")
)

In [0]:
# Preview up to 100 rows of the dev product-acquisition table.
display(
    df_fea_product_acq
    .limit(100)
)


#### converged flag

In [0]:
# Dev: distinct services and row count per reporting date and converged flag.
display(
    df_fea_product_acq
    .groupBy('reporting_date', 'converged_flag')
    .agg(f.countDistinct('fs_srvc_id'), f.count('*'))
)

Databricks visualization. Run in Databricks to view.

In [0]:
# QA: distinct services and row count per reporting date and product holding.
display(
    df_qa_prod_acq
    .groupBy('reporting_date', 'product_holding')
    .agg(f.countDistinct('fs_srvc_id'), f.count('*'))
)

Databricks visualization. Run in Databricks to view.

In [0]:
# QA rows with product_holding 'OA+BB' on 2025-01-12 that have no dev row with
# converged_flag 'Y' for the same customer / service / account / date.
display(
    df_qa_prod_acq
    .where(
        (f.col('reporting_date') == '2025-01-12')
        & (f.col('product_holding') == 'OA+BB')
    )
    .join(
        df_fea_product_acq.where(
            (f.col('reporting_date') == '2025-01-12')
            & (f.col('converged_flag') == 'Y')
        ),
        on=['fs_cust_id', 'fs_srvc_id', 'fs_acct_id', 'reporting_date'],
        how='anti',
    )
)

In [0]:
# QA rows for account 425310233.
display(
    df_qa_prod_acq.where(f.col('fs_acct_id') == '425310233')
)

In [0]:
# Dev rows for account 424364543.
display(
    df_fea_product_acq.where(f.col('fs_acct_id') == '424364543')
)

In [0]:
# Dev rows for account 426483655.
display(
    df_fea_product_acq.where(f.col('fs_acct_id') == '426483655')
)

In [0]:
# QA rows for account 424364543.
display(
    df_qa_prod_acq.where(f.col('fs_acct_id') == '424364543')
)

#### bb_add_in_12mp

In [0]:
# QA: distinct services and row count per reporting date and bb_add_in_12mp.
display(
    df_qa_prod_acq
    .groupBy('reporting_date', 'bb_add_in_12mp')
    .agg(f.countDistinct('fs_srvc_id'), f.count('*'))
)

# Dev: the same breakdown on bb_add_in_cnt_52week.
print('dev ')
display(
    df_fea_product_acq
    .groupBy('reporting_date', 'bb_add_in_cnt_52week')
    .agg(f.countDistinct('fs_srvc_id'), f.count('*'))
)

In [0]:
# QA: distinct services per broadband service-start month (yyyyMM).
display(
    df_qa_prod_acq
    .groupBy(f.date_format('bb_SERVICE_START_DATE_TIME', 'yyyyMM'))
    .agg(f.countDistinct('fs_srvc_id'))
)

# Dev: the same count on bb_service_start_dttm.
display(
    df_fea_product_acq
    .groupBy(f.date_format('bb_service_start_dttm', 'yyyyMM'))
    .agg(f.countDistinct('fs_srvc_id'))
)



In [0]:
# QA rows whose broadband start month is 202410 and that have no dev row with
# the same account / customer / service / start month.
display(
    df_qa_prod_acq
    .withColumn(
        'service_start_month',
        f.date_format('bb_SERVICE_START_DATE_TIME', 'yyyyMM'),
    )
    .where(f.col('service_start_month') == '202410')
    .join(
        df_fea_product_acq.withColumn(
            'service_start_month',
            f.date_format('bb_service_start_dttm', 'yyyyMM'),
        ),
        on=['fs_acct_id', 'fs_cust_id', 'fs_srvc_id', 'service_start_month'],
        how='anti',
    )
)

In [0]:
# Dev rows for account 393115964.
display(
    df_fea_product_acq.where(f.col('fs_acct_id') == '393115964')
)

In [0]:
# QA: key columns and broadband start timestamp for account 486275396,
# newest reporting date first.
display(
    df_qa_prod_acq
    .where(f.col('fs_acct_id') == '486275396')
    .select(
        'reporting_date',
        'reporting_cycle_type',
        'fs_cust_id',
        'fs_srvc_id',
        'fs_acct_id',
        'bb_SERVICE_START_DATE_TIME',
    )
    .orderBy(f.col('reporting_date').desc())
    # optional narrowing: .filter(f.col('fs_cust_id') == '1-XSZ3HQJ')
)

In [0]:
# Dev: key columns and broadband start timestamp for account 486275396,
# newest reporting date first.
display(
    df_fea_product_acq
    .where(f.col('fs_acct_id') == '486275396')
    .select(
        'reporting_date',
        'reporting_cycle_type',
        'fs_cust_id',
        'fs_srvc_id',
        'fs_acct_id',
        'bb_service_start_dttm',
    )
    .orderBy(f.col('reporting_date').desc())
    # optional narrowing: .filter(f.col('fs_cust_id') == '1-XSZ3HQJ')
)

#### cnt_product_add_in_12mp

In [0]:
# QA: distinct services and row count per reporting date and cnt_product_add_in_12mp.
display(
    df_qa_prod_acq
    .groupBy('reporting_date', 'cnt_product_add_in_12mp')
    .agg(f.countDistinct('fs_srvc_id'), f.count('*'))
)

# Dev: the same breakdown on product_add_in_cnt_52week.
print('dev')
display(
    df_fea_product_acq
    .groupBy('reporting_date', 'product_add_in_cnt_52week')
    .agg(f.countDistinct('fs_srvc_id'), f.count('*'))
)

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# QA rows on 2025-01-12 with cnt_product_add_in_12mp == 2 that have no dev row
# (same date, product_add_in_cnt_52week == 2) for the same account / customer / service.
display(
    df_qa_prod_acq
    .where(
        (f.col('reporting_date') == '2025-01-12')
        & (f.col('cnt_product_add_in_12mp') == 2)
    )
    .join(
        df_fea_product_acq.where(
            (f.col('reporting_date') == '2025-01-12')
            & (f.col('product_add_in_cnt_52week') == 2)
        ),
        on=['fs_acct_id', 'fs_cust_id', 'fs_srvc_id'],
        how='anti',
    )
)

In [0]:
# Reverse direction: dev rows on 2025-01-12 with product_add_in_cnt_52week == 2
# that have no QA row (same date, cnt_product_add_in_12mp == 2) for the same
# account / customer / service.
display(
    df_fea_product_acq
    .where(
        (f.col('reporting_date') == '2025-01-12')
        & (f.col('product_add_in_cnt_52week') == 2)
    )
    .join(
        df_qa_prod_acq.where(
            (f.col('reporting_date') == '2025-01-12')
            & (f.col('cnt_product_add_in_12mp') == 2)
        ),
        on=['fs_acct_id', 'fs_cust_id', 'fs_srvc_id'],
        how='anti',
    )
)

In [0]:
# Dev: converged flag history for account 505563593, newest reporting date first.
display(
    df_fea_product_acq
    .where(f.col('fs_acct_id') == '505563593')
    .select(
        'reporting_date',
        'fs_cust_id',
        'fs_acct_id',
        'fs_srvc_id',
        'converged_flag',
    )
    .orderBy(f.col('reporting_date').desc())
)

In [0]:
# QA: all columns for account 505563593, newest reporting date first.
display(
    df_qa_prod_acq
    .where(f.col('fs_acct_id') == '505563593')
    .orderBy(f.col('reporting_date').desc())
)

In [0]:
# Dev: converged flag and key columns for account 453357237.
display(
    df_fea_product_acq
    .select(
        'converged_flag',
        'reporting_date',
        'fs_acct_id',
        'fs_cust_id',
        'fs_srvc_id',
    )
    .where(f.col('fs_acct_id') == '453357237')
)

In [0]:
# Columns passed to create_bucket_analysis for the QA frame.
ls_test_fields_qa = [
    'cnt_product_add_in_12mp',
    'ifp_add_in_12mp',
]

# Columns passed to create_bucket_analysis for the dev frame.
# NOTE(review): earlier cells read 'product_add_in_cnt_52week' from the same dev
# frame; confirm the '_12bmnth' columns exist there.
ls_test_fields_dev = [
    'product_add_in_cnt_12bmnth',
    'ifp_add_in_cnt_12bmnth',
]

In [0]:
# Bucketed summary of every dev test field.
for field in ls_test_fields_dev:
    print("Analysis for field: {}".format(field))
    df_result = create_bucket_analysis(
        df_fea_product_acq,
        field,
    )
    display(df_result)

In [0]:
# Bucketed summary of every QA test field.
for field in ls_test_fields_qa:
    print("Analysis for field: {}".format(field))
    df_result = create_bucket_analysis(
        df_qa_prod_acq,
        field,
    )
    display(df_result)