### environment setup

In [0]:
import pyspark.sql.functions as f 
import pyspark
import os

### utility functions

In [0]:
%run "../../utility_functions/misc"

In [0]:
%run "../../utility_functions/qa_utils"

In [0]:
%run "../../utility_functions/spkdf_utils"

### directories

In [0]:
# Root folder of the developer workspace; every data layer used below hangs off it.
dir_danny_path = "dbfs:/mnt/feature-store-dev/dev_users/dev_dw/24q4_fs_fundation/"

# Parent folders for the feature (d400), movement (d500) and staging (d200) layers.
dir_data_parent_shared, dir_data_parent_mvmt, dir_data_parent_stag = (
    os.path.join(dir_danny_path, layer_sub_dir)
    for layer_sub_dir in (
        "d400_feature/d401_mobile_oa_consumer",
        "d500_movement/d501_mobile_oa_consumer",
        "d200_staging/d299_src",
    )
)

In [0]:
# Feature-layer Delta tables.
df_fea_coll_action = (
    spark.read
    .format('delta')
    .load(os.path.join(dir_data_parent_shared, 'fea_coll_action_cycle_12mnth'))
)
df_fea_product_acq = (
    spark.read
    .format('delta')
    .load(os.path.join(dir_data_parent_shared, 'fea_product_acquisition_cycle_billing_12'))
)
df_fea_late_pay = (
    spark.read
    .format('delta')
    .load(os.path.join(dir_data_parent_shared, "fea_late_pay_cycle_billing_6"))
)

# Movement-layer Delta table.
df_mvmt_aod = (
    spark.read
    .format('delta')
    .load(os.path.join(dir_data_parent_mvmt, 'mvmt_aod30d'))
)

# Staging-layer Delta table.
df_stg_coll_action = (
    spark.read
    .format('delta')
    .load(os.path.join(dir_data_parent_stag, "stg_brm_coll_action_hist"))
)

In [0]:
# Reference Delta tables that the new late-pay features are compared against.
df_qa_consec_late = (
    spark.read
    .format('delta')
    .load('/mnt/ml-lab/dev_users/dev_sc/99_misc/cohort_seg/fea_payment_behavior_v2')
)
df_qa_late_pay_6mp = (
    spark.read
    .format('delta')
    .load('/mnt/ml-lab/dev_users/dev_sc/99_misc/cohort_seg/fea_late_pay_6mp')
)


### QA

#### late pay

In [0]:
# Eyeball a 10-row sample of the late-pay feature table.
display(
    df_fea_late_pay
    .limit(10)
)

In [0]:
# Distinct services vs. total rows for each reporting date; the two numbers
# can be compared to spot duplicated services within a reporting date.
display(
    df_fea_late_pay
    .groupBy('reporting_date')
    .agg(
        f.countDistinct('fs_srvc_id'),
        f.count('*'),
    )
)

In [0]:
# Numeric columns of the late-pay feature table to profile.
ls_test_field_numeric = [
    'late_group_cnt_tot_6bmnth_p1',
    'consecutive_late_pay_cnt_tot_6bmnth_p1',
    'late_pay_cnt_tot_6bmnth_p1',
    'overdue_days_avg_6bmnth_p1',
    'late_group_cnt_tot_6bmnth',
    'consecutive_late_pay_cnt_tot_6bmnth',
    'late_pay_cnt_tot_6bmnth',
    'overdue_days_avg_6bmnth',
]

# Numeric columns of the QA reference table to profile.
ls_qa_feild_numric = [
    'cnt_late_pay_6bcycle_6mp',
    'avg_overdue_days_6bcycle_6mp',
]

In [0]:
# Eyeball a 10-row sample of the QA late-pay reference table.
display(
    df_qa_late_pay_6mp
    .limit(10)
)


In [0]:
# Per-reporting-date distribution summary for every numeric late-pay feature.
# One result table is printed and displayed per field.
for field in ls_test_field_numeric:
    df_result = (
        df_fea_late_pay
        .groupBy('reporting_date')
        .agg(
            f.sum(field).alias('sum'),
            f.mean(field).alias('mean'),
            # Third argument (100) is the accuracy of the approximation.
            f.percentile_approx(field, 0.25, 100).alias('25pct'),
            # The quantile must match the '75pct' alias (was 0.79).
            f.percentile_approx(field, 0.75, 100).alias('75pct'),
            f.percentile_approx(field, 0.95, 100).alias('95pct'),
            f.percentile_approx(field, 0.99, 100).alias('99pct'),
            f.median(field).alias('median'),
            f.stddev(field).alias('stddev'),
            f.min(field).alias('min'),
            f.max(field).alias('max'),
            # Distinct services vs. total rows per reporting date.
            f.countDistinct('fs_srvc_id'),
            f.count('*'),
        )
    )
    print(field)
    display(df_result)

In [0]:
def create_bucket_analysis(df, field, num_buckets=10):
    """Summarise ``field`` per reporting date within equal-width value buckets.

    The observed [min, max] range of ``field`` over the whole of ``df`` is
    split into ``num_buckets`` equal-width intervals. Every row is tagged with
    the interval its value falls into and the rows are then aggregated per
    ``reporting_date`` and bucket.

    Bucket labels such as ``'0%-10%'`` describe the position of the interval
    within the value range, not a row-count percentile. All intervals are
    half-open ``[lower, upper)`` except the top one, which also includes the
    maximum value. Rows whose ``field`` is NULL are labelled ``'null'``.

    Args:
        df: Spark DataFrame containing ``field``, ``reporting_date`` and
            ``fs_srvc_id`` columns.
        field: Name of the numeric column to bucket.
        num_buckets: Number of equal-width buckets; must be at least 1.

    Returns:
        Spark DataFrame with one row per (``reporting_date``, ``bucket``) and
        the aggregate columns ``sum``, ``mean``, ``total_count`` and
        ``distinct_fs_srvc_id``.

    Raises:
        ValueError: If ``num_buckets`` is smaller than 1.
    """
    if num_buckets < 1:
        raise ValueError(f"num_buckets must be at least 1, got {num_buckets}")

    # One pass over the data to learn the value range that the buckets divide.
    min_max = df.agg(f.min(field).alias('min'), f.max(field).alias('max')).collect()[0]
    min_val, max_val = min_max['min'], min_max['max']

    # NULLs satisfy no range comparison, so give them an explicit label
    # instead of letting them fall through to the catch-all.
    when_clauses = [f"WHEN {field} IS NULL THEN 'null'"]

    # min/max are None when df is empty or the column is entirely NULL; there
    # is then no range to split and only the NULL clause applies.
    if min_val is not None and max_val is not None:
        # Normalise to float: decimal columns come back as decimal.Decimal,
        # which cannot be combined with the float bucket fraction below.
        lower, upper = float(min_val), float(max_val)
        edges = [lower + (upper - lower) * (i / num_buckets) for i in range(num_buckets + 1)]
        # Pin the top edge to the observed max so float rounding can never
        # leave it short of the largest value.
        edges[-1] = upper

        for i in range(num_buckets):
            # Only the top bucket is closed on the right; otherwise rows equal
            # to the maximum would match no bucket at all.
            upper_op = '<=' if i == num_buckets - 1 else '<'
            # Derive the label from num_buckets so it stays correct when the
            # caller asks for something other than 10 buckets.
            label = f"{100 * i / num_buckets:g}%-{100 * (i + 1) / num_buckets:g}%"
            when_clauses.append(
                f"WHEN {field} >= {edges[i]} AND {field} {upper_op} {edges[i + 1]} THEN '{label}'"
            )

    # Catch-all for any value that matches none of the clauses above.
    case_body = ' '.join(when_clauses)
    bucket_expr = f.expr(f"CASE {case_body} ELSE '100%+' END")

    df_result = (
        df
        .withColumn('bucket', bucket_expr)
        .groupBy('reporting_date', 'bucket')
        .agg(
            f.sum(field).alias('sum'),
            f.mean(field).alias('mean'),
            f.count('*').alias('total_count'),
            f.countDistinct('fs_srvc_id').alias('distinct_fs_srvc_id')
        )
    )

    return df_result

In [0]:
# Bucketed distribution of every numeric column in the late-pay feature table.
for field in ls_test_field_numeric:
    print(f"Analysis for field: {field}")
    df_result = create_bucket_analysis(df=df_fea_late_pay, field=field)
    display(df_result)

In [0]:
# Bucketed distribution of every numeric column in the QA late-pay reference table.
for field in ls_qa_feild_numric:
    print(f"Analysis for field: {field}")
    df_result = create_bucket_analysis(df=df_qa_late_pay_6mp, field=field)
    display(df_result)

In [0]:
# Eyeball a 10-row sample of the QA payment-behaviour reference table.
display(
    df_qa_consec_late
    .limit(10)
)

In [0]:
# Numeric columns of the QA payment-behaviour reference table to profile
# (replaces the field list used for the previous QA table).
ls_qa_feild_numric = [
    'late_group_cnt',
    'num_consecutive_late_pay',
    'cnt_late_pay_6bcycle',
    'avg_overdue_days_6bcycle',
]

In [0]:
# Bucketed distribution of every numeric column in the QA payment-behaviour reference table.
for field in ls_qa_feild_numric:
    print(f"Analysis for field: {field}")
    df_result = create_bucket_analysis(df=df_qa_consec_late, field=field)
    display(df_result)