# Data Warehouse Truven QA - Pharmacy Claims

## Initialization

Load the packages used throughout this notebook and open the connection to the GP database.

In [2]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [3]:
# Open the database connection that every cell below reuses; the DSN string
# is supplied by the project helper get_dsn().
connection = psycopg2.connect(dsn=get_dsn())
# Commit each statement as soon as it is executed instead of holding an
# explicit transaction open.
connection.autocommit = True

## Row Counts and Claim Counts

In [12]:
# (Re)create the QA table that holds, per calendar year and source table,
# the DW vs. source row / claim / member counts and their differences.
# Dropping first makes the cell safe to re-run.
query = ''' drop table if exists qa_reporting.dw_truv_pharmacy_claims_counts;
create table qa_reporting.dw_truv_pharmacy_claims_counts
(
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cur:
    cur.execute(query)

In [17]:
# Populate the per-source-table rows of the QA counts table from the DW
# pharmacy claims table: one row per (year, table_id_src).
#
# The three distinct-count columns are filled by UPDATE statements that differ
# only in the QA column being set and the DW column being counted, so they are
# generated from a single template rather than copy-pasted.
distinct_count_columns = (
    # (QA table column to set, DW column whose distinct values are counted)
    ('dw_src_clm_id_count', 'rx_claim_id_src'),
    ('dw_uth_mbr_id_count', 'uth_member_id'),
    ('dw_src_mbr_id_count', 'member_id_src'),
)

with connection.cursor() as cursor:
    # Seed each row with the DW row count and the distinct UTH claim id count.
    query = '''
    insert into qa_reporting.dw_truv_pharmacy_claims_counts
    (calendar_year, table_src, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, table_id_src, count(*), count(distinct uth_rx_claim_id), current_date
    from dw_staging.truv_pharmacy_claims a
    group by 1,2
    '''
    cursor.execute(query)

    for qa_column, dw_column in distinct_count_columns:
        # The identifiers come from the hard-coded tuple above, never from
        # external input, so plain string formatting is safe here.
        query = f'''
        update qa_reporting.dw_truv_pharmacy_claims_counts b
        set {qa_column} = a.distinct_count
        from (
            select year, table_id_src, count(distinct {dw_column}) as distinct_count
            from dw_staging.truv_pharmacy_claims
            group by 1,2) a
        where a.year = b.calendar_year
        and b.table_src = a.table_id_src
        '''
        cursor.execute(query)

In [18]:
# Fill the source-side counts for the per-table rows from the
# qa_reporting.truven_counts table (restricted to the ccaed and mdcrd entries)
# and derive the DW-minus-source differences and absolute percentage
# differences.
#
# nullif(..., 0) turns a zero source count into NULL so the percentage becomes
# NULL for that row instead of the whole UPDATE failing on division by zero.
query = '''update qa_reporting.dw_truv_pharmacy_claims_counts a
    set src_row_count = b.row_count,
    row_count_diff = a.dw_row_count - b.row_count,
    row_count_diff_percentage = 100. * abs(a.dw_row_count - b.row_count) / nullif(b.row_count, 0),
    src_clm_count = b.clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.clm_count) / nullif(b.clm_count, 0),
    src_mbr_count = b.pat_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.pat_count,
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.pat_count) / nullif(b.pat_count, 0)
    from (select * from qa_reporting.truven_counts where table_name in('ccaed', 'mdcrd')) b
    where a.calendar_year = b.year
    and a.table_src = b.table_name
    ;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [19]:
# Add one 'ALL' row per year to the QA counts table, aggregating the DW
# pharmacy claims across every source table.
#
# As with the per-table rows, the distinct-count columns are filled by UPDATE
# statements that differ only in two identifiers, so they are generated from a
# single template rather than copy-pasted.
distinct_count_columns = (
    # (QA table column to set, DW column whose distinct values are counted)
    ('dw_src_clm_id_count', 'rx_claim_id_src'),
    ('dw_uth_mbr_id_count', 'uth_member_id'),
    ('dw_src_mbr_id_count', 'member_id_src'),
)

with connection.cursor() as cursor:
    # Seed each 'ALL' row with the DW row count and distinct UTH claim id count.
    query = '''
    insert into qa_reporting.dw_truv_pharmacy_claims_counts
    (calendar_year, table_src, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, 'ALL', count(*), count(distinct uth_rx_claim_id), current_date
    from dw_staging.truv_pharmacy_claims a
    group by 1
    '''
    cursor.execute(query)

    for qa_column, dw_column in distinct_count_columns:
        # The identifiers come from the hard-coded tuple above, never from
        # external input, so plain string formatting is safe here.
        query = f'''
        update qa_reporting.dw_truv_pharmacy_claims_counts b
        set {qa_column} = a.distinct_count
        from (
            select year, 'ALL' as table_id_src, count(distinct {dw_column}) as distinct_count
            from dw_staging.truv_pharmacy_claims
            group by 1) a
        where a.year = b.calendar_year
        and b.table_src = a.table_id_src
        '''
        cursor.execute(query)

In [20]:
# Fill the source-side counts for the 'ALL' rows by counting rows, distinct
# members (enrolid) and distinct claims (rx_id_src) per year over the union of
# the two staging_clean ETL tables, then derive the differences and absolute
# percentage differences against the DW counts.
with connection.cursor() as cursor:
    # Note that this query will run for a long time
    #
    # nullif(..., 0) guards the percentage divisions: count(distinct ...) is 0
    # for a year whose values are all NULL, and an unguarded division would
    # abort the whole statement.
    query = '''
    with truven_claims as (
            select year, enrolid, rx_id_src
            from staging_clean.ccaed_etl
            union all
            select year, enrolid, rx_id_src
            from staging_clean.mdcrd_etl
        ),
        truven_claims_counts as (
            select year as calendar_year, count(*) as src_row_count,
                    count(distinct enrolid) src_mbr_count,
                    count(distinct rx_id_src) src_clm_count
            from truven_claims
            group by 1
        )
    update qa_reporting.dw_truv_pharmacy_claims_counts a
    set src_row_count = b.src_row_count,
    row_count_diff = a.dw_row_count - b.src_row_count,
    row_count_diff_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / nullif(b.src_row_count, 0),
    src_clm_count = b.src_clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / nullif(b.src_clm_count, 0),
    src_mbr_count = b.src_mbr_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / nullif(b.src_mbr_count, 0)
    from truven_claims_counts b
    where a.calendar_year = b.calendar_year
    and a.table_src = 'ALL'
    ;
    '''

    cursor.execute(query)

In [4]:
# Load the finished QA counts table and display it ordered (ascending) by the
# row, claim and member difference percentages.
counts_df = pd.read_sql('select * from qa_reporting.dw_truv_pharmacy_claims_counts', con=connection)
percentage_columns = ['row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']
counts_df.sort_values(percentage_columns)



Unnamed: 0,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
16,2012,mdcrd,90958557,90968863,-10306,0.011329,90305042,90305042,90305643,-601,0.000666,3153160,3153160,3153272,-112,0.003552,2023-05-16
9,2014,mdcrd,71149754,71160123,-10369,0.014571,70640973,70640973,70641795,-822,0.001164,2454436,2454436,2454536,-100,0.004074,2023-05-16
3,2020,mdcrd,43890802,43901901,-11099,0.025281,43155515,43155515,43155669,-154,0.000357,1563445,1563445,1563493,-48,0.00307,2023-05-16
18,2011,mdcrd,98910456,98938042,-27586,0.027882,98365540,98365540,98371093,-5553,0.005645,3398403,3398403,3399703,-1300,0.038239,2023-05-16
34,2015,mdcrd,57078293,57094378,-16085,0.028173,56680005,56680005,56681262,-1257,0.002218,1883295,1883295,1883600,-305,0.016192,2023-05-16
12,2019,mdcrd,43555351,43567923,-12572,0.028856,42628479,42628479,42628776,-297,0.000697,1504987,1504987,1505037,-50,0.003322,2023-05-16
28,2013,mdcrd,81874944,81898656,-23712,0.028953,81584012,81584012,81589584,-5572,0.006829,2881311,2881311,2881673,-362,0.012562,2023-05-16
7,2021,mdcrd,34983626,34994521,-10895,0.031133,34030793,34030793,34030942,-149,0.000438,1269339,1269339,1269364,-25,0.001969,2023-05-16
0,2018,mdcrd,30639800,30650133,-10333,0.033713,29537028,29537028,29537128,-100,0.000339,1018553,1018553,1018570,-17,0.001669,2023-05-16
31,2016,mdcrd,56382355,56401796,-19441,0.034469,55715314,55715314,55716457,-1143,0.002051,1852291,1852291,1852430,-139,0.007504,2023-05-16


## NDC

In [5]:
# Rebuild qa_reporting.dw_truv_ndc_count: the number of DW pharmacy claim rows
# for every (year, ndc) pair. The table is dropped first so the cell can be
# re-run.
query = '''drop table if exists qa_reporting.dw_truv_ndc_count;
select year, ndc, count(*)
into qa_reporting.dw_truv_ndc_count
from dw_staging.truv_pharmacy_claims
group by 1,2;'''

with connection.cursor() as cur:
    cur.execute(query)

In [6]:
# Pull the per-(year, ndc) claim counts into pandas for the checks below.
ndc_count_sql = 'select * from qa_reporting.dw_truv_ndc_count;'
ndc_df = pd.read_sql(ndc_count_sql, con=connection)
ndc_df



Unnamed: 0,year,ndc,count
0,2017,55150012824,33
1,2018,00406996001,4494
2,2019,00527141910,2
3,2017,65162073403,16
4,2015,00074327156,137
...,...,...,...
584992,2016,08222082589,68
584993,2022,58160088741,125
584994,2017,41167043205,130
584995,2021,65862046999,36133


In [7]:
# Total number of claim rows per year, summed over every NDC.
counts_grouped_by_year = ndc_df.groupby('year')['count']
counts_grouped_by_year.sum()

year
2011    464249919
2012    453069472
2013    365414548
2014    373860736
2015    287501042
2016    292035770
2017    254534680
2018    240656938
2019    239457307
2020    221808948
2021    214767787
2022     97866074
Name: count, dtype: int64

In [8]:
# Anti-join against the Redbook reference table: keep only the (year, ndc)
# rows whose NDC has no matching redbook.ndcnum.
query = '''
select a.*
from qa_reporting.dw_truv_ndc_count a
left join reference_tables.redbook b
on a.ndc = b.ndcnum
where b.ndcnum is null;
'''

missing_ndc_df = pd.read_sql(sql=query, con=connection)
missing_ndc_df



Unnamed: 0,year,ndc,count
0,2018,11822351930,3
1,2015,00085314201,1
2,2014,43353065670,2
3,2017,20221026061,2
4,2012,11822506000,16
...,...,...,...
98830,2013,66424052601,17
98831,2014,60435188710,1
98832,2014,00038004210,2
98833,2022,70377003811,14


In [9]:
# Claim rows per year whose NDC was not found in the Redbook reference table.
missing_grouped_by_year = missing_ndc_df.groupby('year')['count']
missing_grouped_by_year.sum()

year
2011     1582931
2012     1436879
2013     1110112
2014     1323451
2015      879724
2016      997994
2017      932100
2018      841266
2019     2194335
2020    10787949
2021    33023341
2022    16348768
Name: count, dtype: int64

In [10]:
# Per-year NDC validity summary: total claim rows, rows whose NDC is absent
# from the Redbook reference table ("invalid"), the remainder ("valid"), and
# the invalid-to-valid ratio as a percentage.
overall_by_year = ndc_df.groupby('year')['count'].sum()

# A year with no unmatched NDCs does not appear in missing_ndc_df at all.
# Treat it as 0 invalid rows so invalid_count, valid_count and the percentage
# all stay numeric; previously only valid_count was patched and the other two
# columns were left as NaN for such a year.
invalid_by_year = (
    missing_ndc_df.groupby('year')['count'].sum()
    .reindex(overall_by_year.index, fill_value=0)
)

# Both inputs are fully populated, so the subtraction needs no NaN fix-up and
# no astype(int) cast (whose integer width depends on the platform).
ndc_comp_df = pd.DataFrame({'overall_count': overall_by_year,
                            'invalid_count': invalid_by_year,
                            'valid_count': overall_by_year - invalid_by_year})
ndc_comp_df['invalid_to_valid_percent'] = 100. * ndc_comp_df['invalid_count'] / ndc_comp_df['valid_count']
ndc_comp_df

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,464249919,1582931,462666988,0.342132
2012,453069472,1436879,451632593,0.318152
2013,365414548,1110112,364304436,0.304721
2014,373860736,1323451,372537285,0.355253
2015,287501042,879724,286621318,0.306929
2016,292035770,997994,291037776,0.342909
2017,254534680,932100,253602580,0.367544
2018,240656938,841266,239815672,0.350797
2019,239457307,2194335,237262972,0.924854
2020,221808948,10787949,211020999,5.112263


## Days Supply

In [11]:
# (Re)create the QA table comparing days-supply statistics (min, median, max,
# average) per year between the DW and the source, with one diff column per
# statistic.
query = '''drop table if exists qa_reporting.dw_truv_rx_days_supply;
create table qa_reporting.dw_truv_rx_days_supply
(
    year int,
    dw_min_days_supply numeric,
    dw_median_days_supply numeric,
    dw_max_days_supply numeric,
    dw_avg_days_supply numeric,
    src_min_days_supply numeric,
    src_median_days_supply numeric,
    src_max_days_supply numeric,
    src_avg_days_supply numeric,
    min_days_supply_diff numeric,
    median_days_supply_diff numeric,
    max_days_supply_diff numeric,
    avg_days_supply_diff numeric
);
'''

with connection.cursor() as cur:
    cur.execute(query)

In [13]:
# DW-side days-supply statistics per year. The first statement creates the
# yearly rows (with the minimum); the remaining ones fill median, max and
# average on those rows. Order matters: the INSERT must run first.
dw_days_supply_statements = (
    '''
    insert into qa_reporting.dw_truv_rx_days_supply
    (year, dw_min_days_supply)
    select year, min(days_supply)
    from dw_staging.truv_pharmacy_claims
    group by 1
    ''',
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set dw_median_days_supply = n
    from (select year, median(days_supply) n
    from dw_staging.truv_pharmacy_claims
    group by 1) b
    where a.year = b.year;
    ''',
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set dw_max_days_supply = n
    from (select year, max(days_supply) n
    from dw_staging.truv_pharmacy_claims
    group by 1) b
    where a.year = b.year;
    ''',
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set dw_avg_days_supply = n
    from (select year, avg(days_supply) n
    from dw_staging.truv_pharmacy_claims
    group by 1) b
    where a.year = b.year;
    ''',
)

with connection.cursor() as cursor:
    for query in dw_days_supply_statements:
        cursor.execute(query)

In [14]:
# Build the temporary staging table staging_clean.truven_rx_daysupp holding
# year, enrolid, a derived rx id and daysupp from both Truven source tables:
# created from truven.ccaed, then appended with truven.mdcrd, then
# vacuum-analyzed.
staging_statements = (
    '''drop table if exists staging_clean.truven_rx_daysupp;

create table staging_clean.truven_rx_daysupp
with (
appendonly=true, 
orientation=row, 
compresstype=zlib, 
compresslevel=5 
)
as 
select year, enrolid, (enrolid::text || ndcnum::text || svcdate::text) as rx_id_derv, daysupp
from truven.ccaed
distributed by (rx_id_derv);
    ''',
    '''insert into staging_clean.truven_rx_daysupp
select year, enrolid, (enrolid::text || ndcnum::text || svcdate::text) as rx_id_derv, daysupp
from truven.mdcrd;
    ''',
    '''vacuum analyze staging_clean.truven_rx_daysupp;''',
)

with connection.cursor() as cursor:
    for query in staging_statements:
        cursor.execute(query)

In [15]:
# Source-side days-supply statistics per year, computed from the temporary
# staging table and written onto the yearly rows of the QA table: min, median,
# max and average, one UPDATE each.
src_days_supply_statements = (
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set src_min_days_supply = n
    from (select year, min(daysupp) n
    from staging_clean.truven_rx_daysupp
    group by 1) b
    where a.year = b.year;
    ''',
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set src_median_days_supply = n
    from (select year,  median(daysupp) n
    from staging_clean.truven_rx_daysupp
    group by 1) b
    where a.year = b.year;
    ''',
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set src_max_days_supply = n
    from (select year,  max(daysupp) n
    from staging_clean.truven_rx_daysupp
    group by 1) b
    where a.year = b.year;
    ''',
    '''
    update qa_reporting.dw_truv_rx_days_supply a
    set src_avg_days_supply = n
    from (select year,  avg(daysupp) n
    from staging_clean.truven_rx_daysupp
    group by 1) b
    where a.year = b.year;
    ''',
)

with connection.cursor() as cursor:
    for query in src_days_supply_statements:
        cursor.execute(query)

In [16]:
# Derive the four diff columns (DW value minus source value) for every row of
# the days-supply QA table.
query = '''
    update qa_reporting.dw_truv_rx_days_supply
    set min_days_supply_diff = dw_min_days_supply - src_min_days_supply,
    median_days_supply_diff = dw_median_days_supply - src_median_days_supply,
    max_days_supply_diff = dw_max_days_supply - src_max_days_supply,
    avg_days_supply_diff = dw_avg_days_supply - src_avg_days_supply
    ;
    '''

with connection.cursor() as cur:
    cur.execute(query)

In [17]:
# Load the completed days-supply comparison table for inspection.
days_supply_sql = 'select * from qa_reporting.dw_truv_rx_days_supply;'
df = pd.read_sql(days_supply_sql, con=connection)
df



Unnamed: 0,year,dw_min_days_supply,dw_median_days_supply,dw_max_days_supply,dw_avg_days_supply,src_min_days_supply,src_median_days_supply,src_max_days_supply,src_avg_days_supply,min_days_supply_diff,median_days_supply_diff,max_days_supply_diff,avg_days_supply_diff
0,2018,-180.0,30.0,999.0,36.952436,-180.0,30.0,999.0,36.950307,0.0,0.0,0.0,0.002129
1,2020,-180.0,30.0,999.0,41.458489,-180.0,30.0,999.0,41.457819,0.0,0.0,0.0,0.00067
2,2021,-999.0,30.0,999.0,39.330058,-999.0,30.0,999.0,39.330975,0.0,0.0,0.0,-0.000917
3,2014,-180.0,30.0,999.0,36.295446,-180.0,30.0,999.0,36.293658,0.0,0.0,0.0,0.001788
4,2019,-365.0,30.0,999.0,38.992063,-365.0,30.0,999.0,38.990848,0.0,0.0,0.0,0.001216
5,2012,-999.0,30.0,999.0,35.883689,-999.0,30.0,999.0,35.88159,0.0,0.0,0.0,0.002099
6,2011,-909.0,30.0,999.0,35.836124,-909.0,30.0,999.0,35.832775,0.0,0.0,0.0,0.003349
7,2022,-999.0,30.0,999.0,40.572392,-999.0,30.0,999.0,40.57292,0.0,0.0,0.0,-0.000528
8,2017,-365.0,30.0,999.0,37.879156,-365.0,30.0,999.0,37.874764,0.0,0.0,0.0,0.004392
9,2013,-180.0,30.0,999.0,36.996464,-180.0,30.0,999.0,36.99416,0.0,0.0,0.0,0.002304


In [18]:
# Narrow the view to the year plus the four DW-minus-source diff columns.
diff_columns = ['year', 'min_days_supply_diff', 'median_days_supply_diff', 'max_days_supply_diff', 'avg_days_supply_diff']
df[diff_columns]

Unnamed: 0,year,min_days_supply_diff,median_days_supply_diff,max_days_supply_diff,avg_days_supply_diff
0,2018,0.0,0.0,0.0,0.002129
1,2020,0.0,0.0,0.0,0.00067
2,2021,0.0,0.0,0.0,-0.000917
3,2014,0.0,0.0,0.0,0.001788
4,2019,0.0,0.0,0.0,0.001216
5,2012,0.0,0.0,0.0,0.002099
6,2011,0.0,0.0,0.0,0.003349
7,2022,0.0,0.0,0.0,-0.000528
8,2017,0.0,0.0,0.0,0.004392
9,2013,0.0,0.0,0.0,0.002304


In [19]:
# Drop the temporary source days-supply staging table now that the comparison
# has been computed. The cursor is opened in a context manager, like every
# other cell, so it is closed once the statement has run instead of being left
# open.
with connection.cursor() as cursor:
    cursor.execute('drop table if exists staging_clean.truven_rx_daysupp;')

## Dispense as Written

In [20]:
# (Re)create the QA table comparing, per year / source table /
# dispensed-as-written code, the DW row count against the source row count.
#
# The count columns are bigint: they are filled from count(*), which yields
# bigint, and the sibling table qa_reporting.dw_truv_pharmacy_claims_counts
# already stores its counts as bigint. An int column would make the insert
# fail if a count ever exceeded the 32-bit range.
query = '''drop table if exists qa_reporting.dw_truv_rx_daw_counts;
create table qa_reporting.dw_truv_rx_daw_counts
(
    year int,
    table_src text,
    dispensed_as_written text,
    dw_count bigint,
    src_count bigint,
    count_diff bigint,
    count_diff_percent numeric
)
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [22]:
# DW-side dispensed-as-written counts: one row per
# (year, source table, dispensed_as_written) value.
with connection.cursor() as cur:
    query = '''
insert into qa_reporting.dw_truv_rx_daw_counts
(year, table_src, dispensed_as_written, dw_count)
select year, table_id_src, dispensed_as_written, count(*)
from dw_staging.truv_pharmacy_claims
group by 1,2,3;
'''
    cur.execute(query)

In [23]:
# DW-side dispensed-as-written counts across all source tables: one 'ALL' row
# per (year, dispensed_as_written) value.
with connection.cursor() as cur:
    query = '''
insert into qa_reporting.dw_truv_rx_daw_counts
(year, table_src, dispensed_as_written, dw_count)
select year, 'ALL', dispensed_as_written, count(*)
from dw_staging.truv_pharmacy_claims
group by 1,3;
'''
    cur.execute(query)

In [24]:
# Source-side dispensed-as-written counts. The SQL stacks the mdcrd and ccaed
# ETL tables (mapping a NULL dawind to '00'), counts rows per
# (year, table, dawind) plus an 'ALL' roll-up per (year, dawind), and writes
# the source count, the DW-minus-source difference and the absolute percentage
# difference onto the matching QA rows.
with connection.cursor() as cur:
    query = '''
with rx_clm as (
    select year, 'mdcrd' as table_src, case when dawind is null then '00' else dawind end as dawind
    from staging_clean.mdcrd_etl
    union all
    select year, 'ccaed', case when dawind is null then '00' else dawind end as dawind
    from staging_clean.ccaed_etl
),
rx_daw_count as (
    select year, table_src, dawind, count(*)
    from rx_clm
    group by 1,2,3
    union
    select year, 'ALL', dawind, count(*)
    from rx_clm
    group by 1,3
)
update qa_reporting.dw_truv_rx_daw_counts a
set src_count = b.count,
count_diff = dw_count - b.count,
count_diff_percent = 100. * abs(dw_count - b.count) / b.count
from rx_daw_count b
where a.year = b.year
and a.dispensed_as_written = b.dawind
and a.table_src = b.table_src;
'''
    cur.execute(query)

In [25]:
# Load the dispensed-as-written comparison and order it (ascending) by the
# percentage difference.
daw_counts = pd.read_sql('select * from qa_reporting.dw_truv_rx_daw_counts', con=connection)
df = daw_counts.sort_values('count_diff_percent')
# Optional filter to show only rows that have a source count:
#   df[~df['src_count'].isna()]
df



Unnamed: 0,year,table_src,dispensed_as_written,dw_count,src_count,count_diff,count_diff_percent
114,2014,mdcrd,10,1405,1405,0,0.000000
244,2022,mdcrd,00,485932,485932,0,0.000000
351,2016,mdcrd,08,5439,5439,0,0.000000
239,2022,mdcrd,08,936,936,0,0.000000
236,2022,mdcrd,05,865,865,0,0.000000
...,...,...,...,...,...,...,...
274,2017,ccaed,07,17913,17941,-28,0.156067
141,2019,ccaed,00,3277915,3283596,-5681,0.173012
43,2020,ALL,00,4564673,4578681,-14008,0.305940
46,2020,ccaed,00,3718223,3731301,-13078,0.350494
