# Data Warehouse Medicare Texas QA - Pharmacy Claims

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Counts and Claim Counts

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrt_pharmacy_claims_counts;
create table qa_reporting.dw_mcrt_pharmacy_claims_counts
(
    data_source bpchar(4),
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrt_pharmacy_claims_counts
    (calendar_year, table_src, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, 'ALL', count(*), count(distinct uth_rx_claim_id), current_date
    from dw_staging.mcrt_pharmacy_claims a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_pharmacy_claims_counts b
    set dw_src_clm_id_count = count
    from (
        select year, 'ALL' table_id_src, count(distinct rx_claim_id_src) as count 
        from dw_staging.mcrt_pharmacy_claims
    group by 1) a
    where a.year = b.calendar_year
    and b.table_src = a.table_id_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_pharmacy_claims_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src, count(distinct uth_member_id) as count 
        from dw_staging.mcrt_pharmacy_claims
    group by 1) a
    where a.year = b.calendar_year
    and b.table_src = a.table_id_src
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mcrt_pharmacy_claims_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src, count(distinct member_id_src) as count 
        from dw_staging.mcrt_pharmacy_claims
    group by 1) a
    where a.year = b.calendar_year
    and b.table_src = a.table_id_src
    '''

    cursor.execute(query)

In [6]:
with connection.cursor() as cursor:
    # Note that this query will run for a long time
    query = '''
    with medicare_texas_claims as (
        select distinct extract(year from srvc_dt::date) as year, bene_id, pde_id
        from medicare_texas.pde_file
    ),
    medicare_texas_claims_counts as (
        select year as calendar_year, count(*) as src_row_count, 
                count(distinct bene_id) src_mbr_count,
                count(distinct pde_id) src_clm_count
        from medicare_texas_claims
        group by 1
    )
    update qa_reporting.dw_mcrt_pharmacy_claims_counts a
    set src_row_count = b.src_row_count,
    row_count_diff = a.dw_row_count - b.src_row_count,
    row_count_diff_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
    src_clm_count = b.src_clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
    src_mbr_count = b.src_mbr_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from medicare_texas_claims_counts b
    where a.calendar_year = b.calendar_year
    and a.table_src = 'ALL'
    ;
    '''

    cursor.execute(query)

In [7]:
pd.read_sql('select * from qa_reporting.dw_mcrt_pharmacy_claims_counts', con=connection).sort_values(['row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage'])



Unnamed: 0,data_source,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
0,,2014,ALL,89607685,89607685,0,0.0,89607685,89607685,89607685,0,0.0,2399008,2399008,2399008,0,0.0,2023-10-09
1,,2015,ALL,91774444,91774444,0,0.0,91774444,91774444,91774444,0,0.0,2531494,2531494,2531494,0,0.0,2023-10-09
2,,2016,ALL,94403709,94403709,0,0.0,94403709,94403709,94403709,0,0.0,2649268,2649268,2649268,0,0.0,2023-10-09
3,,2018,ALL,96513051,96513051,0,0.0,96513051,96513051,96513051,0,0.0,2895102,2895102,2895102,0,0.0,2023-10-09
4,,2019,ALL,98167357,98167357,0,0.0,98167357,98167357,98167357,0,0.0,3022696,3022696,3022696,0,0.0,2023-10-09
5,,2017,ALL,94259220,94259220,0,0.0,94259220,94259220,94259220,0,0.0,2739083,2739083,2739083,0,0.0,2023-10-09


## NDC

In [8]:
query = '''drop table if exists qa_reporting.dw_mcrt_ndc_count;
select data_source, year, ndc, count(*)
into qa_reporting.dw_mcrt_ndc_count
from dw_staging.mcrt_pharmacy_claims
group by 1,2,3;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [9]:
ndc_df = pd.read_sql('select * from qa_reporting.dw_mcrt_ndc_count;', con=connection)
ndc_df



Unnamed: 0,data_source,year,ndc,count
0,mcrt,2015,00378170477,27
1,mcrt,2019,00168000615,4287
2,mcrt,2016,00904630261,36
3,mcrt,2016,67253001106,752
4,mcrt,2019,00603421432,26
...,...,...,...,...
159815,mcrt,2018,87701040520,1
159816,mcrt,2018,00143144505,345
159817,mcrt,2016,00904649120,6
159818,mcrt,2019,16714081601,1400


In [10]:
ndc_df.groupby(['data_source', 'year'])['count'].sum()

data_source  year
mcrt         2014    89607685
             2015    91774444
             2016    94403709
             2017    94259220
             2018    96513051
             2019    98167357
Name: count, dtype: int64

In [11]:
query = '''
select a.*
from qa_reporting.dw_mcrt_ndc_count a
left join reference_tables.redbook b
on a.ndc = b.ndcnum
where b.ndcnum is null;
'''

missing_ndc_df = pd.read_sql(query, con=connection)
missing_ndc_df



Unnamed: 0,data_source,year,ndc,count
0,mcrt,2016,00363006510,1
1,mcrt,2018,11917011540,2
2,mcrt,2019,00363089688,1
3,mcrt,2017,50428133177,6
4,mcrt,2016,49348073010,2
...,...,...,...,...
3871,mcrt,2016,57515009565,32
3872,mcrt,2014,57515009565,14
3873,mcrt,2017,57515009565,18
3874,mcrt,2018,50428642751,4


In [12]:
missing_ndc_df.groupby('year')['count'].sum()

year
2014     20830
2015     15783
2016     17590
2017     22428
2018     55305
2019    648857
Name: count, dtype: int64

In [13]:
ndc_comp_df = pd.DataFrame({'overall_count': ndc_df.groupby('year')['count'].sum(),
                            'invalid_count': missing_ndc_df.groupby('year')['count'].sum(),
                            'valid_count': ndc_df.groupby('year')['count'].sum() - missing_ndc_df.groupby('year')['count'].sum()})
ndc_comp_df.loc[ndc_comp_df['valid_count'].isna(),'valid_count'] = ndc_comp_df.loc[ndc_comp_df['valid_count'].isna(),'overall_count']
ndc_comp_df['valid_count'] =  ndc_comp_df['valid_count'].astype(int)
ndc_comp_df['invalid_to_valid_percent'] = 100. * ndc_comp_df['invalid_count'] / ndc_comp_df['valid_count']
ndc_comp_df

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,89607685,20830,89586855,0.023251
2015,91774444,15783,91758661,0.017201
2016,94403709,17590,94386119,0.018636
2017,94259220,22428,94236792,0.0238
2018,96513051,55305,96457746,0.057336
2019,98167357,648857,97518500,0.665368


## Days Supply

In [14]:
query = '''drop table if exists qa_reporting.dw_mcrt_rx_days_supply;
create table qa_reporting.dw_mcrt_rx_days_supply
(
    data_source bpchar(4),
    year int,
    dw_min_days_supply numeric,
    dw_median_days_supply numeric,
    dw_max_days_supply numeric,
    dw_avg_days_supply numeric,
    src_min_days_supply numeric,
    src_median_days_supply numeric,
    src_max_days_supply numeric,
    src_avg_days_supply numeric,
    min_days_supply_diff numeric,
    median_days_supply_diff numeric,
    max_days_supply_diff numeric,
    avg_days_supply_diff numeric
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [15]:
with connection.cursor() as cursor:
    query = '''
    insert into qa_reporting.dw_mcrt_rx_days_supply
    (data_source, year, dw_min_days_supply)
    select data_source, year, min(days_supply)
    from dw_staging.mcrt_pharmacy_claims
    group by 1,2
    '''

    cursor.execute(query)


    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set dw_median_days_supply = n
    from (
        select data_source, year, median(days_supply) n
        from dw_staging.mcrt_pharmacy_claims
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set dw_max_days_supply = n
    from (
        select data_source, year, max(days_supply) n
        from dw_staging.mcrt_pharmacy_claims
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set dw_avg_days_supply = n
    from (
        select data_source, year, avg(days_supply) n
        from dw_staging.mcrt_pharmacy_claims
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

In [16]:
with connection.cursor() as cursor:
    query ='''drop table if exists dev.ip_medicare_texas_rx_daysupp;

create table dev.ip_medicare_texas_rx_daysupp
with (
appendonly=true, 
orientation=row, 
compresstype=zlib, 
compresslevel=5 
)
as 
select distinct 'mcrt' as data_source, extract(year from srvc_dt::date) as year, bene_id, pde_id, days_suply_num
from medicare_texas.pde_file
distributed by (pde_id);
    '''

    cursor.execute(query)

In [19]:
with connection.cursor() as cursor:
    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set src_min_days_supply = n
    from (
        select data_source, year, min(days_suply_num::numeric) n
        from dev.ip_medicare_texas_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)


    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set src_median_days_supply = n
    from (
        select data_source, year, median(days_suply_num::numeric) n
        from dev.ip_medicare_texas_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set src_max_days_supply = n
    from (
        select data_source, year, max(days_suply_num::numeric) n
        from dev.ip_medicare_texas_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply a
    set src_avg_days_supply = n
    from (
        select data_source, year, avg(days_suply_num::numeric) n
        from dev.ip_medicare_texas_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

In [20]:
with connection.cursor() as cursor:
    query = '''
    update qa_reporting.dw_mcrt_rx_days_supply
    set min_days_supply_diff = dw_min_days_supply - src_min_days_supply,
    median_days_supply_diff = dw_median_days_supply - src_median_days_supply,
    max_days_supply_diff = dw_max_days_supply - src_max_days_supply,
    avg_days_supply_diff = dw_avg_days_supply - src_avg_days_supply
    ;
    '''

    cursor.execute(query)

In [21]:
df = pd.read_sql('select * from qa_reporting.dw_mcrt_rx_days_supply;', con=connection)
df



Unnamed: 0,data_source,year,dw_min_days_supply,dw_median_days_supply,dw_max_days_supply,dw_avg_days_supply,src_min_days_supply,src_median_days_supply,src_max_days_supply,src_avg_days_supply,min_days_supply_diff,median_days_supply_diff,max_days_supply_diff,avg_days_supply_diff
0,mcrt,2019,0.0,30.0,999.0,48.999474,0.0,30.0,999.0,48.999474,0.0,0.0,0.0,0.0
1,mcrt,2017,0.0,30.0,999.0,45.354235,0.0,30.0,999.0,45.354235,0.0,0.0,0.0,0.0
2,mcrt,2018,0.0,30.0,999.0,47.170712,0.0,30.0,999.0,47.170712,0.0,0.0,0.0,0.0
3,mcrt,2015,0.0,30.0,999.0,41.828682,0.0,30.0,999.0,41.828682,0.0,0.0,0.0,0.0
4,mcrt,2014,0.0,30.0,999.0,40.605512,0.0,30.0,999.0,40.605512,0.0,0.0,0.0,0.0
5,mcrt,2016,0.0,30.0,999.0,43.304388,0.0,30.0,999.0,43.304388,0.0,0.0,0.0,0.0


In [22]:
df[['year', 'min_days_supply_diff', 'median_days_supply_diff', 'max_days_supply_diff', 'avg_days_supply_diff']]

Unnamed: 0,year,min_days_supply_diff,median_days_supply_diff,max_days_supply_diff,avg_days_supply_diff
0,2019,0.0,0.0,0.0,0.0
1,2017,0.0,0.0,0.0,0.0
2,2018,0.0,0.0,0.0,0.0
3,2015,0.0,0.0,0.0,0.0
4,2014,0.0,0.0,0.0,0.0
5,2016,0.0,0.0,0.0,0.0


In [23]:
connection.cursor().execute('drop table if exists dev.ip_medicare_texas_rx_daysupp;')

## Dispense as Written

In [24]:
query = '''drop table if exists qa_reporting.dw_mcrt_rx_daw_counts;
create table qa_reporting.dw_mcrt_rx_daw_counts
(
    data_source bpchar(4),
    year int,
    table_src text,
    dispensed_as_written text,
    dw_count int,
    src_count int,
    count_diff int,
    count_diff_percent numeric
)
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [25]:
query = '''
insert into qa_reporting.dw_mcrt_rx_daw_counts
(data_source, year, table_src, dispensed_as_written, dw_count)
select data_source, year, table_id_src, dispensed_as_written, count(*)
from dw_staging.mcrt_pharmacy_claims
group by 1,2,3,4;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [27]:
query = '''
with rx_daw_count as (
    select extract(year from srvc_dt::date) as year, 'pde_file' as table_src, case when daw_prod_slctn_cd is null then '00' else daw_prod_slctn_cd end as dawind, count(*)
    from medicare_texas.pde_file
    group by 1,2,3
)
update qa_reporting.dw_mcrt_rx_daw_counts a
set src_count = b.count,
count_diff = dw_count - b.count,
count_diff_percent = 100. * abs(dw_count - b.count) / b.count
from rx_daw_count b
where a.year = b.year
and a.dispensed_as_written = b.dawind
and a.table_src = b.table_src;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [28]:
df = pd.read_sql('select * from qa_reporting.dw_mcrt_rx_daw_counts', con=connection).sort_values('count_diff_percent')
df#[~df['src_count'].isna()]



Unnamed: 0,data_source,year,table_src,dispensed_as_written,dw_count,src_count,count_diff,count_diff_percent
0,mcrt,2015,pde_file,4,13534,13534,0,0.0
32,mcrt,2017,pde_file,6,25872,25872,0,0.0
33,mcrt,2018,pde_file,1,644797,644797,0,0.0
34,mcrt,2016,pde_file,7,12417,12417,0,0.0
35,mcrt,2018,pde_file,7,4137,4137,0,0.0
36,mcrt,2014,pde_file,7,5374,5374,0,0.0
37,mcrt,2014,pde_file,0,88140059,88140059,0,0.0
38,mcrt,2014,pde_file,9,11411,11411,0,0.0
39,mcrt,2015,pde_file,7,9998,9998,0,0.0
40,mcrt,2014,pde_file,5,187703,187703,0,0.0
