# Data Warehouse Medicare National QA - Pharmacy Claims

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Counts and Claim Counts

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrn_pharmacy_claims_counts;
create table qa_reporting.dw_mcrn_pharmacy_claims_counts
(
    data_source bpchar(4),
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrn_pharmacy_claims_counts
    (calendar_year, table_src, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, 'ALL', count(*), count(distinct uth_rx_claim_id), current_date
    from dw_staging.mcrn_pharmacy_claims a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_pharmacy_claims_counts b
    set dw_src_clm_id_count = count
    from (
        select year, 'ALL' table_id_src, count(distinct rx_claim_id_src) as count 
        from dw_staging.mcrn_pharmacy_claims
    group by 1) a
    where a.year = b.calendar_year
    and b.table_src = a.table_id_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_pharmacy_claims_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src, count(distinct uth_member_id) as count 
        from dw_staging.mcrn_pharmacy_claims
    group by 1) a
    where a.year = b.calendar_year
    and b.table_src = a.table_id_src
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mcrn_pharmacy_claims_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src, count(distinct member_id_src) as count 
        from dw_staging.mcrn_pharmacy_claims
    group by 1) a
    where a.year = b.calendar_year
    and b.table_src = a.table_id_src
    '''

    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    # Note that this query will run for a long time
    query = '''
    with medicare_national_claims as (
        select distinct extract(year from srvc_dt::date) as year, bene_id, pde_id
        from medicare_national.pde_file
    ),
    medicare_national_claims_counts as (
        select year as calendar_year, count(*) as src_row_count, 
                count(distinct bene_id) src_mbr_count,
                count(distinct pde_id) src_clm_count
        from medicare_national_claims
        group by 1
    )
    update qa_reporting.dw_mcrn_pharmacy_claims_counts a
    set src_row_count = b.src_row_count,
    row_count_diff = a.dw_row_count - b.src_row_count,
    row_count_diff_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
    src_clm_count = b.src_clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
    src_mbr_count = b.src_mbr_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from medicare_national_claims_counts b
    where a.calendar_year = b.calendar_year
    and a.table_src = 'ALL'
    ;
    '''

    cursor.execute(query)

In [6]:
pd.read_sql('select * from qa_reporting.dw_mcrn_pharmacy_claims_counts', con=connection).sort_values(['row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage'])



Unnamed: 0,data_source,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
0,,2015,ALL,73164346,73164346,0,0.0,73164346,73164346,73164346,0,0.0,1971585,1971585,1971585,0,0.0,2023-12-18
1,,2014,ALL,71319334,71319334,0,0.0,71319334,71319334,71319334,0,0.0,1876572,1876572,1876572,0,0.0,2023-12-18
3,,2016,ALL,75013227,75013227,0,0.0,75013227,75013227,75013227,0,0.0,2058896,2058896,2058896,0,0.0,2023-12-18
5,,2017,ALL,76007531,76007531,0,0.0,76007531,76007531,76007531,0,0.0,2147624,2147624,2147624,0,0.0,2023-12-18
6,,2018,ALL,76427412,76427412,0,0.0,76427412,76427412,76427412,0,0.0,2229242,2229242,2229242,0,0.0,2023-12-18
4,,2020,ALL,76567267,76567267,0,0.0,76567254,76567267,76567267,-13,1.7e-05,2380663,2380664,2380664,-1,4.2e-05,2023-12-18
2,,2019,ALL,76731506,76731506,0,0.0,76731485,76731506,76731506,-21,2.7e-05,2314974,2314977,2314977,-3,0.00013,2023-12-18


In [7]:
pd.read_sql('select calendar_year, dw_row_count, src_row_count, row_count_diff, dw_uth_clm_id_count, dw_src_clm_id_count, clm_count_diff, dw_uth_mbr_id_count, dw_src_mbr_id_count, mbr_count_diff from qa_reporting.dw_mcrn_pharmacy_claims_counts', con=connection).sort_values(['calendar_year'])



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,dw_uth_clm_id_count,dw_src_clm_id_count,clm_count_diff,dw_uth_mbr_id_count,dw_src_mbr_id_count,mbr_count_diff
1,2014,71319334,71319334,0,71319334,71319334,0,1876572,1876572,0
0,2015,73164346,73164346,0,73164346,73164346,0,1971585,1971585,0
3,2016,75013227,75013227,0,75013227,75013227,0,2058896,2058896,0
5,2017,76007531,76007531,0,76007531,76007531,0,2147624,2147624,0
6,2018,76427412,76427412,0,76427412,76427412,0,2229242,2229242,0
2,2019,76731506,76731506,0,76731485,76731506,-21,2314974,2314977,-3
4,2020,76567267,76567267,0,76567254,76567267,-13,2380663,2380664,-1


## NDC

In [8]:
query = '''drop table if exists qa_reporting.dw_mcrn_ndc_count;
select data_source, year, ndc, count(*)
into qa_reporting.dw_mcrn_ndc_count
from dw_staging.mcrn_pharmacy_claims
group by 1,2,3;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [9]:
ndc_df = pd.read_sql('select * from qa_reporting.dw_mcrn_ndc_count;', con=connection)
ndc_df



Unnamed: 0,data_source,year,ndc,count
0,mcrn,2018,11917009135,1
1,mcrn,2014,51079093220,96
2,mcrn,2020,00115442201,433
3,mcrn,2019,00168041760,111
4,mcrn,2014,00185012805,4
...,...,...,...,...
217125,mcrn,2018,70000014804,3
217126,mcrn,2017,52817024130,1
217127,mcrn,2019,00406851501,22383
217128,mcrn,2020,00008400110,7


In [10]:
ndc_df.groupby(['data_source', 'year'])['count'].sum()

data_source  year
mcrn         2014    71319334
             2015    73164346
             2016    75013227
             2017    76007531
             2018    76427412
             2019    76731506
             2020    76567267
Name: count, dtype: int64

In [11]:
query = '''
select a.*
from qa_reporting.dw_mcrn_ndc_count a
left join reference_tables.redbook b
on a.ndc = b.ndcnum
where b.ndcnum is null;
'''

missing_ndc_df = pd.read_sql(query, con=connection)
missing_ndc_df



Unnamed: 0,data_source,year,ndc,count
0,mcrn,2020,69452011920,181
1,mcrn,2016,62011008601,6
2,mcrn,2015,62011008601,3
3,mcrn,2018,93764060106,1
4,mcrn,2016,79854001162,4
...,...,...,...,...
14876,mcrn,2020,59746071130,47
14877,mcrn,2020,29300026881,22
14878,mcrn,2020,50428057834,3
14879,mcrn,2020,64842072709,71


In [12]:
missing_ndc_df.groupby('year')['count'].sum()

year
2014      30978
2015      27242
2016      26672
2017      29947
2018      61785
2019     359883
2020    2124199
Name: count, dtype: int64

In [13]:
ndc_comp_df = pd.DataFrame({'overall_count': ndc_df.groupby('year')['count'].sum(),
                            'invalid_count': missing_ndc_df.groupby('year')['count'].sum(),
                            'valid_count': ndc_df.groupby('year')['count'].sum() - missing_ndc_df.groupby('year')['count'].sum()})
ndc_comp_df.loc[ndc_comp_df['valid_count'].isna(),'valid_count'] = ndc_comp_df.loc[ndc_comp_df['valid_count'].isna(),'overall_count']
ndc_comp_df['valid_count'] =  ndc_comp_df['valid_count'].astype(int)
ndc_comp_df['invalid_to_valid_percent'] = 100. * ndc_comp_df['invalid_count'] / ndc_comp_df['valid_count']
ndc_comp_df

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,71319334,30978,71288356,0.043455
2015,73164346,27242,73137104,0.037248
2016,75013227,26672,74986555,0.035569
2017,76007531,29947,75977584,0.039416
2018,76427412,61785,76365627,0.080907
2019,76731506,359883,76371623,0.471226
2020,76567267,2124199,74443068,2.853454


## Days Supply

In [14]:
query = '''drop table if exists qa_reporting.dw_mcrn_rx_days_supply;
create table qa_reporting.dw_mcrn_rx_days_supply
(
    data_source bpchar(4),
    year int,
    dw_min_days_supply numeric,
    dw_median_days_supply numeric,
    dw_max_days_supply numeric,
    dw_avg_days_supply numeric,
    src_min_days_supply numeric,
    src_median_days_supply numeric,
    src_max_days_supply numeric,
    src_avg_days_supply numeric,
    min_days_supply_diff numeric,
    median_days_supply_diff numeric,
    max_days_supply_diff numeric,
    avg_days_supply_diff numeric
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [15]:
with connection.cursor() as cursor:
    query = '''
    insert into qa_reporting.dw_mcrn_rx_days_supply
    (data_source, year, dw_min_days_supply)
    select data_source, year, min(days_supply)
    from dw_staging.mcrn_pharmacy_claims
    group by 1,2
    '''

    cursor.execute(query)


    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set dw_median_days_supply = n
    from (
        select data_source, year, median(days_supply) n
        from dw_staging.mcrn_pharmacy_claims
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set dw_max_days_supply = n
    from (
        select data_source, year, max(days_supply) n
        from dw_staging.mcrn_pharmacy_claims
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set dw_avg_days_supply = n
    from (
        select data_source, year, avg(days_supply) n
        from dw_staging.mcrn_pharmacy_claims
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

In [16]:
with connection.cursor() as cursor:
    query ='''drop table if exists dev.ip_medicare_national_rx_daysupp;

create table dev.ip_medicare_national_rx_daysupp
with (
appendonly=true, 
orientation=row, 
compresstype=zlib, 
compresslevel=5 
)
as 
select distinct 'mcrn' as data_source, extract(year from srvc_dt::date) as year, bene_id, pde_id, days_suply_num
from medicare_national.pde_file
distributed by (pde_id);
    '''

    cursor.execute(query)

In [17]:
with connection.cursor() as cursor:
    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set src_min_days_supply = n
    from (
        select data_source, year, min(days_suply_num::numeric) n
        from dev.ip_medicare_national_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)


    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set src_median_days_supply = n
    from (
        select data_source, year, median(days_suply_num::numeric) n
        from dev.ip_medicare_national_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set src_max_days_supply = n
    from (
        select data_source, year, max(days_suply_num::numeric) n
        from dev.ip_medicare_national_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply a
    set src_avg_days_supply = n
    from (
        select data_source, year, avg(days_suply_num::numeric) n
        from dev.ip_medicare_national_rx_daysupp
        group by 1,2
    ) b
    where a.year = b.year
    and a.data_source = b.data_source;
    '''

    cursor.execute(query)

In [18]:
with connection.cursor() as cursor:
    query = '''
    update qa_reporting.dw_mcrn_rx_days_supply
    set min_days_supply_diff = dw_min_days_supply - src_min_days_supply,
    median_days_supply_diff = dw_median_days_supply - src_median_days_supply,
    max_days_supply_diff = dw_max_days_supply - src_max_days_supply,
    avg_days_supply_diff = dw_avg_days_supply - src_avg_days_supply
    ;
    '''

    cursor.execute(query)

In [19]:
df = pd.read_sql('select * from qa_reporting.dw_mcrn_rx_days_supply;', con=connection)
df



Unnamed: 0,data_source,year,dw_min_days_supply,dw_median_days_supply,dw_max_days_supply,dw_avg_days_supply,src_min_days_supply,src_median_days_supply,src_max_days_supply,src_avg_days_supply,min_days_supply_diff,median_days_supply_diff,max_days_supply_diff,avg_days_supply_diff
0,mcrn,2014,0.0,30.0,999.0,39.556031,0.0,30.0,999.0,39.556031,0.0,0.0,0.0,0.0
1,mcrn,2019,0.0,30.0,999.0,46.965997,0.0,30.0,999.0,46.965997,0.0,0.0,0.0,0.0
2,mcrn,2017,0.0,30.0,999.0,43.230995,0.0,30.0,999.0,43.230995,0.0,0.0,0.0,0.0
3,mcrn,2016,0.0,30.0,999.0,41.772888,0.0,30.0,999.0,41.772888,0.0,0.0,0.0,0.0
4,mcrn,2015,0.0,30.0,999.0,40.559333,0.0,30.0,999.0,40.559333,0.0,0.0,0.0,0.0
5,mcrn,2020,0.0,30.0,999.0,49.471745,0.0,30.0,999.0,49.471745,0.0,0.0,0.0,0.0
6,mcrn,2018,0.0,30.0,999.0,45.089088,0.0,30.0,999.0,45.089088,0.0,0.0,0.0,0.0


In [20]:
df[['year', 'min_days_supply_diff', 'median_days_supply_diff', 'max_days_supply_diff', 'avg_days_supply_diff']]

Unnamed: 0,year,min_days_supply_diff,median_days_supply_diff,max_days_supply_diff,avg_days_supply_diff
0,2014,0.0,0.0,0.0,0.0
1,2019,0.0,0.0,0.0,0.0
2,2017,0.0,0.0,0.0,0.0
3,2016,0.0,0.0,0.0,0.0
4,2015,0.0,0.0,0.0,0.0
5,2020,0.0,0.0,0.0,0.0
6,2018,0.0,0.0,0.0,0.0


In [21]:
connection.cursor().execute('drop table if exists dev.ip_medicare_national_rx_daysupp;')

## Dispense as Written

In [22]:
query = '''drop table if exists qa_reporting.dw_mcrn_rx_daw_counts;
create table qa_reporting.dw_mcrn_rx_daw_counts
(
    data_source bpchar(4),
    year int,
    table_src text,
    dispensed_as_written text,
    dw_count int,
    src_count int,
    count_diff int,
    count_diff_percent numeric
)
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [23]:
query = '''
insert into qa_reporting.dw_mcrn_rx_daw_counts
(data_source, year, table_src, dispensed_as_written, dw_count)
select data_source, year, table_id_src, dispensed_as_written, count(*)
from dw_staging.mcrn_pharmacy_claims
group by 1,2,3,4;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [24]:
query = '''
with rx_daw_count as (
    select extract(year from srvc_dt::date) as year, 'pde_file' as table_src, case when daw_prod_slctn_cd is null then '00' else daw_prod_slctn_cd end as dawind, count(*)
    from medicare_national.pde_file
    group by 1,2,3
)
update qa_reporting.dw_mcrn_rx_daw_counts a
set src_count = b.count,
count_diff = dw_count - b.count,
count_diff_percent = 100. * abs(dw_count - b.count) / b.count
from rx_daw_count b
where a.year = b.year
and a.dispensed_as_written = b.dawind
and a.table_src = b.table_src;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [25]:
df = pd.read_sql('select * from qa_reporting.dw_mcrn_rx_daw_counts', con=connection).sort_values('count_diff_percent')
df#[~df['src_count'].isna()]



Unnamed: 0,data_source,year,table_src,dispensed_as_written,dw_count,src_count,count_diff,count_diff_percent
0,mcrn,2016,pde_file,4,8791,8791,0,0.0
37,mcrn,2020,pde_file,4,7105,7105,0,0.0
38,mcrn,2018,pde_file,2,248857,248857,0,0.0
39,mcrn,2014,pde_file,6,12864,12864,0,0.0
40,mcrn,2017,pde_file,5,102353,102353,0,0.0
...,...,...,...,...,...,...,...,...
21,mcrn,2018,pde_file,5,102094,102094,0,0.0
20,mcrn,2019,pde_file,1,709970,709970,0,0.0
19,mcrn,2019,pde_file,0,75265555,75265555,0,0.0
25,mcrn,2017,pde_file,7,9024,9024,0,0.0
