# Data Warehouse Medicare National QA - Claim ICD Proc

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrn_claim_icd_proc_counts;
create table qa_reporting.dw_mcrn_claim_icd_proc_counts
(
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrn_claim_icd_proc_counts
    (calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select year, uth_claim_id, uth_member_id, proc_cd, icd_version
        from dw_staging.mcrn_claim_icd_proc
        where proc_position = 1
    ) a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select year,  count(distinct claim_id_src) as count 
        from dw_staging.mcrn_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, count(distinct uth_member_id) as count 
        from dw_staging.mcrn_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mcrn_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, count(distinct member_id_src) as count 
        from dw_staging.mcrn_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    query = '''
    with mcrn_claims as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.outpatient_base_claims_k
        where icd_prcdr_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.inpatient_base_claims_k
        where icd_prcdr_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.snf_base_claims_k
        where icd_prcdr_cd1 is not null
    ),
        mcrn_claims_counts as (
            select year as calendar_year, count(*) as src_row_count, count(distinct bene_id) src_mbr_count, count(distinct clm_id) src_clm_count
            from mcrn_claims
            group by 1
        )
    update qa_reporting.dw_mcrn_claim_icd_proc_counts a
    set src_row_count = b.src_row_count,
        row_count_diff = a.dw_row_count - b.src_row_count,
        row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
        src_clm_count = b.src_clm_count,
        clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
        clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
        src_mbr_count = b.src_mbr_count,
        mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
        mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from mcrn_claims_counts b
    where a.calendar_year = b.calendar_year
    ;
    '''

    cursor.execute(query)

In [6]:
query = '''select * from qa_reporting.dw_mcrn_claim_icd_proc_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values(['clm_count_percentage'], ascending=False)



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
1,2020,258186,261858.0,-3672.0,1.402287,258186,259061,261858.0,-3672.0,1.402287,182640,182640,184264.0,-1624.0,0.881344,2023-10-09
11,2015,322539,323033.0,-494.0,0.152926,322539,323863,323033.0,-494.0,0.152926,223731,223731,223595.0,136.0,0.060824,2023-10-09
4,2019,301545,301426.0,119.0,0.039479,301545,302774,301426.0,119.0,0.039479,211072,211072,210598.0,474.0,0.225073,2023-10-09
3,2014,325958,325847.0,111.0,0.034065,325958,327488,325847.0,111.0,0.034065,224909,224909,224357.0,552.0,0.246036,2023-10-09
8,2017,313424,313514.0,-90.0,0.028707,313424,314703,313514.0,-90.0,0.028707,219985,219985,219596.0,389.0,0.177143,2023-10-09
0,2018,307491,307575.0,-84.0,0.02731,307491,308755,307575.0,-84.0,0.02731,215290,215290,214972.0,318.0,0.147926,2023-10-09
10,2016,313813,313894.0,-81.0,0.025805,313813,315094,313894.0,-81.0,0.025805,220645,220645,220282.0,363.0,0.164789,2023-10-09
2,2001,1,,,,1,1,,,,1,1,,,,2023-10-09
5,2006,2,,,,2,2,,,,1,1,,,,2023-10-09
6,2012,7,,,,7,8,,,,3,3,,,,2023-10-09


## ICD Procedure Codes

In [7]:
query = '''drop table if exists qa_reporting.dw_mcrn_icd_proc_counts;
select year, proc_cd, count(*) as proc_count
into qa_reporting.dw_mcrn_icd_proc_counts
from dw_staging.mcrn_claim_icd_proc
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [8]:
icd_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrn_icd_proc_counts;', con=connection)
icd_proc_cd_df



Unnamed: 0,year,proc_cd,proc_count
0,2018,02VX4EZ,1
1,2015,0QPJ04Z,6
2,2020,04H13DZ,3
3,2018,0X353ZZ,1
4,2020,0NSBXZZ,24
...,...,...,...
95878,2020,2Y50X5Z,1
95879,2015,HZ4BZZZ,8
95880,2020,3E1S38Z,1
95881,2016,0W9J0ZX,7


In [9]:
query = '''
select a.*
from qa_reporting.dw_mcrn_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
where b.cd_value is null;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,proc_cd,proc_count
0,2016,Z1211,1
1,2016,97535GO,3
2,2014,97535GO,7
3,2015,97535GO,3
4,2015,0B110FA,1
5,2017,97110GO,4
6,2016,97110GO,1
7,2015,97110GO,3
8,2016,45385,1
9,2015,15823,1


In [10]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2014    55
2015    29
2016    31
2017    35
Name: proc_count, dtype: int64

In [11]:
proc_comp_df = pd.DataFrame({'overall_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,1,,1,
2006,2,,2,
2010,5,,5,
2011,3,,3,
2012,11,,11,
2013,14943,,14943,
2014,911535,55.0,911480,0.006034
2015,894892,29.0,894863,0.003241
2016,880224,31.0,880193,0.003522
2017,888662,35.0,888627,0.003939


In [12]:
invalid_proc_df.groupby('year').max()

Unnamed: 0_level_0,proc_cd,proc_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,99672,9
2015,V0481,5
2016,Z1211,5
2017,97530GP,5


## Procedure Code Position

In [13]:
query = '''drop table if exists qa_reporting.dw_mcrn_proc_position;
select year, proc_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrn_proc_position
from dw_staging.mcrn_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)


In [14]:
proc_position_df = pd.read_sql('select * from qa_reporting.dw_mcrn_proc_position;', con=connection)
proc_position_df.sort_values(['year', 'proc_position'])



Unnamed: 0,year,proc_position,claim_count
186,2001,1,1
77,2006,1,2
80,2010,1,4
132,2010,4,1
41,2011,1,2
...,...,...,...
91,2020,21,829
157,2020,22,714
102,2020,23,594
21,2020,24,506


In [15]:
proc_position_df['proc_position'].unique()

array([19, 13, 18,  2,  7, 20, 22, 25, 24,  8,  3,  6, 14, 12, 21,  5, 15,
       11, 17, 10,  1, 16, 23,  9,  4], dtype=int64)

Checking if the counts for the proc_position are correct. The higher the proc_position is, the less counts there should be. If we sort the counts of the proc_position by year and assign their order, this value should match with the proc_position value.

In [16]:
proc_position_df['row_rank'] = proc_position_df.sort_values(['year', 'claim_count'], ascending=[True, False]).groupby(['year']).cumcount()+1
proc_position_df['position_check'] = proc_position_df['row_rank'] == proc_position_df['proc_position']
proc_position_df[~proc_position_df['position_check']]

Unnamed: 0,year,proc_position,claim_count,row_rank,position_check
9,2013,22,20,23,False
56,2013,21,24,22,False
132,2010,4,1,2,False
205,2013,23,26,21,False


## ICD Version

In [17]:
query = '''drop table if exists qa_reporting.dw_mcrn_icd_proc_version_count;
select year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrn_icd_proc_version_count
from dw_staging.mcrn_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [18]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_mcrn_icd_proc_version_count;', con=connection)
icd_version_df.sort_values(['year', 'icd_version'])



Unnamed: 0,year,icd_version,claim_count
6,2001,,1
5,2006,,2
10,2010,,4
8,2011,,2
4,2012,,8
11,2013,,4690
0,2014,,327488
2,2015,,323863
12,2016,,315094
1,2017,,314703
