# Data Warehouse Medicare Texas QA - Claim ICD Proc

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrt_claim_icd_proc_counts;
create table qa_reporting.dw_mcrt_claim_icd_proc_counts
(
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrt_claim_icd_proc_counts
    (calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select year, uth_claim_id, uth_member_id, proc_cd, icd_version
        from dw_staging.mcrt_claim_icd_proc
        where proc_position = 1
    ) a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select year,  count(distinct claim_id_src) as count 
        from dw_staging.mcrt_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, count(distinct uth_member_id) as count 
        from dw_staging.mcrt_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mcrt_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, count(distinct member_id_src) as count 
        from dw_staging.mcrt_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

In [10]:
with connection.cursor() as cursor:
    query = '''
    with mcrt_claims as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.outpatient_base_claims_k
        where icd_prcdr_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.inpatient_base_claims_k
        where icd_prcdr_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.snf_base_claims_k
        where icd_prcdr_cd1 is not null
    ),
        mcrt_claims_counts as (
            select year as calendar_year, count(*) as src_row_count, count(distinct bene_id) src_mbr_count, count(distinct clm_id) src_clm_count
            from mcrt_claims
            group by 1
        )
    update qa_reporting.dw_mcrt_claim_icd_proc_counts a
    set src_row_count = b.src_row_count,
        row_count_diff = a.dw_row_count - b.src_row_count,
        row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
        src_clm_count = b.src_clm_count,
        clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
        clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
        src_mbr_count = b.src_mbr_count,
        mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
        mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from mcrt_claims_counts b
    where a.calendar_year = b.calendar_year
    ;
    '''

    cursor.execute(query)

In [11]:
query = '''select * from qa_reporting.dw_mcrt_claim_icd_proc_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values(['clm_count_percentage'], ascending=False)



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
1,2020,380801,386293.0,-5492.0,1.421719,380801,382242,386293.0,-5492.0,1.421719,256800,256800,259000.0,-2200.0,0.849421,2023-10-04
9,2015,469359,470151.0,-792.0,0.168457,469359,471541,470151.0,-792.0,0.168457,313053,313053,313001.0,52.0,0.016613,2023-10-04
8,2016,460561,461002.0,-441.0,0.095661,460561,462576,461002.0,-441.0,0.095661,309440,309440,309114.0,326.0,0.105463,2023-10-04
0,2018,433847,434157.0,-310.0,0.071403,433847,435959,434157.0,-310.0,0.071403,290063,290063,289752.0,311.0,0.107333,2023-10-04
6,2017,456247,456087.0,160.0,0.035081,456247,458224,456087.0,160.0,0.035081,306956,306956,306298.0,658.0,0.214823,2023-10-04
4,2019,432499,432454.0,45.0,0.010406,432499,434631,432454.0,45.0,0.010406,287371,287371,286737.0,634.0,0.221109,2023-10-04
3,2014,475814,475844.0,-30.0,0.006305,475814,478227,475844.0,-30.0,0.006305,315675,315675,315031.0,644.0,0.204424,2023-10-04
2,2001,1,,,,1,2,,,,2,2,,,,2023-10-04
5,2012,1,,,,1,1,,,,1,1,,,,2023-10-04
7,2013,6858,,,,6858,7587,,,,7574,7574,,,,2023-10-04


## ICD Procedure Codes

In [12]:
query = '''drop table if exists qa_reporting.dw_mcrt_icd_proc_counts;
select year, proc_cd, count(*) as proc_count
into qa_reporting.dw_mcrt_icd_proc_counts
from dw_staging.mcrt_claim_icd_proc
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [13]:
icd_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrt_icd_proc_counts;', con=connection)
icd_proc_cd_df



Unnamed: 0,year,proc_cd,proc_count
0,2017,0DF68ZZ,3
1,2020,0X360ZZ,1
2,2016,2W2QX4Z,11
3,2016,06BQ4ZZ,2588
4,2018,05VF3DZ,1
...,...,...,...
103279,2019,0Y6H0Z3,201
103280,2017,0HBRXZX,2
103281,2017,F07Z4FZ,678
103282,2018,0T518ZZ,2


In [14]:
query = '''
select a.*
from qa_reporting.dw_mcrt_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
where b.cd_value is null;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,proc_cd,proc_count
0,2017,92611,1
1,2017,OSRC0J9,1
2,2016,92611,2
3,2015,0bbb8zx,1
4,2015,ORRK00Z,1
5,2017,OSRD0J9,1
6,2016,0HQExZZ,1
7,2016,0000,2
8,2014,0000,1
9,2018,R1310,1


In [15]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2014    1
2015    2
2016    5
2017    3
2018    1
Name: proc_count, dtype: int64

In [16]:
proc_comp_df = pd.DataFrame({'overall_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,2,,2,
2012,2,,2,
2013,22253,,22253,
2014,1293404,1.0,1293403,7.7e-05
2015,1292870,2.0,1292868,0.000155
2016,1315006,5.0,1315001,0.00038
2017,1319419,3.0,1319416,0.000227
2018,1327947,1.0,1327946,7.5e-05
2019,1325813,,1325813,
2020,1199576,,1199576,


In [17]:
invalid_proc_df.groupby('year').max()

Unnamed: 0_level_0,proc_cd,proc_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,0000,1
2015,ORRK00Z,1
2016,92611,2
2017,OSRD0J9,1
2018,R1310,1


## Procedure Code Position

In [18]:
query = '''drop table if exists qa_reporting.dw_mcrt_proc_position;
select year, proc_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrt_proc_position
from dw_staging.mcrt_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)


In [19]:
proc_position_df = pd.read_sql('select * from qa_reporting.dw_mcrt_proc_position;', con=connection)
proc_position_df.sort_values(['year', 'proc_position'])



Unnamed: 0,year,proc_position,claim_count
127,2001,1,1
139,2001,2,1
72,2012,1,1
153,2012,2,1
173,2013,1,6858
...,...,...,...
33,2020,21,1457
74,2020,22,1239
66,2020,23,1052
115,2020,24,900


In [20]:
proc_position_df['proc_position'].unique()

array([19,  6, 23, 22, 25, 20, 10, 24, 17, 12, 15,  2,  9, 16,  5,  8, 11,
       21,  4, 14,  7, 13, 18,  1,  3], dtype=int64)

Checking if the counts for the proc_position are correct. The higher the proc_position is, the less counts there should be. If we sort the counts of the proc_position by year and assign their order, this value should match with the proc_position value.

In [21]:
proc_position_df['row_rank'] = proc_position_df.sort_values(['year', 'claim_count'], ascending=[True, False]).groupby(['year']).cumcount()+1
proc_position_df['position_check'] = proc_position_df['row_rank'] == proc_position_df['proc_position']
proc_position_df[~proc_position_df['position_check']]

Unnamed: 0,year,proc_position,claim_count,row_rank,position_check
3,2013,19,41,18,False
79,2013,18,40,19,False
172,2013,22,21,23,False
186,2013,23,24,22,False


## ICD Version

In [22]:
query = '''drop table if exists qa_reporting.dw_mcrt_icd_proc_version_count;
select year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrt_icd_proc_version_count
from dw_staging.mcrt_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [24]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_mcrt_icd_proc_version_count;', con=connection)
icd_version_df.sort_values(['year', 'icd_version'])

Unnamed: 0,year,icd_version,claim_count
6,2001,,2
2,2012,,1
4,2013,,7587
7,2014,,478227
8,2015,,471541
9,2016,,462576
0,2017,,458224
3,2018,,435959
5,2019,,434631
1,2020,,382242
