# Data Warehouse Medicaid QA - Claim ICD Proc

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [21]:
connection = psycopg2.connect(get_dsn())
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mdcd_claim_icd_proc_counts;
create table qa_reporting.dw_mdcd_claim_icd_proc_counts
(
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mdcd_claim_icd_proc_counts
    (calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, count(a.*), count(distinct uth_claim_id), current_date
    from (select year, uth_claim_id, uth_member_id, proc_cd, icd_version
    from dw_staging.mdcd_claim_icd_proc
    where proc_position = 1) a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mdcd_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select year,  count(distinct claim_id_src) as count 
        from dw_staging.mdcd_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mdcd_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, count(distinct uth_member_id) as count 
        from dw_staging.mdcd_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mdcd_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, count(distinct member_id_src) as count 
        from dw_staging.mdcd_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

In [18]:
with connection.cursor() as cursor:
    query = '''
    with medicaid_claims as (
    select extract(year from hdr_frm_dos::date) as year, a.icn, pcn
    from medicaid.clm_proc a
    join medicaid.clm_header b
    on a.icn = b.icn
    where proc_icd_cd_1 <> ''
    union
    select extract(year from frm_dos), a.derv_enc, mem_id
    from medicaid.enc_proc a
    join medicaid.enc_header b
    on a.derv_enc = b.derv_enc
    where prim_proc_cd <> ''
    union
    select extract(year from hdr_frm_dos::date), a.icn, pcn
    from medicaid.htw_clm_proc a
    join medicaid.htw_clm_header b
    on a.icn = b.icn
    where proc_icd_cd_1 <> ''
        ),
        medicaid_claims_counts as (
            select year as calendar_year, count(*) as src_row_count, count(distinct icn) src_mbr_count, count(distinct pcn) src_clm_count
            from medicaid_claims
            group by 1
        )
    update qa_reporting.dw_mdcd_claim_icd_proc_counts a
    set src_row_count = b.src_row_count,
    row_count_diff = a.dw_row_count - b.src_row_count,
    row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
    src_clm_count = b.src_clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
    src_mbr_count = b.src_mbr_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from medicaid_claims_counts b
    where a.calendar_year = b.calendar_year
    ;
    '''

    cursor.execute(query)

Here we see that, for most years, the row count is the same. However, the member and claim id counts do not match up

In [19]:
query = '''select * from qa_reporting.dw_mdcd_claim_icd_proc_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values(['clm_count_percentage'], ascending=False)



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
6,2012,534306,534306,0,0.0,534306,534311,354003,180303,50.932619,354006,354006,534306,-180300,33.744708,2023-07-17
5,2011,184033,184033,0,0.0,184033,184033,131226,52807,40.241263,131226,131226,184033,-52807,28.69431,2023-07-17
8,2013,721511,721511,0,0.0,721511,721517,517282,204229,39.481173,517285,517285,721511,-204226,28.30532,2023-07-17
9,2016,695495,695495,0,0.0,695495,695508,515430,180065,34.934909,515432,515432,695495,-180063,25.889906,2023-07-17
3,2014,696880,696880,0,0.0,696880,696898,520617,176263,33.856559,520628,520628,696880,-176252,25.291585,2023-07-17
10,2015,687666,687666,0,0.0,687666,688956,517307,170359,32.931895,518402,518402,687666,-169264,24.614275,2023-07-17
0,2018,653846,653846,0,0.0,653846,653906,495215,158631,32.032753,495259,495259,653846,-158587,24.254488,2023-07-17
4,2019,721681,639284,82397,12.888951,639284,639293,487017,152267,31.265233,487017,487017,639284,-152267,23.818366,2023-07-17
7,2017,657206,657206,0,0.0,657206,657238,502663,154543,30.744853,502685,502685,657206,-154521,23.511806,2023-07-17
1,2020,749259,607515,141744,23.33177,607515,607522,467263,140252,30.015644,467265,467265,607515,-140250,23.08585,2023-07-17


## ICD Procedure Codes

In [73]:
query = '''drop table if exists qa_reporting.dw_mdcd_icd_proc_counts;
select year, proc_cd, case when icd_version = '0' then '10' else '9' end as icd_version, count(*) as proc_count
into qa_reporting.dw_mdcd_icd_proc_counts
from dw_staging.mdcd_claim_icd_proc
group by 1,2,3;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [38]:
icd_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mdcd_icd_proc_counts;', con=connection)
icd_proc_cd_df

Unnamed: 0,year,proc_cd,icd_version,proc_count
0,2017,B31F0ZZ,0,2
1,2017,0JXN0ZB,0,1
2,2016,0Y980ZX,0,4
3,2016,3E0E3KZ,0,5
4,2012,295,9,8
...,...,...,...,...
128792,2019,0H9EXZZ,0,41
128793,2020,XNS3032,0,2
128794,2020,047Y3EZ,0,2
128795,2018,0J9N00Z,0,25


Here we check if the ICD procedure code is invalid according to the reference table we have for ICD procedure codes.

In [None]:
query = '''
select a.*
from qa_reporting.dw_mdcd_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
and b.cd_type like 'ICD' || a.icd_version ||  '%'
where b.cd_value is null;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,proc_cd,icd_version,proc_count
0,2018,09TV8Z,10,1
1,2015,3E05317,9,2
2,2017,43760,10,2
3,2015,8E0U,10,1
4,2019,30E0P7V,10,2
...,...,...,...,...
14769,2015,B2141ZZ,9,1
14770,2015,0BQR,10,1
14771,2017,3E033YJ,10,1
14772,2013,077,9,97


In [None]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2011     54867
2012    112276
2013    106550
2014    104436
2015     80414
2016     25146
2017     22139
2018     28793
2019     13198
2020      1934
2021       999
Name: proc_count, dtype: int64

there are more valid codes for recent years

In [113]:
proc_comp_df = pd.DataFrame({'overall_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df['valid_percent'] = 100. * proc_comp_df['valid_proc_count'] / proc_comp_df['overall_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent,valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011,398361,54867,343494,15.973205,86.226814
2012,1189594,112276,1077318,10.421807,90.561822
2013,1541282,106550,1434732,7.426474,93.086924
2014,1506901,104436,1402465,7.446603,93.069485
2015,1544231,80414,1463817,5.493446,94.792618
2016,1631719,25146,1606573,1.565195,98.458926
2017,1558304,22139,1536165,1.441186,98.579289
2018,1616101,28793,1587308,1.813952,98.218366
2019,1819267,13198,1806069,0.730758,99.274543
2020,1976494,1934,1974560,0.097946,99.90215


In [None]:
invalid_proc_df.groupby('year').max()

Unnamed: 0_level_0,proc_cd,icd_version,proc_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011,J7040,9,18686
2012,ADMIT,9,33342
2013,C1893,9,29861
2014,Z412,9,29593
2015,Z9999,9,20627
2016,ZW3LX1Z,9,3531
2017,ZE033VJ,9,2832
2018,Z9999,9,3562
2019,Z9999,9,1443
2020,Z9999,9,125


### ICD 9 Proc Cds

Now let's check if the invalid ICD 9 codes.

In [125]:
query = '''
select a.*
from qa_reporting.dw_mdcd_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
and b.cd_type like 'ICD' || a.icd_version ||  '%'
where b.cd_value is null
and icd_version = '9'
'''

invalid_icd9_proc_cd_df = pd.read_sql(query, con=connection)
invalid_icd9_proc_cd_df

Unnamed: 0,year,proc_cd,icd_version,proc_count
0,2019,0S9C3ZX,9,2
1,2015,763,9,2
2,2014,763,9,2
3,2012,763,9,1
4,2017,00N00ZZ,9,1
...,...,...,...,...
4902,2015,4330,9,111
4903,2011,4330,9,78
4904,2012,4330,9,159
4905,2013,4330,9,115


In [126]:
invalid_icd9_proc_cd_df['proc_cd'].unique().shape

(2577,)

Checking if there are any 3-4 character length codes that were not in the reference table

In [None]:
invalid_icd9_proc_cd_df[invalid_icd9_proc_cd_df['proc_cd'].str.match('^\d{3,4}$')]

Unnamed: 0,year,proc_cd,icd_version,proc_count
1,2015,763,9,2
2,2014,763,9,2
3,2012,763,9,1
11,2015,5570,9,2
12,2014,5570,9,3
...,...,...,...,...
4901,2014,4330,9,132
4902,2015,4330,9,111
4903,2011,4330,9,78
4904,2012,4330,9,159


In [None]:
# unique icd9 codes that were not valid
invalid_icd9_proc_cd_df[invalid_icd9_proc_cd_df['proc_cd'].str.match('^\d{3,4}$')]['proc_cd'].unique().shape

(734,)

In [129]:
query = '''
select a.*
from qa_reporting.dw_mdcd_icd_proc_counts a
join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
and b.cd_type like 'ICD' || a.icd_version ||  '%'
where icd_version = '9'
'''

valid_icd9_proc_cd_df = pd.read_sql(query, con=connection)
valid_icd9_proc_cd_df



Unnamed: 0,year,proc_cd,icd_version,proc_count
0,2014,9135,9,1
1,2015,8083,9,4
2,2011,8083,9,2
3,2014,8083,9,6
4,2013,8083,9,5
...,...,...,...,...
12775,2012,8095,9,15
12776,2015,8095,9,10
12777,2011,8095,9,4
12778,2014,8095,9,12


In [130]:
valid_icd9_proc_cd_df['proc_cd'].unique().shape

(3085,)

Let's check the examples of invalid codes based on certain patterns. In this case we just look at the code itself and ignore the frequency of the code.

In [101]:
icd9_proc_cd_df = pd.read_sql('''select distinct proc_cd from qa_reporting.dw_mdcd_icd_proc_counts where icd_version = '9' and year < 2016;''', con=connection)
valid_icd9_proc_cd_df = icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{3,4}$')]
n_valid_codes = valid_icd9_proc_cd_df.shape[0]
n_invalid_codes = 0



In [102]:
icd9_proc_cd_df.shape[0]

4899

In [103]:
valid_icd9_proc_cd_df

Unnamed: 0,proc_cd
1,1642
3,600
5,4569
6,5251
8,2172
...,...
4893,6902
4895,6825
4896,3770
4897,4562


In [104]:
n_invalid_codes += icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{0,2}$')].shape[0]
print(icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{0,2}$')].shape[0])
icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{0,2}$')]

27


Unnamed: 0,proc_cd
834,
919,3.0
1061,0.0
1081,46.0
1084,66.0
1207,24.0
1358,48.0
1383,4.0
1449,33.0
1634,17.0


In [105]:
n_invalid_codes += icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{5,10}$')].shape[0]
print(icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{5,10}$')].shape[0])
icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^\d{5,10}$')]

778


Unnamed: 0,proc_cd
0,04522
2,01254
4,21499
7,05390
10,36415
...,...
4858,58345
4868,59400
4869,19182
4871,04621


In [106]:
n_invalid_codes += icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^(\d*[a-zA-Z]+\d*)+$')].shape[0]
print(icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^(\d*[a-zA-Z]+\d*)+$')].shape[0])
icd9_proc_cd_df[icd9_proc_cd_df['proc_cd'].str.match('^(\d*[a-zA-Z]+\d*)+$')]

271


Unnamed: 0,proc_cd
21,0243N1
80,BBC3ZX
81,0DJ08ZZ
112,T1015
119,246ZZ4
...,...
4827,0U570ZZ
4835,0DTJ0ZZ
4839,0UL70ZZ
4884,0KX40ZZ


In [107]:
n_invalid_codes, n_valid_codes, n_valid_codes + n_invalid_codes, icd9_proc_cd_df.shape[0]

(1076, 3819, 4895, 4899)

78.0% of the unique ICD 9 procedure codes in the medicaid data is 3-4 characters long

In [121]:
1.* n_valid_codes / icd9_proc_cd_df.shape[0]

0.7795468462951622