# Data Warehouse Medicare National QA - Claim Diag

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn())
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrn_claim_diag_counts;
create table qa_reporting.dw_mcrn_claim_diag_counts
(
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrn_claim_diag_counts
    (calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, count(*), count(distinct uth_claim_id), current_date
    from dw_staging.mcrn_claim_diag
    where diag_position = 1
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_diag_counts b
    set dw_src_clm_id_count = count
    from (
        select year,  count(distinct claim_id_src) as count 
        from dw_staging.mcrn_claim_diag
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_diag_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, count(distinct uth_member_id) as count 
        from dw_staging.mcrn_claim_diag
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mcrn_claim_diag_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, count(distinct member_id_src) as count 
        from dw_staging.mcrn_claim_diag
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    query = '''
    with clms as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.hha_base_claims_k
        where icd_dgns_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.outpatient_base_claims_k
        where icd_dgns_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.dme_claims_k
        where icd_dgns_cd1 is not null
        union
        select extract(year from clm_thru_dt::date)as year, bene_id, clm_id
        from medicare_national.inpatient_base_claims_k
        where icd_dgns_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.bcarrier_claims_k
        where icd_dgns_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.hospice_base_claims_k
        where icd_dgns_cd1 is not null
        union
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.snf_base_claims_k
        where icd_dgns_cd1 is not null
    ),
    clm_counts as (
        select year, count(*) row_count, count(distinct bene_id) pat_count, count(distinct clm_id) clm_count
        from clms
        group by 1
    )
    update qa_reporting.dw_mcrn_claim_diag_counts a
    set src_row_count = b.row_count,
    row_count_diff = a.dw_row_count - b.row_count,
    row_count_percentage = 100. * abs(a.dw_row_count - b.row_count) / b.row_count,
    src_clm_count = b.clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.clm_count) / b.clm_count,
    src_mbr_count = b.pat_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.pat_count,
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from clm_counts b
    where a.calendar_year = b.year
    ;
    '''

    cursor.execute(query)

In [6]:
query = '''select * from qa_reporting.dw_mcrn_claim_diag_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values(['clm_count_percentage'], ascending=False)



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
2,2020,53420880,53494724.0,-73844.0,0.13804,53420880,53420880,53494724.0,-73844.0,0.13804,1853702,1853702,1854639.0,-937.0,0.050522,2023-10-09
7,2014,58037677,58057405.0,-19728.0,0.03398,58037677,58037677,58057405.0,-19728.0,0.03398,1852503,1852503,1853140.0,-637.0,0.034374,2023-10-09
0,2018,60197618,60210464.0,-12846.0,0.021335,60197618,60197618,60210464.0,-12846.0,0.021335,1930824,1930824,1931573.0,-749.0,0.038777,2023-10-09
9,2019,60543281,60550050.0,-6769.0,0.011179,60543281,60543281,60550050.0,-6769.0,0.011179,1910418,1910418,1910942.0,-524.0,0.027421,2023-10-09
12,2017,59673907,59677566.0,-3659.0,0.006131,59673907,59673907,59677566.0,-3659.0,0.006131,1910059,1910059,1910438.0,-379.0,0.019838,2023-10-09
21,2015,58864100,58866192.0,-2092.0,0.003554,58864100,58864100,58866192.0,-2092.0,0.003554,1873335,1873335,1873851.0,-516.0,0.027537,2023-10-09
14,2016,59668121,59670191.0,-2070.0,0.003469,59668121,59668121,59670191.0,-2070.0,0.003469,1899201,1899201,1899637.0,-436.0,0.022952,2023-10-09
1,2000,2,,,,2,2,,,,2,2,,,,2023-10-09
3,2002,4,,,,4,4,,,,4,4,,,,2023-10-09
4,2008,7,,,,7,7,,,,7,7,,,,2023-10-09


## Diagnosis Codes

Here we will check if we have valid ICD Diagnosis codes using our reference tables.

In [7]:
query = '''drop table if exists qa_reporting.dw_mcrn_diag_counts;
select year, diag_cd, count(*) as diag_count
into qa_reporting.dw_mcrn_diag_counts
from dw_staging.mcrn_claim_diag
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [8]:
diag_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrn_diag_counts;', con=connection)
diag_cd_df



Unnamed: 0,year,diag_cd,diag_count
0,2017,S52292G,4
1,2016,S1111XA,3
2,2015,72981,159297
3,2017,S52009S,2
4,2015,T391X5S,2
...,...,...,...
295647,2019,G511,59
295648,2014,9156,1097
295649,2015,S92421A,55
295650,2019,M20002,241


We see that we have a lot of claims with diagnosis codes that are not in our reference list. If we take a further look at some of these codes, we see that in our reference list, there is at least one more digit missing from these codes. The missing digits helps specify the diagnosis code.

In [9]:
query = '''
select a.*
from qa_reporting.dw_mcrn_diag_counts a
left join reference_tables.ref_cms_icd_cm_codes b
on a.diag_cd = cd_value
where b.cd_value is null;
'''

invalid_diag_df = pd.read_sql(query, con=connection)
invalid_diag_df



Unnamed: 0,year,diag_cd,diag_count
0,2015,140,1
1,2014,78710,1
2,2020,R0502,1
3,2015,V78842,2
4,2014,444222,1
...,...,...,...
1417,2015,L972090,1
1418,2014,V1899,1
1419,2018,G359,1
1420,2020,5335XXA,1


Overall, the number of invalid diagnosis codes is neglible compared to the overall number of diagnosis codes in the claim_diag table.

In [10]:
invalid_diag_df.groupby('year')['diag_count'].sum()

year
2013       1
2014    1327
2015    1064
2016     178
2017     220
2018     165
2019     223
2020      75
Name: diag_count, dtype: int64

In [11]:
diag_comp_df = pd.DataFrame({'overall_diag_count': diag_cd_df.groupby('year')['diag_count'].sum(),
                            'invalid_diag_count': invalid_diag_df.groupby('year')['diag_count'].sum(),
                            'valid_diag_count': diag_cd_df.groupby('year')['diag_count'].sum() - invalid_diag_df.groupby('year')['diag_count'].sum()})
diag_comp_df.loc[diag_comp_df['valid_diag_count'].isna(),'valid_diag_count'] = diag_comp_df.loc[diag_comp_df['valid_diag_count'].isna(),'overall_diag_count']
diag_comp_df['valid_diag_count'] =  diag_comp_df['valid_diag_count'].astype(int)
diag_comp_df['invalid_to_valid_percent'] = 100. * diag_comp_df['invalid_diag_count'] / diag_comp_df['valid_diag_count']
diag_comp_df  

Unnamed: 0_level_0,overall_diag_count,invalid_diag_count,valid_diag_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997,25,,25,
2000,18,,18,
2001,4,,4,
2002,70,,70,
2003,27,,27,
2004,66,,66,
2005,1,,1,
2006,57,,57,
2007,31,,31,
2008,52,,52,


In [12]:
invalid_diag_df.groupby('year').max()

Unnamed: 0_level_0,diag_cd,diag_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,250,1
2014,XX000,475
2015,Z877440,381
2016,i8511,10
2017,ZG894,18
2018,ZI211,17
2019,Z96561,20
2020,Z96561,7


## Diagnosis Code Position

In [13]:
query = '''drop table if exists qa_reporting.dw_mcrn_diag_position;
select year, diag_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrn_diag_position
from dw_staging.mcrn_claim_diag
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)


In [14]:
diag_position_df = pd.read_sql('select * from qa_reporting.dw_mcrn_diag_position;', con=connection)
diag_position_df.sort_values(['year', 'diag_position'])



Unnamed: 0,year,diag_position,claim_count
155,1997,1,3
322,1997,2,3
127,1997,3,3
225,1997,4,3
60,1997,5,3
...,...,...,...
173,2020,21,300007
111,2020,22,258659
406,2020,23,222236
150,2020,24,189322


In [15]:
diag_position_df['diag_position'].unique()

array([ 2,  9, 18, 21, 25,  6, 24, 10,  7,  3, 16,  4, 20, 22, 14,  1,  8,
        5, 23, 13, 19, 12, 11, 15, 17], dtype=int64)

Checking if the counts for the diag_position are correct. The higher the diag_position is, the less counts there should be. If we sort the counts of the diag_position by year and assign their order, this value should match with the diag_position value.

In [16]:
diag_position_df['row_rank'] = diag_position_df.sort_values(['year', 'claim_count'], ascending=[True, False]).groupby(['year']).cumcount()+1
diag_position_df['position_check'] = diag_position_df['row_rank'] == diag_position_df['diag_position']
diag_position_df[~diag_position_df['position_check']]

Unnamed: 0,year,diag_position,claim_count,row_rank,position_check
1,2003,9,3,1,False
2,2007,18,1,6,False
7,2006,24,1,16,False
9,2006,18,1,17,False
11,2010,7,27,6,False
...,...,...,...,...,...
410,2004,6,5,9,False
414,2009,4,7,7,False
416,2010,11,13,12,False
418,2002,21,2,23,False


In [19]:
diag_position_df[~diag_position_df['position_check']].sort_values('claim_count', ascending=False)

Unnamed: 0,year,diag_position,claim_count,row_rank,position_check
397,2010,2,33,3,False
78,2010,3,33,2,False
272,2010,6,27,7,False
11,2010,7,27,6,False
91,2010,9,26,8,False
...,...,...,...,...,...
147,2002,24,1,25,False
145,2000,11,1,8,False
128,2007,7,1,10,False
361,2007,6,1,17,False


## ICD Version

In [17]:
query = '''
select year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrn_icd_version_count
from dw_staging.mcrn_claim_diag
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [18]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_mcrn_icd_version_count;', con=connection)
icd_version_df.sort_values(['year', 'icd_version'])



Unnamed: 0,year,icd_version,claim_count
0,1997,,3
5,2000,,2
29,2001,9.0,1
1,2001,,1
23,2002,,4
2,2003,,3
25,2004,9.0,4
30,2004,,6
13,2005,9.0,1
33,2006,,4
