# Data Warehouse Medicare Texas QA - Claim ICD Proc

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [2]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [3]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [13]:
query = ''' drop table if exists qa_reporting.dw_mcrt_claim_icd_proc_counts;
create table qa_reporting.dw_mcrt_claim_icd_proc_counts
(
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [14]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrt_claim_icd_proc_counts
    (calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select distinct year, uth_claim_id, uth_member_id, proc_cd
        from dw_staging.mcrt_claim_icd_proc
        where proc_position = 1
    ) a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select year, count(distinct claim_id_src) as count 
        from dw_staging.mcrt_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, count(distinct uth_member_id) as count 
        from dw_staging.mcrt_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_mcrt_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, count(distinct member_id_src) as count 
        from dw_staging.mcrt_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

In [15]:
with connection.cursor() as cursor:
    query = '''
    with mcrt_claims as (
        select extract(year from prcdr_dt1::date) as year, bene_id, clm_id
        from medicare_texas.outpatient_base_claims_k
        where icd_prcdr_cd1 is not null
        union
        select extract(year from prcdr_dt1::date) as year, bene_id, clm_id
        from medicare_texas.inpatient_base_claims_k
        where icd_prcdr_cd1 is not null
        union
        select extract(year from prcdr_dt1::date) as year, bene_id, clm_id
        from medicare_texas.snf_base_claims_k
        where icd_prcdr_cd1 is not null
    ),
        mcrt_claims_counts as (
            select year as calendar_year, count(*) as src_row_count, count(distinct bene_id) src_mbr_count, count(distinct clm_id) src_clm_count
            from mcrt_claims
            group by 1
        )
    update qa_reporting.dw_mcrt_claim_icd_proc_counts a
    set src_row_count = b.src_row_count,
        row_count_diff = a.dw_row_count - b.src_row_count,
        row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
        src_clm_count = b.src_clm_count,
        clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
        clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
        src_mbr_count = b.src_mbr_count,
        mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
        mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from mcrt_claims_counts b
    where a.calendar_year = b.calendar_year
    ;
    '''

    cursor.execute(query)

Note: Due to the procedure dates in the Medicare data, there will be rows in the icd_proc table where 

In [16]:
query = '''select * from qa_reporting.dw_mcrt_claim_icd_proc_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values(['clm_count_percentage'], ascending=False)



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
3,2001,2,1.0,1.0,100.0,2,2,1.0,1.0,100.0,2,2,1.0,1.0,100.0,2023-12-18
9,2013,20841,6858.0,13983.0,203.893263,7587,7587,6858.0,729.0,10.629921,7574,7574,6846.0,728.0,10.633947,2023-12-18
1,2020,1196150,403678.0,792472.0,196.312903,406137,406137,403678.0,2459.0,0.609149,272038,272038,270823.0,1215.0,0.448633,2023-12-18
4,2014,1237832,475814.0,762018.0,160.150395,478227,478227,475814.0,2413.0,0.507131,315675,315675,314635.0,1040.0,0.330542,2023-12-18
5,2019,1239882,432721.0,807161.0,186.531506,434874,434874,432721.0,2153.0,0.497549,287570,287570,286642.0,928.0,0.323749,2023-12-18
0,2018,1241969,433847.0,808122.0,186.268892,435959,435959,433847.0,2112.0,0.486808,290063,290063,289140.0,923.0,0.319223,2023-12-18
2,2021,1160669,378784.0,781885.0,206.419754,380619,380619,378784.0,1835.0,0.484445,259283,259283,258213.0,1070.0,0.414387,2023-12-18
11,2015,1239448,469359.0,770089.0,164.07249,471541,471541,469359.0,2182.0,0.464889,313053,313053,312098.0,955.0,0.305994,2023-12-18
10,2016,1275146,460561.0,814585.0,176.867994,462576,462576,460561.0,2015.0,0.43751,309440,309440,308535.0,905.0,0.293322,2023-12-18
8,2017,1273102,456247.0,816855.0,179.03789,458224,458224,456247.0,1977.0,0.433318,306956,306956,306065.0,891.0,0.291115,2023-12-18


## ICD Procedure Codes

In [7]:
query = '''drop table if exists qa_reporting.dw_mcrt_icd_proc_counts;
select year, proc_cd, count(*) as proc_count
into qa_reporting.dw_mcrt_icd_proc_counts
from dw_staging.mcrt_claim_icd_proc
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [8]:
icd_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrt_icd_proc_counts;', con=connection)
icd_proc_cd_df



Unnamed: 0,year,proc_cd,proc_count
0,2021,3E0530M,1
1,2018,3E0P7VZ,40
2,2019,0TBD4ZX,1
3,2018,0J573ZZ,1
4,2016,06C93ZZ,1
...,...,...,...
119755,2021,09B1XZZ,14
119756,2018,10D00Z0,14
119757,2020,0QU60JZ,4
119758,2016,0D7P4ZZ,3


In [9]:
query = '''
select a.*
from qa_reporting.dw_mcrt_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
where b.cd_value is null;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,proc_cd,proc_count
0,2016,0000,2
1,2014,0000,1
2,2015,ORRK00Z,1
3,2017,OSRC0J9,1
4,2016,92611,2
5,2017,92611,1
6,2015,0bbb8zx,1
7,2017,OSRD0J9,1
8,2016,0HQExZZ,1
9,2018,R1310,1


In [10]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2014    1
2015    2
2016    5
2017    3
2018    1
Name: proc_count, dtype: int64

In [11]:
proc_comp_df = pd.DataFrame({'overall_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,2,,2,
2012,2,,2,
2013,22253,,22253,
2014,1293404,1.0,1293403,7.7e-05
2015,1292870,2.0,1292868,0.000155
2016,1315006,5.0,1315001,0.00038
2017,1319419,3.0,1319416,0.000227
2018,1327947,1.0,1327946,7.5e-05
2019,1326569,,1326569,
2020,1275305,,1275305,


In [12]:
invalid_proc_df.groupby('year').max()

Unnamed: 0_level_0,proc_cd,proc_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,0000,1
2015,ORRK00Z,1
2016,92611,2
2017,OSRD0J9,1
2018,R1310,1


## Procedure Code Position

In [13]:
query = '''drop table if exists qa_reporting.dw_mcrt_proc_position;
select year, proc_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrt_proc_position
from dw_staging.mcrt_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)


In [14]:
proc_position_df = pd.read_sql('select * from qa_reporting.dw_mcrt_proc_position;', con=connection)
proc_position_df.sort_values(['year', 'proc_position'])



Unnamed: 0,year,proc_position,claim_count
178,2001,1,1
14,2001,2,1
28,2012,1,1
51,2012,2,1
181,2013,1,6858
...,...,...,...
162,2022,9,1
208,2022,18,1
39,2022,19,1
168,2022,22,1


In [15]:
proc_position_df['proc_position'].unique()

array([ 7,  5, 24,  8, 19, 11, 13, 15, 20, 10, 25,  4,  2,  1, 21,  3, 23,
       14, 17,  9, 16,  6, 22, 18, 12], dtype=int64)

Checking if the counts for the proc_position are correct. The higher the proc_position is, the less counts there should be. If we sort the counts of the proc_position by year and assign their order, this value should match with the proc_position value.

In [16]:
proc_position_df['row_rank'] = proc_position_df.sort_values(['year', 'claim_count'], ascending=[True, False]).groupby(['year']).cumcount()+1
proc_position_df['position_check'] = proc_position_df['row_rank'] == proc_position_df['proc_position']
proc_position_df[~proc_position_df['position_check']]

Unnamed: 0,year,proc_position,claim_count,row_rank,position_check
14,2001,2,1,1,False
34,2022,23,1,1,False
39,2022,19,1,2,False
60,2013,22,21,23,False
75,2022,2,1,3,False
77,2013,23,24,22,False
162,2022,9,1,4,False
168,2022,22,1,5,False
178,2001,1,1,2,False
190,2022,8,1,6,False


## ICD Version

In [17]:
query = '''drop table if exists qa_reporting.dw_mcrt_icd_proc_version_count;
select year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_mcrt_icd_proc_version_count
from dw_staging.mcrt_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [18]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_mcrt_icd_proc_version_count;', con=connection)
icd_version_df.sort_values(['year', 'icd_version'])



Unnamed: 0,year,icd_version,claim_count
8,2001,,2
1,2012,,1
9,2013,,7587
3,2014,,478227
4,2015,,471541
2,2016,,462576
11,2017,,458224
5,2018,,435959
7,2019,,434874
6,2020,,406137
