# Data Warehouse Truven QA - Claim ICD Proc

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [2]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [3]:
connection = psycopg2.connect(get_dsn())
connection.autocommit = True

## Row Count and Claim Count

Similar to the member_enrollment_monthly table, the claim_header table has the row count should equal the number of unique claims.

For this table, we extract claim data from the s, o, and f tables from the truven schema. 


In [5]:
query = ''' drop table if exists qa_reporting.dw_truv_claim_icd_proc_counts;
create table qa_reporting.dw_truv_claim_icd_proc_counts
(
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [6]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_truv_claim_icd_proc_counts
    (calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select year, count(a.*), count(distinct uth_claim_id), current_date
    from (select year, uth_claim_id, uth_member_id, proc_cd, icd_version
    from dw_staging.truv_claim_icd_proc
    where proc_position = 1) a
    group by 1
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select year,  count(distinct claim_id_src) as count 
        from dw_staging.truv_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, count(distinct uth_member_id) as count 
        from dw_staging.truv_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select year, count(distinct member_id_src) as count 
        from dw_staging.truv_claim_icd_proc
    group by 1) a
    where a.year = b.calendar_year
    '''

    cursor.execute(query)

claims that are both in the s table and f tables may have the same procedure code. need to consider this

In [9]:
with connection.cursor() as cursor:
    query = '''
    with truven_claims as (
            select year, enrolid, claim_id_derv, pproc, svcdate, dxver
            from truven.ccaes
            where pproc is not null
            union
            select year, enrolid, claim_id_derv, pproc, svcdate, dxver
            from truven.mdcrs
            where pproc is not null
            union
            select year, enrolid, claim_id_derv, proc1, svcdate, dxver
            from truven.ccaef
            where proc1 is not null
            union
            select year, enrolid, claim_id_derv, proc1, svcdate, dxver
            from truven.mdcrf
            where proc1 is not null
        ),
        truven_claims_counts as (
            select year as calendar_year, count(*) as src_row_count, count(distinct enrolid) src_mbr_count, count(distinct claim_id_derv) src_clm_count
            from truven_claims
            group by 1
        )
    update qa_reporting.dw_truv_claim_icd_proc_counts a
    set src_row_count = b.src_row_count,
    row_count_diff = a.dw_row_count - b.src_row_count,
    row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
    src_clm_count = b.src_clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
    src_mbr_count = b.src_mbr_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from truven_claims_counts b
    where a.calendar_year = b.calendar_year
    ;
    '''

    cursor.execute(query)

In [10]:
query = '''select * from qa_reporting.dw_truv_claim_icd_proc_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values(['clm_count_percentage'], ascending=False)



Unnamed: 0,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
7,2022,5456704,5482624,-25920,0.472766,4099918,4100153,4115100,-15182,0.368934,394118,394118,396521,-2403,0.606021,2023-05-11
6,2017,13778188,13779323,-1135,0.008237,10368432,10364554,10363856,4576,0.044153,1065162,1065162,1064636,526,0.049407,2023-05-11
1,2018,13079818,13080375,-557,0.004258,9800585,9797026,9796306,4279,0.04368,942503,942503,941906,597,0.063382,2023-05-11
4,2019,14415094,14415697,-603,0.004183,10775946,10772868,10772195,3751,0.034821,972650,972650,972100,550,0.056579,2023-05-11
10,2016,16270903,16273488,-2585,0.015885,12051092,12047951,12047219,3873,0.032148,1309161,1309161,1308635,526,0.040195,2023-05-11
2,2020,13831513,13834499,-2986,0.021584,10201312,10199018,10198377,2935,0.028779,872493,872493,871886,607,0.069619,2023-05-11
11,2015,13203171,13204258,-1087,0.008232,10229089,10314808,10226347,2742,0.026813,1742313,1742313,1699426,42887,2.523617,2023-05-11
3,2014,17525590,17524424,1166,0.006654,14468560,14554908,14464928,3632,0.025109,2736918,2736918,2695571,41347,1.533887,2023-05-11
5,2013,18048149,18047756,393,0.002178,14869125,14875679,14865910,3215,0.021627,2923813,2923813,2918406,5407,0.185272,2023-05-11
8,2012,21882896,21883529,-633,0.002893,17609356,17611826,17606240,3116,0.017698,3256545,3256545,3254285,2260,0.069447,2023-05-11


## ICD Procedure Codes

In [12]:
query = '''drop table if exists qa_reporting.dw_truv_icd_proc_counts;
select year, proc_cd, count(*) as proc_count
into qa_reporting.dw_truv_icd_proc_counts
from dw_staging.truv_claim_icd_proc
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [13]:
icd_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_icd_proc_counts;', con=connection)
icd_proc_cd_df



Unnamed: 0,year,proc_cd,proc_count
0,2015,0RJG4ZZ,54
1,2014,3639,86
2,2015,0BPQX0Z,67
3,2020,C1750,10
4,2017,0KXK4ZZ,3
...,...,...,...
195400,2017,04HF33Z,361
195401,2022,03H333Z,13
195402,2012,95909,2
195403,2017,00QS0ZZ,1


In [14]:
query = '''
select a.*
from qa_reporting.dw_truv_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
where b.cd_value is null;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,proc_cd,proc_count
0,2011,25605,19
1,2017,99213,1
2,2011,46606,1
3,2012,88329,3
4,2011,49411,1
...,...,...,...
14080,2013,93325,31
14081,2013,76881,4
14082,2020,J3111,24
14083,2011,92582,4


In [15]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2011    446065
2012    267218
2013     76821
2014     42681
2015     39218
2016      2073
2017       174
2018        97
2019        67
2020    312065
2021        89
2022       121
Name: proc_count, dtype: int64

In [9]:
proc_comp_df = pd.DataFrame({'overall_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': icd_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,27349490,446065,26903425,1.658023
2012,25232138,267218,24964920,1.070374
2013,21008291,76821,20931470,0.367012
2014,21201709,42681,21159028,0.201715
2015,15626065,39218,15586847,0.25161
2016,18118930,2073,18116857,0.011442
2017,15352013,174,15351839,0.001133
2018,14786714,97,14786617,0.000656
2019,16244242,67,16244175,0.000412
2020,15578196,312065,15266131,2.044166


In [11]:
invalid_proc_df.groupby('year').max()

Unnamed: 0_level_0,proc_cd,proc_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2011,V5275,22100
2012,V5299,14811
2013,V2787,3326
2014,V2787,21456
2015,V2632,20303
2016,S5001,264
2017,L8699,55
2018,Q0144,18
2019,J7999,7
2020,V5298,13137


## Procedure Code Position

In [12]:
query = '''drop table if exists qa_reporting.dw_truv_proc_position;
select year, proc_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_truv_proc_position
from dw_staging.claim_icd_proc_1_prt_truv
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)


In [21]:
proc_position_df = pd.read_sql('select * from qa_reporting.dw_truv_proc_position;', con=connection)
proc_position_df.sort_values(['year', 'proc_position'])

Unnamed: 0,year,proc_position,claim_count
32,2011,1,19002734
0,2011,2,1657859
24,2011,3,855170
48,2011,4,432797
8,2011,5,266337
...,...,...,...
31,2022,2,266342
37,2022,3,178910
59,2022,4,112476
44,2022,5,73820


In [26]:
proc_position_df['proc_position'].unique()

array([2, 6, 4, 1, 5, 3], dtype=int64)

Checking if the counts for the proc_position are correct. The higher the proc_position is, the less counts there should be. If we sort the counts of the proc_position by year and assign their order, this value should match with the proc_position value.

In [25]:
proc_position_df['row_rank'] = proc_position_df.sort_values(['year', 'claim_count'], ascending=[True, False]).groupby(['year']).cumcount()+1
proc_position_df['position_check'] = proc_position_df['row_rank'] == proc_position_df['proc_position']
proc_position_df[~proc_position_df['position_check']]

Unnamed: 0,year,proc_position,claim_count,row_rank,position_check


## ICD Version

In [16]:
query = '''drop table if exists qa_reporting.dw_truv_icd_proc_version_count;
select year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_truv_icd_proc_version_count
from dw_staging.truv_claim_icd_proc
group by 1,2;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [17]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_truv_icd_version_count;', con=connection)
icd_version_df.sort_values(['year', 'icd_version'])



Unnamed: 0,year,icd_version,claim_count
28,2011,0.0,1802
25,2011,9.0,5154
27,2011,,603846894
33,2012,0.0,2169
30,2012,9.0,6408
26,2012,,605415909
3,2013,0.0,4278
7,2013,9.0,13140
9,2013,,486609012
12,2014,0.0,12835
