# Data Warehouse Truven QA - Claim Diag

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [2]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [3]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

In [None]:
year_df = pd.read_sql('select distinct year from dw_staging.claim_header_1_prt_truv;', con=connection)
tables = ['ccaes', 'mdcrs', 'mdcro', 'ccaeo']



## Row Count and Claim Count

Similar to the member_enrollment_monthly table, the claim_header table has the row count should equal the number of unique claims.

For this table, we extract claim data from the s, o, and f tables from the truven schema. 


In [3]:
query = ''' drop table if exists qa_reporting.dw_truv_claim_diag_counts;
create table qa_reporting.dw_truv_claim_diag_counts
(
    data_source bpchar(4),
    table_id_src text,
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [31]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_truv_claim_diag_counts
    (data_source, table_id_src, calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select data_source, table_id_src, year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select data_source, table_id_src, year, uth_claim_id, uth_member_id, diag_cd, icd_version
        from dw_staging.trum_claim_diag
        where diag_position = 1
    ) a
    group by 1,2,3
    '''
    
    # cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_diag_counts b
    set dw_src_clm_id_count = count
    from (
        select data_source, table_id_src, year, count(distinct claim_id_src) as count 
        from dw_staging.trum_claim_diag
        group by 1,2,3
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    and a.table_id_src = b.table_id_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_diag_counts b
    set dw_uth_mbr_id_count = count
    from (
        select data_source, table_id_src, year, count(distinct uth_member_id) as count 
        from dw_staging.trum_claim_diag
        group by 1,2,3
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    and a.table_id_src = b.table_id_src
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_truv_claim_diag_counts b
    set dw_src_mbr_id_count = count
    from (
        select data_source, table_id_src, year, count(distinct member_id_src) as count 
        from dw_staging.trum_claim_diag
        group by 1,2,3
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    and a.table_id_src = b.table_id_src
    '''

    cursor.execute(query)

In [21]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_truv_claim_diag_counts
    (data_source, table_id_src, calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select data_source, table_id_src, year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select data_source, table_id_src, year, uth_claim_id, uth_member_id, diag_cd, icd_version
        from dw_staging.truc_claim_diag
        where diag_position = 1
    ) a
    group by 1,2,3
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_diag_counts b
    set dw_src_clm_id_count = count
    from (
        select data_source, table_id_src, year,  count(distinct claim_id_src) as count 
        from dw_staging.truc_claim_diag
        group by 1,2,3
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    and a.table_id_src = b.table_id_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_diag_counts b
    set dw_uth_mbr_id_count = count
    from (
        select data_source, table_id_src, year, count(distinct uth_member_id) as count 
        from dw_staging.truc_claim_diag
        group by 1,2,3
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    and a.table_id_src = b.table_id_src
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_truv_claim_diag_counts b
    set dw_src_mbr_id_count = count
    from (
        select data_source, table_id_src, year, count(distinct member_id_src) as count 
        from dw_staging.truc_claim_diag
        group by 1,2,3
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    and a.table_id_src = b.table_id_src
    '''

    cursor.execute(query)

In [35]:
# Updating raw counts, but with the combine counts of all the raw tables used
# Using the counts found during the claim header qa to avoid having to redo the counts.
with connection.cursor() as cursor:
    query = '''update qa_reporting.dw_truv_claim_diag_counts a
    set src_row_count = b.row_count,
    row_count_diff = dw_row_count - b.row_count,
    row_count_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
    src_clm_count = clm_count,
    clm_count_diff = dw_uth_clm_id_count - b.clm_count,
    clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
    src_mbr_count = b.pat_count,
    mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
    mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from qa_reporting.truven_counts b
    where a.calendar_year = b.year
    and a.table_id_src = b.table_name
    ;
    '''

    cursor.execute(query)

In [49]:
with connection.cursor() as cursor:
    query = '''
with truven_claims as (
        select 'ccaes' as table_source, year, enrolid, claim_id_derv
        from truven.ccaes
        where pdx is not null
        union 
        select 'mdcrs', year, enrolid, claim_id_derv
        from truven.mdcrs
        where pdx is not null
        union
        select 'ccaeo', year, enrolid, claim_id_derv
        from truven.ccaeo
        where dx1 is not null
        union
        select 'mdcro', year, enrolid, claim_id_derv
        from truven.mdcro
        where dx1 is not null
    ),
truven_claims_counts as (
        select table_source, year as calendar_year, count(*) as src_row_count-- count(distinct enrolid) src_mbr_count, count(distinct claim_id_derv) src_clm_count
        from truven_claims
        group by 1,2
    )
update qa_reporting.dw_truv_claim_icd_proc_counts a
set src_row_count = b.src_row_count,
row_count_diff = a.dw_row_count - b.src_row_count,
row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count
--src_clm_count = b.src_clm_count,
--clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
--clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
--src_mbr_count = b.src_mbr_count,
--mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
--mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
from truven_claims_counts b
where a.calendar_year = b.calendar_year
and a.table_id_src = b.table_source
;
    '''

    cursor.execute(query)

In [50]:
query = '''select * from qa_reporting.dw_truv_claim_diag_counts;'''

df = pd.read_sql(query, con=connection)



In [51]:
df[df['data_source'] == 'trum'].sort_values(['table_id_src', 'calendar_year'])

Unnamed: 0,data_source,table_id_src,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
46,trum,mdcro,2011,106352416,234255047,-127902631,54.599733,106352416,106353876,106577308,-224892,0.211013,4743002,4743002,4750926,-7924,0.166789,2023-07-31
47,trum,mdcro,2012,99179814,229768218,-130588404,56.834842,99179814,99189188,99450749,-270935,0.272431,4383978,4383978,4391031,-7053,0.160623,2023-07-31
45,trum,mdcro,2013,87917218,208095580,-120178362,57.751521,87917218,87922903,88058549,-141331,0.160497,3805615,3805615,3812531,-6916,0.181402,2023-07-31
44,trum,mdcro,2014,79384198,193877526,-114493328,59.054461,79384198,79390350,79544615,-160417,0.201669,3505316,3505316,3514364,-9048,0.257458,2023-07-31
39,trum,mdcro,2015,49635408,121473554,-71838146,59.138918,49635408,49652846,49736837,-101429,0.203931,2032094,2032094,2034639,-2545,0.125084,2023-07-31
41,trum,mdcro,2016,48760927,118982284,-70221357,59.01833,48760927,48775710,48879522,-118595,0.242627,1969257,1969257,1972505,-3248,0.164664,2023-07-31
42,trum,mdcro,2017,34021654,81837912,-47816258,58.428003,34021654,34020344,34120595,-98941,0.289974,1362310,1362310,1364765,-2455,0.179884,2023-07-31
30,trum,mdcro,2018,26045208,59193949,-33148741,56.000219,26045208,26044357,26105522,-60314,0.231039,1056137,1056137,1057428,-1291,0.122089,2023-07-31
33,trum,mdcro,2019,42159867,99576131,-57416264,57.66067,42159867,42173010,42280035,-120168,0.284219,1541950,1541950,1543372,-1422,0.092136,2023-07-31
34,trum,mdcro,2020,39721391,94012150,-54290759,57.748662,39721391,39722849,39967420,-246029,0.615574,1595435,1595435,1597557,-2122,0.132828,2023-07-31


In [52]:
df[df['data_source'] == 'truc'].sort_values(['table_id_src', 'calendar_year'])

Unnamed: 0,data_source,table_id_src,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
22,truc,ccaeo,2011,464767401,1077661934,-612894533,56.872616,464767401,464815367,465722367,-954966,0.20505,42474851,42474851,42511345,-36494,0.085845,2023-08-01
23,truc,ccaeo,2012,474699272,1121961205,-647261933,57.690224,474699272,474746519,475937265,-1237993,0.260117,43260503,43260503,43288123,-27620,0.063805,2023-08-01
19,truc,ccaeo,2013,373488846,875696525,-502207679,57.349511,373488846,373506774,374452293,-963447,0.257295,33633095,33633095,33659811,-26716,0.079371,2023-08-01
18,truc,ccaeo,2014,398804169,952082337,-553278168,58.112429,398804169,398822098,399893229,-1089060,0.272338,36263935,36263935,36298645,-34710,0.095623,2023-08-01
1,truc,ccaeo,2015,253677591,599918568,-346240977,57.714663,253677591,253814030,254905571,-1227980,0.481739,22276330,22276330,22319003,-42673,0.191196,2023-08-01
5,truc,ccaeo,2016,262709179,623641018,-360931839,57.874936,262709179,262821832,264225295,-1516116,0.573797,22790094,22790094,22831148,-41054,0.179816,2023-08-01
6,truc,ccaeo,2017,243462972,573221660,-329758688,57.527255,243462972,243808796,245443871,-1980899,0.807068,20906586,20906586,20936299,-29713,0.141921,2023-08-01
15,truc,ccaeo,2018,250013609,589022269,-339008660,57.554473,250013609,250028321,251365523,-1351914,0.537828,21516353,21516353,21529034,-12681,0.058902,2023-08-01
20,truc,ccaeo,2019,241349056,566058850,-324709794,57.363257,241349056,241373768,242648619,-1299563,0.535574,20268917,20268917,20280269,-11352,0.055976,2023-08-01
13,truc,ccaeo,2020,216430086,485549961,-269119875,55.425784,216430086,216451167,217834256,-1404170,0.644605,18676629,18676629,18687285,-10656,0.057023,2023-08-01


## Diagnosis Codes

Here we will check if we have valid ICD Diagnosis codes using our reference tables.

In [4]:
query = '''drop table if exists qa_reporting.dw_truv_diag_counts;
select data_source, year, diag_cd, count(*) as diag_count
into qa_reporting.dw_truv_diag_counts
from dw_staging.trum_claim_diag
group by 1,2,3;

insert into qa_reporting.dw_truv_diag_counts
select data_source, year, diag_cd, count(*) as diag_count
from dw_staging.truc_claim_diag
group by 1,2,3;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [5]:
diag_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_diag_counts;', con=connection)
diag_cd_df



Unnamed: 0,data_source,year,diag_cd,diag_count
0,trum,2017,S76891D,6
1,trum,2020,S139XXA,867
2,trum,2016,S0590XD,16
3,trum,2011,52189,28
4,trum,2022,M80859A,10
...,...,...,...,...
935422,truc,2018,S66922D,459
935423,truc,2022,V970XXA,7
935424,truc,2018,T85694D,4
935425,truc,2019,T4995XD,9


We see that we have a lot of claims with diagnosis codes that are not in our reference list. If we take a further look at some of these codes, we see that in our reference list, there is at least one more digit missing from these codes. The missing digits helps specify the diagnosis code.

In [6]:
query = '''
select a.*
from qa_reporting.dw_truv_diag_counts a
left join reference_tables.ref_cms_icd_cm_codes b
on a.diag_cd = cd_value
where b.cd_value is null;
'''

invalid_diag_df = pd.read_sql(query, con=connection)
invalid_diag_df



Unnamed: 0,data_source,year,diag_cd,diag_count
0,truc,2018,S0180X,28
1,truc,2017,S0232X,18
2,trum,2020,T68XX,1
3,truc,2019,W4904X,4
4,truc,2020,T849XX,4
...,...,...,...,...
7724,truc,2022,T3390X,1
7725,truc,2022,W108XX,48
7726,truc,2017,W108XX,97
7727,truc,2018,W108XX,69


Overall, the number of invalid diagnosis codes is neglible compared to the overall number of diagnosis codes in the claim_diag table.

In [7]:
invalid_diag_df.groupby('year')['diag_count'].sum()

year
2014        20
2016        12
2017     75086
2018     68611
2019     83543
2020     52658
2021     65170
2022    109465
Name: diag_count, dtype: int64

In [8]:
diag_comp_df = pd.DataFrame({'overall_diag_count': diag_cd_df.groupby('year')['diag_count'].sum(),
                            'invalid_diag_count': invalid_diag_df.groupby('year')['diag_count'].sum(),
                            'valid_diag_count': diag_cd_df.groupby('year')['diag_count'].sum() - invalid_diag_df.groupby('year')['diag_count'].sum()})
diag_comp_df.loc[diag_comp_df['valid_diag_count'].isna(),'valid_diag_count'] = diag_comp_df.loc[diag_comp_df['valid_diag_count'].isna(),'overall_diag_count']
diag_comp_df['valid_diag_count'] =  diag_comp_df['valid_diag_count'].astype(int)
diag_comp_df['invalid_to_valid_percent'] = 100. * diag_comp_df['invalid_diag_count'] / diag_comp_df['valid_diag_count']
diag_comp_df

Unnamed: 0_level_0,overall_diag_count,invalid_diag_count,valid_diag_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,1000477142,,1000477142,
2012,1042654614,,1042654614,
2013,858067733,,858067733,
2014,975855043,20.0,975855023,2e-06
2015,643804465,,643804465,
2016,688897826,12.0,688897814,2e-06
2017,622575623,75086.0,622500537,0.012062
2018,623117715,68611.0,623049104,0.011012
2019,653269525,83543.0,653185982,0.01279
2020,590330055,52658.0,590277397,0.008921


In [9]:
invalid_diag_df.groupby('year').max()

Unnamed: 0_level_0,data_source,diag_cd,diag_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,truc,7070,6
2016,trum,S335XX,3
2017,trum,Z2839,6593
2018,trum,Y33XX,4258
2019,trum,Y33XXX,4397
2020,trum,Y31XXX,3413
2021,trum,Y32XX,4004
2022,trum,Z91199,32172


## Diagnosis Code Position

In [14]:
query = '''drop table if exists qa_reporting.dw_truv_diag_position;
select data_source, year, diag_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_truv_diag_position
from dw_staging.trum_claim_diag
group by 1,2,3;

insert into qa_reporting.dw_truv_diag_position
select data_source, year, diag_position, count(distinct uth_claim_id) as claim_count
from dw_staging.truc_claim_diag
group by 1,2,3;
'''
with connection.cursor() as cursor:
    cursor.execute(query)


In [15]:
diag_position_df = pd.read_sql('select * from qa_reporting.dw_truv_diag_position;', con=connection)
diag_position_df.sort_values(['year', 'diag_position'])



Unnamed: 0,data_source,year,diag_position,claim_count
9,truc,2011,1,486670426
14,trum,2011,1,116978141
55,truc,2011,2,175586958
59,trum,2011,2,48570015
61,truc,2011,3,86822682
...,...,...,...,...
65,trum,2022,3,16003102
58,trum,2022,4,10583951
83,truc,2022,4,37048932
25,truc,2022,5,2484630


In [16]:
diag_position_df['diag_position'].unique()

array([4, 3, 1, 2, 5], dtype=int64)

Checking if the counts for the diag_position are correct. The higher the diag_position is, the less counts there should be. If we sort the counts of the diag_position by year and assign their order, this value should match with the diag_position value.

In [17]:
diag_position_df['row_rank'] = diag_position_df.sort_values(['data_source', 'year', 'claim_count'], ascending=[True, True, False]).groupby(['data_source', 'year']).cumcount()+1
diag_position_df['position_check'] = diag_position_df['row_rank'] == diag_position_df['diag_position']
diag_position_df[~diag_position_df['position_check']]

Unnamed: 0,data_source,year,diag_position,claim_count,row_rank,position_check


## ICD Version

In [18]:
query = '''drop table if exists qa_reporting.dw_truv_icd_version_count;
select data_source, year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_truv_icd_version_count
from dw_staging.trum_claim_diag
group by 1,2,3;

insert into qa_reporting.dw_truv_icd_version_count
select data_source, year, icd_version, count(distinct uth_claim_id) as claim_count
from dw_staging.truc_claim_diag
group by 1,2,3;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [19]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_truv_icd_version_count;', con=connection)
icd_version_df.sort_values(['data_source', 'year', 'icd_version'])



Unnamed: 0,data_source,year,icd_version,claim_count
51,truc,2011,0,1456
70,truc,2011,9,4184
60,truc,2011,,486715833
54,truc,2012,0,1789
53,truc,2012,9,5283
...,...,...,...,...
50,trum,2021,9,1048
1,trum,2021,,4645
15,trum,2022,0,33717968
67,trum,2022,9,26


In [21]:
icd_version_df[icd_version_df['data_source'] == 'truc'].sort_values(['year', 'icd_version'])

Unnamed: 0,data_source,year,icd_version,claim_count
51,truc,2011,0.0,1456
70,truc,2011,9.0,4184
60,truc,2011,,486715833
54,truc,2012,0.0,1789
53,truc,2012,9.0,5283
36,truc,2012,,496302657
38,truc,2013,0.0,3441
48,truc,2013,9.0,10258
20,truc,2013,,390003954
23,truc,2014,0.0,9867


In [22]:
icd_version_df[icd_version_df['data_source'] == 'trum'].sort_values(['year', 'icd_version'])

Unnamed: 0,data_source,year,icd_version,claim_count
18,trum,2011,0.0,347
7,trum,2011,9.0,970
24,trum,2011,,116980600
71,trum,2012,0.0,379
5,trum,2012,9.0,1107
45,trum,2012,,109005110
11,trum,2013,0.0,837
22,trum,2013,9.0,2865
64,trum,2013,,96481405
49,trum,2014,0.0,2957
