# Data Warehouse Truven QA - Claim ICD Proc

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

Similar to the member_enrollment_monthly table, the claim_header table has the row count should equal the number of unique claims.

For this table, we extract claim data from the s, o, and f tables from the truven schema. 


In [5]:
query = ''' drop table if exists qa_reporting.dw_truv_claim_icd_proc_counts;
create table qa_reporting.dw_truv_claim_icd_proc_counts
(
    data_source bpchar(4),
    table_id_src text,
    calendar_year int,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [15]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_truv_claim_icd_proc_counts
    (data_source, calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select data_source, year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select data_source, year, uth_claim_id, uth_member_id
        from dw_staging.trum_claim_icd_proc
        where proc_position = 1
    ) a
    group by 1,2
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select data_source, year,  count(distinct claim_id_src) as count 
        from dw_staging.trum_claim_icd_proc
        group by 1,2
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    ;
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select data_source, year, count(distinct uth_member_id) as count 
        from dw_staging.trum_claim_icd_proc
        group by 1,2
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    ;
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select data_source, year, count(distinct member_id_src) as count 
        from dw_staging.trum_claim_icd_proc
        group by 1,2
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    ;
    '''

    cursor.execute(query)

In [16]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_truv_claim_icd_proc_counts
    (data_source, calendar_year, dw_row_count, dw_uth_clm_id_count, date_generated)
    select data_source, year, count(a.*), count(distinct uth_claim_id), current_date
    from (
        select data_source, year, uth_claim_id, uth_member_id
        from dw_staging.truc_claim_icd_proc
        where proc_position = 1
    ) a
    group by 1,2
    '''
    
    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_src_clm_id_count = count
    from (
        select data_source, year,  count(distinct claim_id_src) as count 
        from dw_staging.truc_claim_icd_proc
        group by 1,2
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    ;
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_uth_mbr_id_count = count
    from (
        select data_source, year, count(distinct uth_member_id) as count 
        from dw_staging.truc_claim_icd_proc
        group by 1,2
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    ;
    '''

    cursor.execute(query)
    
    query = f'''
    update qa_reporting.dw_truv_claim_icd_proc_counts b
    set dw_src_mbr_id_count = count
    from (
        select data_source, year, count(distinct member_id_src) as count 
        from dw_staging.truc_claim_icd_proc
        group by 1,2
    ) a
    where a.year = b.calendar_year
    and a.data_source = b.data_source
    ;
    '''

    cursor.execute(query)

claims that are both in the s table and f tables may have the same procedure code. need to consider this

In [20]:
with connection.cursor() as cursor:
    query = '''
with truven_claims as (
        select 'truc' as table_source, year, enrolid, claim_id_derv, svcdate
        from truven.ccaes
        where pproc is not null
        union
        select 'trum', year, enrolid, claim_id_derv, svcdate
        from truven.mdcrs
        where pproc is not null
        union
        select 'truc', year, enrolid, claim_id_derv, svcdate
        from truven.ccaef
        where proc1 is not null
        or proc2 is not null
        or proc3 is not null
        or proc4 is not null
        or proc5 is not null
        or proc6 is not null
        union
        select 'trum', year, enrolid, claim_id_derv, svcdate
        from truven.mdcrf
        where proc1 is not null
        or proc2 is not null
        or proc3 is not null
        or proc4 is not null
        or proc5 is not null
        or proc6 is not null
    ),
truven_claims_counts as (
        select table_source, year as calendar_year, count(*) as src_row_count, count(distinct enrolid) src_mbr_count, count(distinct claim_id_derv) src_clm_count
        from truven_claims
        group by 1,2
    )
update qa_reporting.dw_truv_claim_icd_proc_counts a
set src_row_count = b.src_row_count,
row_count_diff = a.dw_row_count - b.src_row_count,
row_count_percentage = 100. * abs(a.dw_row_count - b.src_row_count) / b.src_row_count,
src_clm_count = b.src_clm_count,
clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
src_mbr_count = b.src_mbr_count,
mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,   
mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
from truven_claims_counts b
where a.calendar_year = b.calendar_year
and a.data_source = b.table_source
and a.table_id_src is null
;
    '''

    cursor.execute(query)

In [24]:
query = '''select * from qa_reporting.dw_truv_claim_icd_proc_counts;'''

df = pd.read_sql(query, con=connection)
# df.sort_values(['clm_count_percentage'], ascending=False)



In [25]:
df[df['data_source'] == 'truc'].sort_values(['table_id_src', 'calendar_year'])

Unnamed: 0,data_source,table_id_src,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
11,truc,,2011,16938600,16427214,511386,3.113042,13842249,13856813,13856813,-14564,0.105104,3161890,3163117,3163117,-1227,0.038791,2023-08-01
10,truc,,2012,15493783,15110062,383721,2.539506,12734387,12738845,12738845,-4458,0.034995,2616806,2617234,2617234,-428,0.016353,2023-08-01
8,truc,,2013,12447289,12191354,255935,2.099316,10498685,10505683,10505683,-6998,0.066612,2345888,2346343,2346343,-455,0.019392,2023-08-01
9,truc,,2014,12543187,12403430,139757,1.126761,10572836,10647806,10647806,-74970,0.704089,2239190,2239892,2239892,-702,0.031341,2023-08-01
7,truc,,2015,8826563,8758549,68014,0.776544,7402401,7479786,7479786,-77385,1.034588,1413591,1414024,1414024,-433,0.030622,2023-08-01
6,truc,,2016,10800247,10613136,187111,1.763013,8769776,8775166,8775166,-5390,0.061423,1052074,1052877,1052877,-803,0.076267,2023-08-01
5,truc,,2017,9751967,9590565,161402,1.682925,8001915,8007245,8007245,-5330,0.066565,902716,903503,903503,-787,0.087105,2023-08-01
3,truc,,2018,9991589,9838320,153269,1.557878,8189515,8189689,8189689,-174,0.002125,844139,844559,844559,-420,0.04973,2023-08-01
4,truc,,2019,9474557,9330760,143797,1.541107,7762404,7762081,7762081,323,0.004161,803827,804114,804114,-287,0.035691,2023-08-01
2,truc,,2020,8868726,8722230,146496,1.67957,7234432,7234220,7234220,212,0.002931,713490,713788,713788,-298,0.041749,2023-08-01


In [26]:
df[df['data_source'] == 'trum'].sort_values(['table_id_src', 'calendar_year'])

Unnamed: 0,data_source,table_id_src,calendar_year,dw_row_count,src_row_count,row_count_diff,row_count_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
22,trum,,2011,6869139,6792563,76576,1.127351,5153121,5155807,5155807,-2686,0.052097,674041,674400,674400,-359,0.053233,2023-08-01
21,trum,,2012,6389031,6304917,84114,1.334102,4871518,4873037,4873037,-1519,0.031172,641996,642216,642216,-220,0.034256,2023-08-01
23,trum,,2013,5600819,5514188,86631,1.571056,4366754,4370012,4370012,-3258,0.074554,580232,580550,580550,-318,0.054776,2023-08-01
20,trum,,2014,4982278,4928132,54146,1.098712,3890784,3907119,3907119,-16335,0.418083,499101,499352,499352,-251,0.050265,2023-08-01
19,trum,,2015,3607357,3571995,35362,0.989979,2823690,2835039,2835039,-11349,0.400312,329893,329944,329944,-51,0.015457,2023-08-01
18,trum,,2016,4202986,4159888,43098,1.036038,3272808,3272796,3272796,12,0.000367,257008,257095,257095,-87,0.03384,2023-08-01
17,trum,,2017,2954197,2925553,28644,0.979097,2357664,2357313,2357313,351,0.01489,162026,162098,162098,-72,0.044418,2023-08-01
12,trum,,2018,1978993,1962662,16331,0.832084,1608103,1607341,1607341,762,0.047407,98184,98200,98200,-16,0.016293,2023-08-01
15,trum,,2019,3821341,3791337,30004,0.791383,3010748,3010791,3010791,-43,0.001428,168943,168958,168958,-15,0.008878,2023-08-01
14,trum,,2020,3981899,3761230,220669,5.866937,2964682,2964897,2964897,-215,0.007252,159023,159054,159054,-31,0.01949,2023-08-01


## ICD Procedure Codes

In [28]:
query = '''drop table if exists qa_reporting.dw_truv_icd_proc_counts;

select data_source, year, proc_cd, count(*) as proc_count
into qa_reporting.dw_truv_icd_proc_counts
from dw_staging.trum_claim_icd_proc
group by 1,2,3;

insert into qa_reporting.dw_truv_icd_proc_counts
select data_source, year, proc_cd, count(*) as proc_count
from dw_staging.truc_claim_icd_proc
group by 1,2,3;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [29]:
icd_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_icd_proc_counts;', con=connection)
icd_proc_cd_df



Unnamed: 0,data_source,year,proc_cd,proc_count
0,trum,2017,03LH3DZ,15
1,trum,2015,0D5C8ZZ,33
2,trum,2022,0SRB01Z,54
3,trum,2019,0DSP4ZZ,120
4,trum,2011,7851,502
...,...,...,...,...
291630,truc,2018,4A033B3,28
291631,truc,2021,0HST0ZZ,5
291632,truc,2017,B241ZZ4,47
291633,truc,2015,0F790ZZ,1


In [30]:
query = '''
select a.*
from qa_reporting.dw_truv_icd_proc_counts a
left join reference_tables.ref_cms_icd_pcs_codes b
on a.proc_cd = cd_value
where b.cd_value is null;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,data_source,year,proc_cd,proc_count
0,truc,2012,J7511,3
1,truc,2012,70548,1
2,trum,2020,63045,3
3,trum,2020,45391,2
4,truc,2020,88342,10
...,...,...,...,...
16446,truc,2013,92937,1
16447,truc,2012,31505,2
16448,truc,2012,A4385,1
16449,truc,2011,J7609,7


In [32]:
invalid_proc_df.groupby(['data_source', 'year'])['proc_count'].sum()

data_source  year
truc         2011    444830
             2012    254364
             2013     74004
             2014     32851
             2015     32275
             2016      2052
             2017       171
             2018        88
             2019        66
             2020     16815
             2021        88
             2022       189
trum         2011      1235
             2012     12854
             2013      2817
             2014      9815
             2015      6942
             2016        21
             2017         3
             2018         9
             2019         1
             2020    295011
             2021         1
             2022        75
Name: proc_count, dtype: int64

In [34]:
proc_comp_df = pd.DataFrame({'overall_proc_count': icd_proc_cd_df.groupby(['data_source', 'year'])['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby(['data_source', 'year'])['proc_count'].sum(),
                            'valid_proc_count': icd_proc_cd_df.groupby(['data_source', 'year'])['proc_count'].sum() - invalid_proc_df.groupby(['data_source', 'year'])['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,Unnamed: 1_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
data_source,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
truc,2011,19764426,444830,19319596,2.302481
truc,2012,18146219,254364,17891855,1.421675
truc,2013,14713996,74004,14639992,0.505492
truc,2014,15475067,32851,15442216,0.212735
truc,2015,10780067,32275,10747792,0.300294
truc,2016,12245130,2052,12243078,0.01676
truc,2017,11088520,171,11088349,0.001542
truc,2018,11549700,88,11549612,0.000762
truc,2019,10979503,66,10979437,0.000601
truc,2020,10299662,16815,10282847,0.163525


In [36]:
invalid_proc_df.groupby(['data_source', 'year']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,proc_cd,proc_count
data_source,year,Unnamed: 2_level_1,Unnamed: 3_level_1
truc,2011,V5275,22072
truc,2012,V5299,14349
truc,2013,V2744,3325
truc,2014,V2632,16975
truc,2015,S5001,17055
truc,2016,S5001,261
truc,2017,L8699,54
truc,2018,Q0144,18
truc,2019,J7999,7
truc,2020,V5257,1008


## Procedure Code Position

In [39]:
query = '''drop table if exists qa_reporting.dw_truv_proc_position;
select data_source, year, proc_position, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_truv_proc_position
from dw_staging.trum_claim_icd_proc
group by 1,2,3;

insert into qa_reporting.dw_truv_proc_position
select data_source, year, proc_position, count(distinct uth_claim_id) as claim_count
from dw_staging.truc_claim_icd_proc
group by 1,2,3;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [40]:
proc_position_df = pd.read_sql('select * from qa_reporting.dw_truv_proc_position;', con=connection)
proc_position_df.sort_values(['data_source', 'year', 'proc_position'])



Unnamed: 0,data_source,year,proc_position,claim_count
48,truc,2011,1,13842249
95,truc,2011,2,1348799
66,truc,2011,3,687143
132,truc,2011,4,338345
85,truc,2011,5,199769
...,...,...,...,...
123,trum,2022,2,77594
54,trum,2022,3,52385
7,trum,2022,4,34845
52,trum,2022,5,24007


In [42]:
sorted(proc_position_df['proc_position'].unique())

[1, 2, 3, 4, 5, 6]

Checking if the counts for the proc_position are correct. The higher the proc_position is, the less counts there should be. If we sort the counts of the proc_position by year and assign their order, this value should match with the proc_position value.

In [44]:
proc_position_df['row_rank'] = proc_position_df.sort_values(['data_source', 'year', 'claim_count'], ascending=[True, True, False]).groupby(['data_source', 'year']).cumcount()+1
proc_position_df['position_check'] = proc_position_df['row_rank'] == proc_position_df['proc_position']
proc_position_df[~proc_position_df['position_check']]

Unnamed: 0,data_source,year,proc_position,claim_count,row_rank,position_check


## ICD Version

In [45]:
query = '''drop table if exists qa_reporting.dw_truv_icd_proc_version_count;
select data_source, year, icd_version, count(distinct uth_claim_id) as claim_count
into qa_reporting.dw_truv_icd_proc_version_count
from dw_staging.trum_claim_icd_proc
group by 1,2,3;

insert into qa_reporting.dw_truv_icd_proc_version_count
select data_source, year, icd_version, count(distinct uth_claim_id) as claim_count
from dw_staging.truc_claim_icd_proc
group by 1,2,3;
'''
with connection.cursor() as cursor:
    cursor.execute(query)

In [49]:
icd_version_df = pd.read_sql('select * from qa_reporting.dw_truv_icd_proc_version_count;', con=connection)
icd_version_df.sort_values(['data_source', 'year', 'icd_version'])



Unnamed: 0,data_source,year,icd_version,claim_count
24,truc,2011,,13853100
35,truc,2012,,12739100
49,truc,2013,,10506175
30,truc,2014,,10647161
16,truc,2015,0.0,2085214
8,truc,2015,9.0,3664339
41,truc,2015,,2160201
43,truc,2016,0.0,8353131
51,truc,2016,9.0,1243
36,truc,2016,,1052778
