# Data Warehouse Truven QA - Claim Detail

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

In [9]:
year_df = pd.read_sql('select distinct year from dw_staging.trum_claim_detail;', con=connection)
tables = ['ccaes', 'mdcrs', 'mdcro', 'ccaeo', 'ccaef', 'mdcrf']

In [11]:
for table in tables:
    with connection.cursor() as cursor:
        query = f'''
        update qa_reporting.truven_counts a
        set clm_count = count
        from (
            select '{table}' as table_name, year, count(distinct claim_id_derv)
            from truven.{table}
            group by 2
        ) b
        where a.year = b.year
        and a.table_name = b.table_name;'''

        cursor.execute(query)

## Row Count and Claim Count

Similar to the member_enrollment_monthly table, the claim_detail table has the row count should equal the number of unique claims.

For this table, we extract claim data from the s, o, and f tables from the truven schema. 


In [4]:
query = ''' drop table if exists qa_reporting.dw_truv_claim_detail_counts;
create table qa_reporting.dw_truv_claim_detail_counts
(
    data_source bpchar(4),
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    for year in tqdm(year_df['year']):
        query = f'''
        insert into qa_reporting.dw_truv_claim_detail_counts
        (data_source, calendar_year, table_src, dw_row_count, date_generated)
        select data_source, year, table_id_src, count(*), current_date
        from dw_staging.truc_claim_detail
        where year = {year}
        group by 1, 2, 3
        '''
        
        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_clm_id_count = count
        from (
            select data_source, year, table_id_src,  count(distinct uth_claim_id) as count 
            from dw_staging.truc_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_clm_id_count = count
        from (
            select data_source, year, table_id_src,  count(distinct claim_id_src) as count 
            from dw_staging.truc_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_mbr_id_count = count
        from (
            select data_source, year, table_id_src,  count(distinct uth_member_id) as count 
            from dw_staging.truc_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)
        
        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_mbr_id_count = count
        from (
            select data_source, year, table_id_src, count(distinct member_id_src) as count 
            from dw_staging.truc_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

100%|██████████| 12/12 [2:42:49<00:00, 814.13s/it] 


In [6]:
with connection.cursor() as cursor:
    for year in tqdm(year_df['year']):
        query = f'''
        insert into qa_reporting.dw_truv_claim_detail_counts
        (data_source, calendar_year, table_src, dw_row_count, date_generated)
        select data_source, year, table_id_src, count(*), current_date
        from dw_staging.trum_claim_detail
        where year = {year}
        group by 1, 2, 3
        '''
        
        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_clm_id_count = count
        from (
            select data_source, year, table_id_src,  count(distinct uth_claim_id) as count 
            from dw_staging.trum_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_clm_id_count = count
        from (
            select data_source, year, table_id_src,  count(distinct claim_id_src) as count 
            from dw_staging.trum_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_mbr_id_count = count
        from (
            select data_source, year, table_id_src,  count(distinct uth_member_id) as count 
            from dw_staging.trum_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)
        
        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_mbr_id_count = count
        from (
            select data_source, year, table_id_src,   count(distinct member_id_src) as count 
            from dw_staging.trum_claim_detail
        where year = {year}
        group by 1, 2, 3) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

100%|██████████| 12/12 [23:18<00:00, 116.51s/it]


In [12]:
with connection.cursor() as cursor:
    query = '''update qa_reporting.dw_truv_claim_detail_counts a
    set src_row_count = b.row_count,
    row_count_diff = dw_row_count - b.row_count,
    row_count_diff_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
    src_clm_count = clm_count,
    clm_count_diff = dw_uth_clm_id_count - b.clm_count,
    clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
    src_mbr_count = b.pat_count,
    mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
    mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from qa_reporting.truven_counts b
    where a.calendar_year = b.year
    and a.table_src = b.table_name
    ;
    '''

    cursor.execute(query)

In [13]:
query = '''select * from qa_reporting.dw_truv_claim_detail_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values('calendar_year')



Unnamed: 0,data_source,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
35,trum,2011,mdcro,234159725,234255047,-95322,0.040692,106545093,106542800,106577308,-32215,0.030227,4743651,4743651,4750926,-7275,0.153128,2023-10-11
33,trum,2011,mdcrs,32522421,32535953,-13532,0.041591,10833438,10831969,10836750,-3312,0.030563,794174,794174,794571,-397,0.049964,2023-10-11
12,truc,2011,ccaes,81538719,81640170,-101451,0.124266,22335352,22332253,22342714,-7362,0.03295,2347911,2347911,2348969,-1058,0.045041,2023-10-11
14,truc,2011,ccaeo,1076951968,1077661934,-709966,0.06588,465618813,465615400,465722367,-103554,0.022235,42488518,42488518,42511345,-22827,0.053696,2023-10-11
34,trum,2012,mdcro,229684913,229768218,-83305,0.036256,99423240,99421263,99450749,-27509,0.027661,4384491,4384491,4391031,-6540,0.14894,2023-10-11
32,trum,2012,mdcrs,32194159,32200251,-6092,0.018919,10013948,10011917,10014407,-459,0.004583,714025,714025,714226,-201,0.028142,2023-10-11
13,truc,2012,ccaes,82150319,82224989,-74670,0.090812,21970001,21965821,21970624,-623,0.002836,2288983,2288983,2289517,-534,0.023324,2023-10-11
15,truc,2012,ccaeo,1121364740,1121961205,-596465,0.053163,475864303,475860103,475937265,-72962,0.01533,43278501,43278501,43288123,-9622,0.022228,2023-10-11
26,trum,2013,mdcrs,29042250,29045752,-3502,0.012057,8710576,8708695,8711803,-1227,0.014084,599538,599538,599773,-235,0.039181,2023-10-11
2,truc,2013,ccaes,62557723,62632638,-74915,0.11961,16810242,16806096,16811236,-994,0.005913,1720254,1720254,1720897,-643,0.037364,2023-10-11


In [14]:
df.sort_values('row_count_diff_percentage', ascending=False)[['calendar_year', 'table_src', 'row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']]

Unnamed: 0,calendar_year,table_src,row_count_diff_percentage,clm_count_percentage,mbr_count_percentage
16,2022,ccaes,0.41257,0.279059,0.630042
0,2017,ccaes,0.262478,0.059504,0.112192
4,2016,ccaes,0.232982,0.057172,0.106096
18,2015,ccaes,0.185213,0.012971,0.048973
6,2018,ccaes,0.181264,0.001348,0.085264
1,2017,ccaeo,0.158902,0.069847,0.072133
8,2020,ccaes,0.152118,0.019529,0.055727
10,2019,ccaes,0.149153,0.014777,0.046905
5,2016,ccaeo,0.144856,0.065737,0.06881
17,2022,ccaeo,0.141785,0.04107,0.077511


## Place of Service

In [15]:
query = '''drop table if exists qa_reporting.dw_truv_pos;
select data_source, year, place_of_service, count(*)
into qa_reporting.dw_truv_pos
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_pos
select data_source, year, place_of_service, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [16]:
pos_df = pd.read_sql('select * from qa_reporting.dw_truv_pos;', con=connection)
pos_df



Unnamed: 0,data_source,year,place_of_service,count
0,trum,2018,11,26532979
1,trum,2013,53,6619
2,trum,2014,16,13
3,trum,2022,20,530700
4,truc,2022,55,38675
...,...,...,...,...
1078,truc,2013,71,301113
1079,truc,2011,23,30004363
1080,truc,2014,3.,42014
1081,truc,2021,11,259078094


In [17]:
pos_df['place_of_service'].unique()

array(['11', '53', '16', '20', '55', '31', '50', '2.', '12', '62', '19',
       '49', '1', '18', '35', '54', '56', '22', '32', '95', '51', '71',
       '25', '33', '24', '99', '52', None, '60', '1.', '81', '42', '34',
       '41', '4', '4.', '15', '61', '14', '13', '72', '65', '3.', '21',
       '57', '17', '3', '26', '2', '23', '58', '10', '27', '28'],
      dtype=object)

In [18]:
query = '''
select a.* 
from qa_reporting.dw_truv_pos a
left join reference_tables.ref_place_of_service b
on a.place_of_service = b.place_of_treatment_cd
where b.place_of_treatment_cd is null
and a.place_of_service is not null
;
'''
pos_df = pd.read_sql(query, con=connection)
pos_df.sort_values(['year', 'place_of_service'])



Unnamed: 0,data_source,year,place_of_service,count
49,trum,2011,1.,182179
58,truc,2011,1.,235271
78,trum,2011,3.,1137
79,truc,2011,3.,38641
12,truc,2011,4.,287
...,...,...,...,...
23,truc,2022,2,10323231
62,trum,2022,3,1018
64,truc,2022,3,176403
40,truc,2022,4,625


In [19]:
pos_df.groupby(['data_source', 'year']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
data_source,year,Unnamed: 2_level_1
truc,2011,274199
truc,2012,373603
truc,2013,241795
truc,2014,353861
truc,2015,234087
truc,2016,253920
truc,2017,201004
truc,2018,289314
truc,2019,524434
truc,2020,11384134


In [20]:
pd.read_sql('select distinct stdplac from truven.mdcrs;', con=connection)



Unnamed: 0,stdplac
0,
1,52.0
2,12.0
3,1.0
4,49.0
5,19.0
6,99.0
7,23.0
8,21.0
9,51.0


In [21]:
pd.read_sql('select distinct stdplac from truven.mdcro;', con=connection)



Unnamed: 0,stdplac
0,
1,12.0
2,21.0
3,51.0
4,52.0
5,1.0
6,49.0
7,13.0
8,19.0
9,57.0


## Revenue Code

Looking at how many claims have invalid revenue codes.

In [22]:
query = '''drop table if exists qa_reporting.dw_truv_revenue_cd;
select data_source, year, revenue_cd, count(*)
into qa_reporting.dw_truv_revenue_cd
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_pos
select data_source, year, revenue_cd, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [23]:
query = '''
select a.* 
from qa_reporting.dw_truv_revenue_cd a
left join reference_tables.ref_revenue_code b
on a.revenue_cd = b.revenue_cd
where b.revenue_cd is null
and a.revenue_cd is not null
;
'''
invalid_rev_cd_df = pd.read_sql(query, con=connection)
invalid_rev_cd_df.sort_values(['year', 'revenue_cd'])



Unnamed: 0,data_source,year,revenue_cd,count
945,trum,2011,0002,15
340,trum,2011,0003,2
732,trum,2011,0004,2
1124,trum,2011,0005,1
767,trum,2011,0006,1
...,...,...,...,...
560,trum,2022,7386,1
982,trum,2022,7535,1
849,trum,2022,9213,1
1036,trum,2022,9900,1


In [24]:
pd.DataFrame(invalid_rev_cd_df['revenue_cd'].unique())

Unnamed: 0,0
0,0768
1,0486
2,9500
3,0847
4,7110
...,...
539,1020
540,0417
541,6413
542,0411


In [25]:
invalid_rev_cd_sum = invalid_rev_cd_df.groupby('year').sum()
invalid_rev_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2011,66442
2012,69576
2013,68633
2014,66143
2015,44159
2016,40535
2017,25206
2018,14807
2019,26127
2020,13204


In [26]:
rev_cd_df = pd.read_sql('select data_source, year, sum(count) from qa_reporting.dw_truv_revenue_cd group by 1,2;', con=connection)
rev_cd_df['sum'] = rev_cd_df['sum'].astype(int)
rev_cd_df = rev_cd_df.set_index('year')
rev_cd_df



Unnamed: 0_level_0,data_source,sum
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,trum,261879072
2020,trum,106215773
2016,trum,134338072
2019,trum,112172842
2015,trum,138140504
2022,trum,111544450
2011,trum,266682146
2017,trum,92329014
2021,trum,96103135
2013,trum,237029485


We should have a very low percentage of claim lines that have revenue codes with invalid revenue codes. If the percent if > 1%, need to investigate further

In [27]:
100. * invalid_rev_cd_sum['count'] / rev_cd_df['sum']

year
2011    0.024914
2012    0.026568
2013    0.028955
2014    0.030094
2015    0.031967
2016    0.030174
2017    0.027300
2018    0.022376
2019    0.023292
2020    0.012431
2021    0.018054
2022    0.015927
dtype: float64

## Bill Type Code

Looking at how many claims have invalid bill type codes. This occurs when the claim has a 2 character bill type code in the raw data.

In [28]:
query = '''drop table if exists qa_reporting.dw_truv_bill_cd;
select data_source, year, bill, count(*)
into qa_reporting.dw_truv_bill_cd
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_bill_cd
select data_source, year, bill, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [29]:
query = '''
select a.* 
from qa_reporting.dw_truv_bill_cd a
left join reference_tables.ref_bill_type_cd b
on a.bill = b.bill_type_cd
where b.bill_type_cd is null
and a.bill is not null
;
'''
invalid_bill_cd_df = pd.read_sql(query, con=connection)
invalid_bill_cd_df.sort_values(['year', 'bill'])



Unnamed: 0,data_source,year,bill,count
89,truc,2011,11,512346
90,trum,2011,11,4824
220,trum,2011,12,31
225,truc,2011,12,3
333,trum,2011,13,20393
...,...,...,...,...
435,trum,2022,83,71
67,truc,2022,85,997
68,trum,2022,85,2
578,truc,2022,86,1562


In [30]:
pd.DataFrame(invalid_bill_cd_df['bill'].unique())

Unnamed: 0,0
0,45
1,94
2,34
3,51
4,89
...,...
57,19
58,41
59,44
60,86


In [31]:
invalid_bill_cd_sum = invalid_bill_cd_df.groupby('year').sum()
invalid_bill_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2011,3256738
2012,3236522
2013,3809923
2014,4471311
2015,2744882
2016,216205
2017,219078
2018,218225
2019,1515121
2020,1218325


In [32]:
bill_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_truv_bill_cd group by 1', con=connection)
bill_cd_df['sum'] = bill_cd_df['sum'].astype(int)
bill_cd_df = bill_cd_df.set_index('year')
bill_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,693632055
2020,625020365
2021,663760193
2014,1238422890
2019,714794716
2011,1425172833
2012,1465394131
2022,629400768
2017,702719125
2013,1174715607


We should have a very low percentage of claim lines that have bill type codes with invalid bill type codes. If the percent if > 1%, need to investigate further

In [33]:
100. * invalid_bill_cd_sum['count'] / bill_cd_df['sum']

year
2011    0.228515
2012    0.220864
2013    0.324327
2014    0.361049
2015    0.351712
2016    0.027009
2017    0.031176
2018    0.031461
2019    0.211966
2020    0.194926
2021    0.236924
2022    0.248757
dtype: float64

## CPT HCPCS Code

In [34]:
query = '''drop table if exists qa_reporting.dw_truv_cpt_proc_counts;
select data_source, year, cpt_hcpcs_cd, count(*) as proc_count
into qa_reporting.dw_truv_cpt_proc_counts
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_cpt_proc_counts
select data_source, year, cpt_hcpcs_cd, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [35]:
cpt_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_cpt_proc_counts;', con=connection)
cpt_proc_cd_df



Unnamed: 0,data_source,year,cpt_hcpcs_cd,proc_count
0,trum,2020,A6245,20
1,trum,2022,87181,1153
2,trum,2016,S0621,1096
3,trum,2017,36906,554
4,trum,2012,G8998,1
...,...,...,...,...
349207,truc,2014,87511,91355
349208,truc,2016,27479,161
349209,truc,2016,43320,10
349210,truc,2015,81319,4064


Currently it may be hard to determine how many invalid CPT and HCPCS there in the data due to the list of codes we have for as a reference. There are two reference tables that can be used, **reference_tables.mrconso_cpt_hcpcs_hcpt** and **reference_tables.cpt_hcpc**

In [36]:
query = '''
select a.*
from qa_reporting.dw_truv_cpt_proc_counts a
left join (select distinct code from reference_tables.mrconso_cpt_hcpcs_hcpt) b
on a.cpt_hcpcs_cd = b.code
where b.code is null
;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,data_source,year,cpt_hcpcs_cd,proc_count
0,trum,2014,G8530,223
1,truc,2013,72069,84423
2,truc,2019,96151,11860
3,trum,2014,82486,1021
4,truc,2012,72069,102507
...,...,...,...,...
35274,truc,2018,86849,2933
35275,trum,2012,90813,111
35276,trum,2017,D0272,4
35277,truc,2011,90646,5722


In [37]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2011    265335289
2012    275497380
2013    201180509
2014    197600323
2015    120429170
2016    112858495
2017     85079522
2018     68188422
2019     65933239
2020     53026797
2021     52511770
2022     50064450
Name: proc_count, dtype: int64

In [38]:
proc_comp_df = pd.DataFrame({'overall_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,1425172833,265335289,1159837544,22.876936
2012,1465394131,275497380,1189896751,23.153049
2013,1174715607,201180509,973535098,20.664947
2014,1238422890,197600323,1040822567,18.985015
2015,780433995,120429170,660004825,18.246711
2016,800493964,112858495,687635469,16.412547
2017,702719125,85079522,617639603,13.774946
2018,693632055,68188422,625443633,10.902409
2019,714794716,65933239,648861477,10.161374
2020,625020365,53026797,571993568,9.270523


## Discharge Status

Checking if there are invalid discharge status codes.

Ignore lines with a NULL discharge status code. Most lines have no discharge status possibly due to raw data not providing this or other reasons.

In [39]:
query = '''drop table if exists qa_reporting.dw_truv_discharge_counts;
select data_source, year, discharge_status, count(*) as proc_count
into qa_reporting.dw_truv_discharge_counts
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_discharge_counts
select data_source, year, discharge_status, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [40]:
dschrg_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_discharge_counts;', con=connection)
dschrg_cd_df



Unnamed: 0,data_source,year,discharge_status,proc_count
0,trum,2014,51,347989
1,trum,2013,02,1545837
2,trum,2018,04,28958
3,truc,2011,21,234
4,truc,2011,06,7861147
...,...,...,...,...
906,trum,2015,99,277
907,truc,2020,65,90284
908,truc,2016,50,138727
909,truc,2020,07,166079


In [41]:
dschrg_cd_df['discharge_status'].unique()

array(['51', '02', '04', '21', '06', '01', '41', '40', '07', '89', '90',
       '03', '63', None, '88', '70', '98', '69', '42', '91', '95', '30',
       '54', '64', '85', '09', '86', '61', '66', '05', '62', '43', '84',
       '50', '99', '08', '94', '83', '72', '20', '82', '65', '71', '92',
       '93', '87', '81', '60', '53', '80'], dtype=object)

In [42]:
dschrg_cd_df[dschrg_cd_df['discharge_status'].isna()]

Unnamed: 0,data_source,year,discharge_status,proc_count
14,trum,2013,,208654178
26,truc,2014,,954650243
29,trum,2016,,119918298
52,truc,2020,,487623693
66,trum,2017,,82549210
83,trum,2021,,86838085
149,trum,2012,,230965831
210,truc,2021,,536963082
238,truc,2015,,601982048
309,trum,2018,,59793029


In [43]:
dschrg_cd_df = dschrg_cd_df[~dschrg_cd_df['discharge_status'].isna()]
dschrg_cd_df

Unnamed: 0,data_source,year,discharge_status,proc_count
0,trum,2014,51,347989
1,trum,2013,02,1545837
2,trum,2018,04,28958
3,truc,2011,21,234
4,truc,2011,06,7861147
...,...,...,...,...
906,trum,2015,99,277
907,truc,2020,65,90284
908,truc,2016,50,138727
909,truc,2020,07,166079


In [44]:
query = '''
select a.*
from qa_reporting.dw_truv_discharge_counts a
left join reference_tables.ref_discharge_status b
on a.discharge_status = b.discharge_status
where b.discharge_status is null
and a.discharge_status is not null
;
'''

invalid_dschrg_df = pd.read_sql(query, con=connection)
invalid_dschrg_df



Unnamed: 0,data_source,year,discharge_status,proc_count
0,trum,2020,54,330
1,truc,2016,54,58
2,trum,2022,54,18
3,trum,2020,99,263
4,truc,2015,99,4313
5,trum,2014,99,71
6,truc,2017,98,261
7,trum,2016,99,612
8,trum,2018,54,184
9,truc,2016,98,238


In [45]:
dschrg_comp_df = pd.DataFrame({'overall_proc_count': dschrg_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_dschrg_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': dschrg_cd_df.groupby('year')['proc_count'].sum() - invalid_dschrg_df.groupby('year')['proc_count'].sum()})
dschrg_comp_df.loc[dschrg_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = dschrg_comp_df.loc[dschrg_comp_df['valid_proc_count'].isna(),'overall_proc_count']
dschrg_comp_df['valid_proc_count'] =  dschrg_comp_df['valid_proc_count'].astype(int)
dschrg_comp_df['invalid_to_valid_percent'] = 100. * dschrg_comp_df['invalid_proc_count'] / dschrg_comp_df['valid_proc_count']
dschrg_comp_df 

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,109127326,,109127326,
2012,108887838,,108887838,
2013,87417375,385.0,87416990,0.00044
2014,89561039,2114.0,89558925,0.00236
2015,56667138,4726.0,56662412,0.008341
2016,53974578,7954.0,53966624,0.014739
2017,43912118,8939.0,43903179,0.020361
2018,41459120,1875.0,41457245,0.004523
2019,44785770,2631.0,44783139,0.005875
2020,42358572,3986.0,42354586,0.009411


## DRG CD

Checking DRG values in this column. There is no reference table to match the DRG values used in Truven so cannot validate DRG codes in data.

In [46]:
query = '''drop table if exists qa_reporting.dw_truv_drg_counts;
select data_source, year, drg_cd, count(*) as drg_count
into qa_reporting.dw_truv_drg_counts
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_drg_counts
select data_source, year, drg_cd, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [47]:
drg_df = pd.read_sql('select * from qa_reporting.dw_truv_drg_counts;', con=connection)
drg_df



Unnamed: 0,data_source,year,drg_cd,drg_count
0,trum,2019,246,103870
1,trum,2018,016,3754
2,trum,2017,134,858
3,trum,2012,715,3438
4,trum,2017,639,4445
...,...,...,...,...
17862,truc,2021,300,28260
17863,truc,2021,492,35267
17864,truc,2016,691,4430
17865,truc,2018,187,10948


In [48]:
drg_df['drg_cd'].unique()

array(['246', '016', '134', '715', '639', '545', '011', '261', '240',
       '012', '907', '226', '178', '219', '748', '559', '406', '599',
       '485', '052', '596', '218', '094', '624', '419', '385', '551',
       '139', '555', '963', '183', '540', '759', '374', '208', '148',
       '181', '637', '603', '082', '674', '904', '941', '098', '981',
       '408', '594', '356', '331', '846', '307', '328', '165', '691',
       '146', '538', '688', '289', '026', '168', '685', '712', '509',
       '215', '836', '206', '371', '373', '158', '294', '130', '338',
       '573', '268', '415', '102', '777', '667', '543', '987', '840',
       '417', '088', '196', '617', '570', '880', '455', '922', '242',
       '620', '837', '071', '821', '418', '330', '332', '337', '235',
       '499', '949', '375', '100', '489', '167', '717', '737', '670',
       '867', '164', '827', '756', '442', '622', '835', '728', '251',
       '697', '826', '117', '407', '258', '510', '507', '083', '203',
       '729', '592',

In [49]:
drg_df[drg_df['drg_cd'].isna()]

Unnamed: 0,data_source,year,drg_cd,drg_count
621,trum,2014,,193748890
1026,trum,2016,,118914745
1662,truc,2014,,951444085
1952,truc,2011,,1076951968
2073,truc,2022,,490666698
2522,trum,2019,,99558758
3214,trum,2015,,121431610
4318,truc,2012,,1121364740
5303,truc,2013,,875128399
5663,truc,2018,,588543958


In [50]:
drg_df

Unnamed: 0,data_source,year,drg_cd,drg_count
0,trum,2019,246,103870
1,trum,2018,016,3754
2,trum,2017,134,858
3,trum,2012,715,3438
4,trum,2017,639,4445
...,...,...,...,...
17862,truc,2021,300,28260
17863,truc,2021,492,35267
17864,truc,2016,691,4430
17865,truc,2018,187,10948


In [51]:
comp = pd.merge(left=drg_df, right=df.loc[:, ['data_source', 'calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']].groupby(['data_source', 'calendar_year']).sum(),
                left_on=['year', 'data_source'], right_on=['calendar_year', 'data_source'], how='outer')
comp = comp[['data_source', 'year', 'drg_cd', 'drg_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('drg_count')

Unnamed: 0,data_source,year,drg_cd,drg_count,dw_row_count,dw_uth_clm_id_count
17249,truc,2015,295,1,642293491,266301958
2838,trum,2012,970,1,261879072,109437188
6230,trum,2014,998,2,219788855,87465269
2932,trum,2020,768,2,106215773,44287295
14615,truc,2017,218,2,610390111,255843536
...,...,...,...,...,...,...
12134,truc,2016,,622737637,666155892,275643259
15831,truc,2013,,875128399,937686122,391186544
12636,truc,2014,,951444085,1018634035,417694948
8863,truc,2011,,1076951968,1158490687,487954165


Many claims do not have DRG codes meaning that there will be a low percentage of drg to claim ratios. 

In [52]:
drg_sum_df = comp[~comp['drg_cd'].isna()].groupby(['data_source', 'year', 'dw_uth_clm_id_count'])['drg_count'].sum()
drg_sum_df = drg_sum_df.reset_index()
drg_sum_df['type_to_id'] = 1. * (drg_sum_df['drg_count'] / drg_sum_df['dw_uth_clm_id_count'])
drg_sum_df

Unnamed: 0,data_source,year,dw_uth_clm_id_count,drg_count,type_to_id
0,truc,2011,487954165,81538719,0.167103
1,truc,2012,497834304,82150319,0.165015
2,truc,2013,391186544,62557723,0.159918
3,truc,2014,417694948,67189950,0.160859
4,truc,2015,266301958,43136781,0.161984
5,truc,2016,275643259,43418255,0.157516
6,truc,2017,255843536,38079312,0.148838
7,truc,2018,262330769,38913462,0.148337
8,truc,2019,252953461,36992621,0.146243
9,truc,2020,227363880,33708705,0.148259


## Provider Type

In [53]:
query = '''drop table if exists qa_reporting.dw_truv_provider_type_counts;
select data_source, year, provider_type, count(*) as type_count
into qa_reporting.dw_truv_provider_type_counts
from dw_staging.trum_claim_detail
group by 1,2,3
;

insert into qa_reporting.dw_truv_provider_type_counts
select data_source, year, provider_type, count(*)
from dw_staging.truc_claim_detail
group by 1,2,3
;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [54]:
pt_df = pd.read_sql('select * from qa_reporting.dw_truv_provider_type_counts;', con=connection)
pt_df



Unnamed: 0,data_source,year,provider_type,type_count
0,trum,2022,930,9728279
1,trum,2020,822,669110
2,trum,2014,225,163493
3,trum,2019,575,3722
4,trum,2019,820,3077
...,...,...,...,...
3169,trum,2020,560,435
3170,truc,2021,820,480435
3171,truc,2014,265,331214
3172,truc,2016,32,230


In [55]:
pt_df[pt_df['provider_type'] == '1']

Unnamed: 0,data_source,year,provider_type,type_count
164,truc,2022,1,102728341
188,truc,2016,1,149008992
296,truc,2021,1,113575975
346,trum,2022,1,28262352
364,trum,2020,1,25221099
462,truc,2018,1,129077741
603,truc,2012,1,234530670
666,trum,2018,1,17472924
730,truc,2017,1,131343958
858,truc,2019,1,126406246


All the provider type values in DW matches with the values Truven uses. There are several rows with no provider type.

In [56]:
query = '''
select a.*
from qa_reporting.dw_truv_provider_type_counts a
left join reference_tables.truven_prov_specialty_cds b
on a.provider_type = b.specialty_cd
where b.specialty_cd is null
;
'''

invalid_pt_df = pd.read_sql(query, con=connection)
invalid_pt_df



Unnamed: 0,data_source,year,provider_type,type_count
0,truc,2022,,11542188
1,trum,2013,,5880792
2,trum,2018,,2705661
3,truc,2013,,17509566
4,trum,2019,,2369244
5,trum,2020,,1901654
6,trum,2012,,12344056
7,trum,2017,,1108654
8,truc,2014,,15728059
9,truc,2019,,13700338


In [57]:
comp = pd.merge(left=pt_df, right=df.loc[:, ['data_source', 'calendar_year', 'dw_row_count', 'dw_uth_clm_id_count']].groupby(['data_source', 'calendar_year']).sum(),
                left_on=['year', 'data_source'], right_on=['calendar_year', 'data_source'], how='outer')
comp = comp[['data_source', 'year', 'provider_type', 'type_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('type_count')

Unnamed: 0,data_source,year,provider_type,type_count,dw_row_count,dw_uth_clm_id_count
189,trum,2020,870,1,106215773,44287295
1321,trum,2013,41,1,237029485,96731508
1691,trum,2016,145,1,134338072,53675228
21,trum,2022,32,1,111544450,43134537
366,trum,2014,41,2,219788855,87465269
...,...,...,...,...,...,...
2383,truc,2016,1,149008992,666155892,275643259
616,truc,2013,1,201064925,937686122,391186544
2616,truc,2011,1,221276679,1158490687,487954165
2958,truc,2014,1,230622362,1018634035,417694948


Provider type is on the claim line level. Ideally we should have a provider type for almost all the claims.

In [58]:
pt_sum_df = comp[~comp['provider_type'].isna()].groupby(['data_source', 'year', 'dw_row_count'])['type_count'].sum()
pt_sum_df = pt_sum_df.reset_index()
pt_sum_df['type_to_id'] = 1. * (pt_sum_df['type_count'] / pt_sum_df['dw_row_count'])
pt_sum_df

Unnamed: 0,data_source,year,dw_row_count,type_count,type_to_id
0,truc,2011,1158490687,1085553958,0.937042
1,truc,2012,1203515059,1129871202,0.938809
2,truc,2013,937686122,920176556,0.981327
3,truc,2014,1018634035,1002905976,0.98456
4,truc,2015,642293491,628117434,0.977929
5,truc,2016,666155892,651388584,0.977832
6,truc,2017,610390111,595380178,0.975409
7,truc,2018,627457420,609974436,0.972137
8,truc,2019,602621874,588921536,0.977265
9,truc,2020,518804592,508465339,0.980071


In [59]:
pt_sum_df[pt_sum_df['type_to_id']  > 5]

Unnamed: 0,data_source,year,dw_row_count,type_count,type_to_id
