# Data Warehouse Truven QA - Claim Detail

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [119]:
connection = psycopg2.connect(get_dsn())
connection.autocommit = True

In [3]:
year_df = pd.read_sql('select distinct year from dw_staging.claim_header_1_prt_truv;', con=connection)
tables = ['ccaes', 'mdcrs', 'mdcro', 'ccaeo']



## Row Count and Claim Count

Similar to the member_enrollment_monthly table, the claim_header table has the row count should equal the number of unique claims.

For this table, we extract claim data from the s, o, and f tables from the truven schema. 


In [4]:
query = ''' drop table if exists qa_reporting.dw_truv_claim_detail_counts;
create table qa_reporting.dw_truv_claim_detail_counts
(
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    for year in tqdm(year_df['year']):
        query = f'''
        insert into qa_reporting.dw_truv_claim_detail_counts
        (calendar_year, table_src, dw_row_count, date_generated)
        select year, table_id_src, count(*), current_date
        from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1, 2
        '''
        
        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_clm_id_count = count
        from (
            select year, table_id_src,  count(distinct uth_claim_id) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1, 2 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_clm_id_count = count
        from (
            select year, table_id_src,  count(distinct claim_id_src) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1, 2 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_mbr_id_count = count
        from (
            select year, table_id_src,  count(distinct uth_member_id) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1, 2 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)
        
        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_mbr_id_count = count
        from (
            select year, table_id_src,   count(distinct member_id_src) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1, 2 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        insert into qa_reporting.dw_truv_claim_detail_counts
        (calendar_year, table_src, dw_row_count, date_generated)
        select year, 'ALL' table_id_src, count(*), current_date
        from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1
        '''
        
        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_clm_id_count = count
        from (
            select year, 'ALL' table_id_src,  count(distinct uth_claim_id) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_clm_id_count = count
        from (
            select year, 'ALL' table_id_src,  count(distinct claim_id_src) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_uth_mbr_id_count = count
        from (
            select year, 'ALL' table_id_src,  count(distinct uth_member_id) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)
        
        query = f'''
        update qa_reporting.dw_truv_claim_detail_counts b
        set dw_src_mbr_id_count = count
        from (
            select year, 'ALL' table_id_src,   count(distinct member_id_src) as count 
            from dw_staging.claim_detail_1_prt_truv
        where year = {year}
        group by 1 ) a
        where a.year = b.calendar_year
        and a.table_id_src = b.table_src
        '''

        cursor.execute(query)

100%|██████████| 12/12 [4:31:43<00:00, 1358.66s/it] 


In [8]:
with connection.cursor() as cursor:
    query = '''update qa_reporting.dw_truv_claim_detail_counts a
    set src_row_count = b.row_count,
    row_count_diff = dw_row_count - b.row_count,
    row_count_diff_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
    src_clm_count = clm_count,
    clm_count_diff = dw_uth_clm_id_count - b.clm_count,
    clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
    src_mbr_count = b.pat_count,
    mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
    mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from qa_reporting.truven_counts b
    where a.calendar_year = b.year
    and a.table_src = b.table_name
    ;
    '''

    cursor.execute(query)

In [10]:
# Same updates as before, but with the combine counts of all the raw tables used
# Using the counts found during the claim header qa to avoid having to redo the counts.
with connection.cursor() as cursor:
    query = '''update qa_reporting.dw_truv_claim_detail_counts a
    set src_clm_count = b.src_clm_count,
    clm_count_diff = a.dw_uth_clm_id_count - b.src_clm_count,
    clm_count_percentage = 100. * abs(a.dw_uth_clm_id_count - b.src_clm_count) / b.src_clm_count,
    src_mbr_count = b.src_mbr_count,
    mbr_count_diff = a.dw_uth_mbr_id_count - b.src_mbr_count,
    mbr_count_percentage = 100. * abs(a.dw_uth_mbr_id_count - b.src_mbr_count) / b.src_mbr_count
    from qa_reporting.dw_truv_claim_header_counts b
    where a.calendar_year = b.calendar_year
    and a.table_src = b.table_src
    and a.table_src = 'ALL'
    ;
    '''

    cursor.execute(query)

In [25]:
with connection.cursor() as cursor:
    query = '''update qa_reporting.dw_truv_claim_detail_counts a
    set src_row_count = b.row_count,
    row_count_diff = a.dw_row_count - b.row_count,
    row_count_diff_percentage = 100. * abs(a.dw_row_count - b.row_count) / b.row_count
    from (select year, sum(row_count) as row_count from qa_reporting.truven_counts where table_name in ('ccaeo', 'ccaes', 'mdcrs', 'mdcro') group by 1) b
    where a.calendar_year = b.year
    and a.table_src = 'ALL'
    ;
    '''

    cursor.execute(query)

In [95]:
query = '''select * from qa_reporting.dw_truv_claim_detail_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values('calendar_year')



Unnamed: 0,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
28,2011,ccaes,81577144,81640170,-63026,0.0772,22345904,22342714,22342714,3190,0.014278,2348969,2348969,2348969,0,0.0,2023-05-04
26,2011,mdcrs,32537662,32535953,1709,0.005253,10838220,10836750,10836750,1470,0.013565,794571,794571,794571,0,0.0,2023-05-04
25,2011,ccaeo,1077221870,1077661934,-440064,0.040835,465727873,465722367,465722367,5506,0.001182,42511345,42511345,42511345,0,0.0,2023-05-04
33,2011,ALL,1425578087,1426093104,-515017,0.036114,604833166,604820682,604820682,12484,0.002064,47089265,47089265,47089265,0,0.0,2023-05-04
27,2011,mdcro,234241411,234255047,-13636,0.005821,106579628,106577308,106577308,2320,0.002177,4750926,4750926,4750926,0,0.0,2023-05-04
30,2012,mdcro,229757995,229768218,-10223,0.004449,99453355,99450749,99450749,2606,0.00262,4391031,4391031,4391031,0,0.0,2023-05-04
31,2012,ccaes,82169453,82224989,-55536,0.067542,21974887,21970624,21970624,4263,0.019403,2289517,2289517,2289517,0,0.0,2023-05-04
32,2012,mdcrs,32202828,32200251,2577,0.008003,10016463,10014407,10014407,2056,0.02053,714226,714226,714226,0,0.0,2023-05-04
29,2012,ccaeo,1121563671,1121961205,-397534,0.035432,475942777,475937265,475937265,5512,0.001158,43288123,43288123,43288123,0,0.0,2023-05-04
34,2012,ALL,1465693947,1466154663,-460716,0.031423,606769408,606755083,606755083,14325,0.002361,47495022,47495022,47495022,0,0.0,2023-05-04


In [27]:
df.sort_values('row_count_diff_percentage', ascending=False)[['calendar_year', 'table_src', 'row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']]

Unnamed: 0,calendar_year,table_src,row_count_diff_percentage,clm_count_percentage,mbr_count_percentage
37,2022,ccaes,0.680754,0.559469,1.254483
43,2017,ccaes,0.158891,0.042288,0.0
2,2018,ccaes,0.147434,0.041069,0.0
57,2015,ccaes,0.14409,0.027951,0.0
52,2016,ccaes,0.140245,0.032617,0.0
39,2022,ALL,0.138899,0.037966,0.062709
5,2020,ccaes,0.131309,0.041451,0.0
36,2022,ccaeo,0.130084,0.025308,0.059356
20,2019,ccaes,0.125586,0.038979,0.000106
12,2021,ccaes,0.100353,0.004238,0.06615


## Place of Service

In [51]:
query = '''drop table if exists qa_reporting.dw_truv_pos;
select year, place_of_service, count(*)
into qa_reporting.dw_truv_pos
from dw_staging.claim_detail_1_prt_truv
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [52]:
pos_df = pd.read_sql('select * from qa_reporting.dw_truv_pos;', con=connection)
pos_df



Unnamed: 0,year,place_of_service,count
0,2017,18,13227
1,2019,55,254926
2,2022,72,130936
3,2020,,2965
4,2018,50,48129
...,...,...,...
535,2021,21,41577812
536,2012,4.,144
537,2019,35,1
538,2011,32,1294874


In [53]:
pos_df['place_of_service'].unique()

array(['18', '55', '72', None, '50', '1', '61', '56', '23', '26', '1.',
       '99', '62', '49', '4', '12', '54', '21', '24', '31', '53', '20',
       '35', '3', '41', '60', '52', '51', '14', '33', '11', '32', '95',
       '15', '81', '3.', '13', '42', '16', '34', '19', '17', '57', '65',
       '25', '22', '2.', '2', '71', '58', '4.', '28', '27', '10'],
      dtype=object)

In [58]:
query = '''
select a.* 
from qa_reporting.dw_truv_pos a
left join reference_tables.ref_place_of_service b
on a.place_of_service = b.place_of_treatment_cd
where b.place_of_treatment_cd is null
;
'''
pos_df = pd.read_sql(query, con=connection)
pos_df.sort_values(['year', 'place_of_service'])



Unnamed: 0,year,place_of_service,count
8,2011,1.0,182194
54,2011,3.0,1137
26,2011,4.0,374
16,2011,,112
10,2012,1.0,194074
48,2012,3.0,673
24,2012,4.0,144
20,2012,,48
6,2013,1.0,168026
52,2013,3.0,2127


In [28]:
pd.read_sql('select distinct stdplac from truven.ccaes;', con=connection)



Unnamed: 0,stdplac
0,54.0
1,
2,99.0
3,28.0
4,52.0
5,1.0
6,13.0
7,12.0
8,23.0
9,19.0


In [29]:
pd.read_sql('select distinct stdplac from truven.ccaeo;', con=connection)



Unnamed: 0,stdplac
0,99.0
1,28.0
2,
3,23.0
4,54.0
5,21.0
6,51.0
7,52.0
8,1.0
9,49.0


## Revenue Code

Looking at how many claims have invalid revenue codes.

In [3]:
query = '''drop table if exists qa_reporting.dw_truv_revenue_cd;
select year, revenue_cd, count(*)
into qa_reporting.dw_truv_revenue_cd
from dw_staging.claim_detail_1_prt_truv
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [34]:
query = '''
select a.* 
from qa_reporting.dw_truv_revenue_cd a
left join reference_tables.ref_revenue_code b
on a.revenue_cd = b.revenue_cd
where b.revenue_cd is null
and a.revenue_cd is not null
;
'''
invalid_rev_cd_df = pd.read_sql(query, con=connection)
invalid_rev_cd_df.sort_values(['year', 'revenue_cd'])



Unnamed: 0,year,revenue_cd,count
1043,2011,0002,541
3745,2011,0003,13
3426,2011,0004,4
239,2011,0005,4
1735,2011,0006,5
...,...,...,...
3940,2022,9212,1
1185,2022,9213,2
3466,2022,9395,1
579,2022,9900,1


In [35]:
pd.DataFrame(invalid_rev_cd_df['revenue_cd'].unique())

Unnamed: 0,0
0,0357
1,0838
2,3152
3,7075
4,0594
...,...
1257,1070
1258,0205
1259,0967
1260,2304


In [36]:
invalid_rev_cd_sum = invalid_rev_cd_df.groupby('year').sum()
invalid_rev_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2011,251530
2012,258077
2013,435977
2014,199407
2015,132617
2016,120492
2017,97868
2018,108798
2019,95565
2020,50247


In [43]:
rev_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_truv_revenue_cd group by 1', con=connection)
rev_cd_df['sum'] = rev_cd_df['sum'].astype(int)
rev_cd_df = rev_cd_df.set_index('year')
rev_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,693694537
2021,663777531
2020,625052238
2014,1238867926
2019,714848531
2022,306090998
2017,703262558
2013,1175053414
2016,801046374
2011,1425578087


We should have a very low percentage of claim lines that have revenue codes with invalid revenue codes. If the percent if > 1%, need to investigate further

In [44]:
100. * invalid_rev_cd_sum['count'] / rev_cd_df['sum']

year
2011    0.017644
2012    0.017608
2013    0.037103
2014    0.016096
2015    0.016986
2016    0.015042
2017    0.013916
2018    0.015684
2019    0.013369
2020    0.008039
2021    0.008870
2022    0.008945
dtype: float64

## Bill Type Code

Looking at how many claims have invalid bill type codes. This occurs when the claim has a 2 character bill type code in the raw data.

In [45]:
query = '''drop table if exists qa_reporting.dw_truv_revenue_cd;
select year, bill_type_inst || bill_type_class || bill_type_freq as bill_cd, count(*)
into qa_reporting.dw_truv_bill_cd
from dw_staging.claim_detail_1_prt_truv
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [47]:
query = '''
select a.* 
from qa_reporting.dw_truv_bill_cd a
left join reference_tables.ref_bill_type_cd b
on a.bill_cd = b.bill_type_cd
where b.bill_type_cd is null
and a.bill_cd is not null
;
'''
invalid_bill_cd_df = pd.read_sql(query, con=connection)
invalid_bill_cd_df.sort_values(['year', 'bill_cd'])



Unnamed: 0,year,bill_cd,count
266,2011,11,517170
423,2011,12,34
24,2011,13,2483687
388,2011,14,66199
197,2011,15,12
...,...,...,...
125,2022,82,33
30,2022,83,950
187,2022,85,413
378,2022,86,925


In [49]:
pd.DataFrame(invalid_bill_cd_df['bill_cd'].unique())

Unnamed: 0,0
0,22
1,97
2,55
3,13
4,83
...,...
57,14
58,42
59,31
60,35


In [53]:
invalid_bill_cd_sum = invalid_bill_cd_df.groupby('year').sum()
invalid_bill_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2011,3256906
2012,3236549
2013,3809966
2014,4471965
2015,2745300
2016,216253
2017,219106
2018,218216
2019,1515123
2020,1218325


In [54]:
bill_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_truv_bill_cd group by 1', con=connection)
bill_cd_df['sum'] = bill_cd_df['sum'].astype(int)
bill_cd_df = bill_cd_df.set_index('year')
bill_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,693694537
2020,625052238
2021,663777531
2019,714848531
2014,1238867926
2011,1425578087
2012,1465693947
2022,306090998
2013,1175053414
2017,703262558


We should have a very low percentage of claim lines that have bill type codes with invalid bill type codes. If the percent if > 1%, need to investigate further

In [55]:
100. * invalid_bill_cd_sum['count'] / bill_cd_df['sum']

year
2011    0.228462
2012    0.220820
2013    0.324238
2014    0.360972
2015    0.351624
2016    0.026996
2017    0.031156
2018    0.031457
2019    0.211950
2020    0.194916
2021    0.236921
2022    0.266641
dtype: float64

## CPT HCPCS Code

In [57]:
query = '''drop table if exists qa_reporting.dw_truv_cpt_proc_counts;
select year, cpt_hcpcs_cd, count(*) as proc_count
into qa_reporting.dw_truv_cpt_proc_counts
from dw_staging.claim_detail_1_prt_truv
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [58]:
cpt_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_cpt_proc_counts;', con=connection)
cpt_proc_cd_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2016,L3219,250
1,2015,L0976,106
2,2016,31632,1383
3,2021,S5012,84
4,2020,90956,6
...,...,...,...
192352,2016,33501,1
192353,2015,21452,6
192354,2022,J7042,6895
192355,2011,J0780,60355


Currently it may be hard to determine how many invalid CPT and HCPCS there in the data due to the list of codes we have for as a reference. There are two reference tables that can be used, **reference_tables.mrconso_cpt_hcpcs_hcpt** and **reference_tables.cpt_hcpc**

In [74]:
query = '''
select a.*
from qa_reporting.dw_truv_cpt_proc_counts a
left join (select distinct code from reference_tables.mrconso_cpt_hcpcs_hcpt) b
on a.cpt_hcpcs_cd = b.code
where b.code is null
;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2021,J0248,27
1,2014,83904,15
2,2020,47610,75
3,2021,95857,8
4,2017,93540,1
...,...,...,...
23554,2015,99148,1838
23555,2013,Q9947,2
23556,2013,A6422,1
23557,2014,G0345,1


In [75]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2011    265417927
2012    275550990
2013    201237534
2014    197674000
2015    120478064
2016    112930837
2017     85137888
2018     68200573
2019     65943183
2020     53034359
2021     52513524
2022     23675765
Name: proc_count, dtype: int64

In [76]:
proc_comp_df = pd.DataFrame({'overall_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,1425578087,265417927,1160160160,22.877697
2012,1465693947,275550990,1190142957,23.152764
2013,1175053414,201237534,973815880,20.664844
2014,1238867926,197674000,1041193926,18.98532
2015,780749527,120478064,660271463,18.246747
2016,801046374,112930837,688115537,16.41161
2017,703262558,85137888,618124670,13.773579
2018,693694537,68200573,625493964,10.903474
2019,714848531,65943183,648905348,10.162219
2020,625052238,53034359,572017879,9.271451


## Discharge Status

Checking if there are invalid discharge status codes.

Ignore lines with a NULL discharge status code. Most lines have no discharge status possibly due to raw data not providing this or other reasons.

In [77]:
query = '''drop table if exists qa_reporting.dw_truv_discharge_counts;
select year, discharge_status, count(*) as proc_count
into qa_reporting.dw_truv_discharge_counts
from dw_staging.claim_detail_1_prt_truv
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [78]:
dschrg_cd_df = pd.read_sql('select * from qa_reporting.dw_truv_discharge_counts;', con=connection)
dschrg_cd_df



Unnamed: 0,year,discharge_status,proc_count
0,2021,63,506154
1,2013,72,8
2,2017,69,3959
3,2015,63,599422
4,2017,51,287035
...,...,...,...
467,2016,84,45
468,2013,43,7744
469,2013,92,122
470,2021,64,10863


In [87]:
dschrg_cd_df['discharge_status'].unique()

array(['63', '72', '69', '51', '66', '84', '93', '91', '92', '89', '88',
       '85', '03', '94', None, '95', '98', '71', '99', '70', '81', '08',
       '54', '62', '90', '64', '40', '01', '02', '06', '43', '50', '82',
       '83', '04', '20', '05', '30', '60', '09', '42', '61', '65', '86',
       '53', '07', '21', '87', '41', '80'], dtype=object)

In [92]:
dschrg_cd_df[dschrg_cd_df['discharge_status'].isna()]

Unnamed: 0,year,discharge_status,proc_count
15,2021,,623817267
30,2020,,582685952
188,2013,,1087607225
202,2019,,670053889
229,2011,,1316400952
234,2017,,659309668
390,2016,,747028442
391,2018,,652222018
396,2012,,1356781233
405,2014,,1149270587


In [93]:
dschrg_cd_df = dschrg_cd_df[~dschrg_cd_df['discharge_status'].isna()]
dschrg_cd_df

Unnamed: 0,year,discharge_status,proc_count
0,2021,63,506154
1,2013,72,8
2,2017,69,3959
3,2015,63,599422
4,2017,51,287035
...,...,...,...
467,2016,84,45
468,2013,43,7744
469,2013,92,122
470,2021,64,10863


In [84]:
query = '''
select a.*
from qa_reporting.dw_truv_discharge_counts a
left join reference_tables.ref_discharge_status b
on a.discharge_status = b.discharge_status
where b.discharge_status is null
and a.discharge_status is not null
;
'''

invalid_dschrg_df = pd.read_sql(query, con=connection)
invalid_dschrg_df



Unnamed: 0,year,discharge_status,proc_count
0,2020,54,354
1,2020,99,3071
2,2013,98,44
3,2022,98,662
4,2017,60,198
5,2016,54,196
6,2021,98,217
7,2019,99,2103
8,2018,54,203
9,2022,54,18


In [94]:
dschrg_comp_df = pd.DataFrame({'overall_proc_count': dschrg_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_dschrg_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': dschrg_cd_df.groupby('year')['proc_count'].sum() - invalid_dschrg_df.groupby('year')['proc_count'].sum()})
dschrg_comp_df.loc[dschrg_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = dschrg_comp_df.loc[dschrg_comp_df['valid_proc_count'].isna(),'overall_proc_count']
dschrg_comp_df['valid_proc_count'] =  dschrg_comp_df['valid_proc_count'].astype(int)
dschrg_comp_df['invalid_to_valid_percent'] = 100. * dschrg_comp_df['invalid_proc_count'] / dschrg_comp_df['valid_proc_count']
dschrg_comp_df 

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,109177135,,109177135,
2012,108912714,,108912714,
2013,87446189,385.0,87445804,0.00044
2014,89597339,2114.0,89595225,0.00236
2015,56686690,4726.0,56681964,0.008338
2016,54017932,7956.0,54009976,0.014731
2017,43952890,8939.0,43943951,0.020342
2018,41472519,1875.0,41470644,0.004521
2019,44794642,2631.0,44792011,0.005874
2020,42366286,3986.0,42362300,0.009409


## DRG CD

Checking DRG values in this column. There is no reference table to match the DRG values used in Truven so cannot validate DRG codes in data.

In [102]:
query = '''drop table if exists qa_reporting.dw_truv_drg_counts;
select year, drg_cd, count(*) as drg_count
into qa_reporting.dw_truv_drg_counts
from dw_staging.claim_detail_1_prt_truv
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [103]:
drg_df = pd.read_sql('select * from qa_reporting.dw_truv_drg_counts;', con=connection)
drg_df



Unnamed: 0,year,drg_cd,drg_count
0,2015,544,4883
1,2014,137,15705
2,2022,639,18924
3,2015,002,9927
4,2022,665,1195
...,...,...,...
9089,2021,951,23081
9090,2021,220,103269
9091,2015,904,17522
9092,2020,080,4215


In [104]:
drg_df['drg_cd'].unique()

array(['544', '137', '639', '002', '665', '869', '376', '373', '472',
       '989', '168', '976', '562', '724', '542', '596', '512', '130',
       '984', '264', '654', '689', '382', '812', '554', '787', '741',
       '975', '152', '543', '710', '684', '514', '670', '700', '714',
       '464', '406', '784', '423', '998', '756', '292', '616', '267',
       '816', '103', '662', '903', '100', '723', '466', '881', '053',
       '868', '131', '164', '244', '913', '823', '283', '559', '459',
       '437', '537', '653', '673', '445', '604', '056', '538', '418',
       '492', '177', '029', '467', '730', '964', '157', '520', '329',
       '346', '822', '475', '560', '202', '089', '536', '965', '166',
       '432', '226', '503', '955', '290', '054', '651', '117', '289',
       '102', '114', '887', '541', '629', '304', '594', '134', '391',
       '606', '236', '410', '330', '231', '778', '057', '885', '837',
       '235', '158', '354', '800', '828', '096', '770', '495', '179',
       '672', '351',

In [105]:
drg_df[drg_df['drg_cd'].isna()]

Unnamed: 0,year,drg_cd,drg_count
871,2018,,647771642
2206,2016,,742158396
2442,2014,,1145600277
5197,2013,,1083422641
5374,2020,,579100908
5578,2012,,1351321666
5752,2019,,665230030
6583,2021,,620155209
6739,2015,,720883319
6946,2017,,654587921


In [130]:
comp = pd.merge(left=drg_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'drg_cd', 'drg_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('drg_count')

Unnamed: 0,year,drg_cd,drg_count,dw_row_count,dw_uth_clm_id_count
571,2015,295,17,780749527,321038149
1847,2022,257,22,306090998,132183979
4739,2021,257,23,663777531,288600844
1822,2022,285,26,306090998,132183979
6784,2018,295,29,693694537,290506140
...,...,...,...,...,...
2456,2016,,742158396,801046374,329200336
4209,2013,,1083422641,1175053414,487576370
943,2014,,1145600277,1238867926,504900161
7520,2011,,1311463281,1425578087,604833166


Many claims do not have DRG codes meaning that there will be a low percentage of drg to claim ratios. 

In [133]:
drg_sum_df = comp[~comp['drg_cd'].isna()].groupby(['year', 'dw_uth_clm_id_count'])['drg_count'].sum()
drg_sum_df = drg_sum_df.reset_index()
drg_sum_df['type_to_id'] = 1. * (drg_sum_df['drg_count'] / drg_sum_df['dw_uth_clm_id_count'])
drg_sum_df

Unnamed: 0,year,dw_uth_clm_id_count,drg_count,type_to_id
0,2011,604833166,114114806,0.188672
1,2012,606769408,114372281,0.188494
2,2013,487576370,91630773,0.187931
3,2014,504900161,93267649,0.184725
4,2015,321038149,59866208,0.186477
5,2016,329200336,58887978,0.178882
6,2017,293341147,48674637,0.165932
7,2018,290506140,45922895,0.158079
8,2019,299437662,49618501,0.165706
9,2020,271464403,45951330,0.169272


## Provider Type

In [120]:
query = '''drop table if exists qa_reporting.dw_truv_provider_type_counts;
select year, provider_type, count(*) as type_count
into qa_reporting.dw_truv_provider_type_counts
from dw_staging.claim_detail_1_prt_truv
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [121]:
pt_df = pd.read_sql('select * from qa_reporting.dw_truv_provider_type_counts;', con=connection)
pt_df



Unnamed: 0,year,provider_type,type_count
0,2020,805,2639
1,2022,428,92995
2,2019,295,2415274
3,2016,870,89
4,2017,170,18896378
...,...,...,...
1595,2015,453,27455
1596,2018,845,5263164
1597,2018,460,304
1598,2022,330,2697130


All the provider type values in DW matches with the values Truven uses. There are several rows with no provider type.

In [123]:
query = '''
select a.*
from qa_reporting.dw_truv_provider_type_counts a
left join reference_tables.truven_prov_specialty_cds b
on a.provider_type = b.specialty_cd
where b.specialty_cd is null
;
'''

invalid_pt_df = pd.read_sql(query, con=connection)
invalid_pt_df

Unnamed: 0,year,provider_type,type_count
0,2012,,86002282
1,2011,,84966553
2,2018,,20191387
3,2013,,23395437
4,2019,,16071562
5,2016,,19058817
6,2017,,16137171
7,2022,,6792567
8,2021,,14811473
9,2015,,18670684


In [125]:
comp = pd.merge(left=pt_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'provider_type', 'type_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('type_count')

Unnamed: 0,year,provider_type,type_count,dw_row_count,dw_uth_clm_id_count
1089,2013,460,2,1175053414,487576370
790,2014,460,3,1238867926,504900161
1573,2021,32,11,663777531,288600844
404,2016,910,17,801046374,329200336
171,2022,910,18,306090998,132183979
...,...,...,...,...,...
415,2016,1,187144184,801046374,329200336
1094,2013,1,264301834,1175053414,487576370
978,2011,1,281592282,1425578087,604833166
754,2014,1,291159481,1238867926,504900161


Provider type is on the claim line level. Ideally we should have a provider type for almost all the claims.

In [128]:
pt_sum_df = comp[~comp['provider_type'].isna()].groupby(['year', 'dw_row_count'])['type_count'].sum()
pt_sum_df = pt_sum_df.reset_index()
pt_sum_df['type_to_id'] = 1. * (pt_sum_df['type_count'] / pt_sum_df['dw_row_count'])
pt_sum_df

Unnamed: 0,year,dw_row_count,type_count,type_to_id
0,2011,1425578087,1340611534,0.940399
1,2012,1465693947,1379691665,0.941323
2,2013,1175053414,1151657977,0.98009
3,2014,1238867926,1218307226,0.983404
4,2015,780749527,762078843,0.976086
5,2016,801046374,781987557,0.976208
6,2017,703262558,687125387,0.977054
7,2018,693694537,673503150,0.970893
8,2019,714848531,698776969,0.977518
9,2020,625052238,612810816,0.980415
