# Data Warehouse Medicare Texas QA - Claim Detail

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrt_claim_detail_counts;
create table qa_reporting.dw_mcrt_claim_detail_counts
(
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrt_claim_detail_counts
    (calendar_year, table_src, dw_row_count, date_generated)
    select year, 'ALL' table_id_src, count(*), current_date
    from dw_staging.mcrt_claim_detail
    group by 1
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_detail_counts b
    set dw_uth_clm_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_claim_id) as count 
        from dw_staging.mcrt_claim_detail
    group by 1 ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_detail_counts b
    set dw_src_clm_id_count = clm_count,
        dw_src_mbr_id_count = mbr_count
    from (
        select year, 'ALL' table_id_src,  count(distinct claim_id_src) as clm_count, count(distinct member_id_src) as mbr_count 
        from dw_staging.mcrt_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_detail_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_member_id) as count 
        from dw_staging.mcrt_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

In [None]:
with connection.cursor() as cursor:
    query = '''    
    with clms as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.hha_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.outpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.dme_line_k
        union all
        select extract(year from clm_thru_dt::date)as year, bene_id, clm_id
        from medicare_texas.inpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.bcarrier_line_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.hospice_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.snf_revenue_center_k
    ),
    clm_counts as (
        select year, count(*) row_count, count(distinct bene_id) pat_count, count(distinct clm_id) clm_count
        from clms
        group by 1
    )
    update qa_reporting.dw_mcrt_claim_detail_counts a
    set src_row_count = b.row_count,
        row_count_diff = dw_row_count - b.row_count,
        row_count_diff_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
        src_clm_count = clm_count,
        clm_count_diff = dw_uth_clm_id_count - b.clm_count,
        clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
        src_mbr_count = b.pat_count,
        mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
        mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from clm_counts b
    where a.calendar_year = b.year
    ;
    '''

    cursor.execute(query)

In [None]:
query = '''select * from qa_reporting.dw_mcrt_claim_detail_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values('calendar_year')



Unnamed: 0,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
5,2001,ALL,88,,,,6,6,,,,6,6,,,,2023-10-04
2,2002,ALL,86,,,,7,7,,,,7,7,,,,2023-10-04
4,2003,ALL,3,,,,1,1,,,,1,1,,,,2023-10-04
16,2004,ALL,15,,,,3,3,,,,3,3,,,,2023-10-04
14,2005,ALL,60,,,,6,6,,,,5,5,,,,2023-10-04
9,2006,ALL,24,,,,3,3,,,,3,3,,,,2023-10-04
18,2007,ALL,70,,,,5,5,,,,5,5,,,,2023-10-04
3,2008,ALL,5,,,,2,2,,,,2,2,,,,2023-10-04
7,2009,ALL,60,,,,7,7,,,,7,7,,,,2023-10-04
19,2010,ALL,260,,,,23,23,,,,23,23,,,,2023-10-04


In [7]:
df.sort_values('row_count_diff_percentage', ascending=False)[['calendar_year', 'table_src', 'row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']]

Unnamed: 0,calendar_year,table_src,row_count_diff_percentage,clm_count_percentage,mbr_count_percentage
1,2020,ALL,0.933097,0.193156,0.049814
8,2019,ALL,0.115624,0.021174,0.016306
12,2017,ALL,0.104585,0.024217,0.071252
0,2018,ALL,0.085102,0.036806,0.042233
15,2016,ALL,0.068969,0.014497,0.030696
17,2015,ALL,0.05837,0.013773,0.031413
6,2014,ALL,0.05145,0.040988,0.032249
2,2002,ALL,,,
3,2008,ALL,,,
4,2003,ALL,,,


## Place of Service

In [8]:
query = '''drop table if exists qa_reporting.dw_mcrt_pos;
select year, place_of_service, count(*)
into qa_reporting.dw_mcrt_pos
from dw_staging.mcrt_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [9]:
pos_df = pd.read_sql('select * from qa_reporting.dw_mcrt_pos;', con=connection)
pos_df



Unnamed: 0,year,place_of_service,count
0,2013,1,291965
1,2006,2,3
2,2012,12,9
3,2016,20,505765
4,2017,50,22009
...,...,...,...
429,2016,34,2163
430,2014,56,1101
431,2016,19,1216313
432,2019,42,15512


In [10]:
pos_df['place_of_service'].unique()

array(['1', '2', '12', '20', '50', '4', '53', '17', '71', '55', '65',
       '72', '19', '32', '51', '13', '52', '87', '7', '81', '06', '05',
       '08', '56', '61', '22', '04', '57', '14', '3', '16', '15', '41',
       '25', '24', '11', '21', '60', '02', '07', '00', '42', '95', '99',
       '09', '91', '03', '54', '49', '31', '23', '01', '8', '62', '33',
       '26', '40', '18', '34', '58'], dtype=object)

In [11]:
query = '''
select a.* 
from qa_reporting.dw_mcrt_pos a
left join reference_tables.ref_place_of_service b
on lpad(a.place_of_service, 2, '0') = b.place_of_treatment_cd
where b.place_of_treatment_cd is null
;
'''
pos_df = pd.read_sql(query, con=connection)
pos_df.sort_values(['year', 'place_of_service'])



Unnamed: 0,year,place_of_service,count
1,2014,0,108
3,2015,0,31
4,2016,0,51
0,2017,0,22
2,2018,0,27
6,2019,0,13
5,2020,0,21


## Revenue Code

Looking at how many claims have invalid revenue codes.

In [12]:
query = '''drop table if exists qa_reporting.dw_mcrt_revenue_cd;
select year, revenue_cd, count(*)
into qa_reporting.dw_mcrt_revenue_cd
from dw_staging.mcrt_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [13]:
query = '''
select a.* 
from qa_reporting.dw_mcrt_revenue_cd a
left join reference_tables.ref_revenue_code b
on a.revenue_cd = b.revenue_cd
where b.revenue_cd is null
and a.revenue_cd is not null
;
'''
invalid_rev_cd_df = pd.read_sql(query, con=connection)
invalid_rev_cd_df.sort_values(['year', 'revenue_cd'])



Unnamed: 0,year,revenue_cd,count
29,2013,0004,1
213,2013,0451,5
30,2014,0004,1
188,2014,0006,1
206,2014,0015,1
...,...,...,...
85,2020,0891,26
165,2020,0948,11214
135,2020,3010,1
173,2020,3950,1


In [14]:
pd.DataFrame(invalid_rev_cd_df['revenue_cd'].unique())

Unnamed: 0,0
0,0050
1,0184
2,0445
3,0266
4,0363
...,...
148,0451
149,0036
150,2050
151,9390


In [15]:
invalid_rev_cd_sum = invalid_rev_cd_df.groupby('year').sum()
invalid_rev_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2013,6
2014,19570
2015,21409
2016,26930
2017,26965
2018,28394
2019,28298
2020,11617


In [16]:
rev_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrt_revenue_cd group by 1', con=connection)
rev_cd_df['sum'] = rev_cd_df['sum'].astype(int)
rev_cd_df = rev_cd_df.set_index('year')
rev_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,266343773
2008,5
2003,3
2001,88
2014,273755912
2009,60
2006,24
2020,230157293
2002,86
2019,263484278


We should have a very low percentage of claim lines that have revenue codes with invalid revenue codes. If the percent if > 1%, need to investigate further

In [17]:
100. * invalid_rev_cd_sum['count'] / rev_cd_df['sum']

year
2001         NaN
2002         NaN
2003         NaN
2004         NaN
2005         NaN
2006         NaN
2007         NaN
2008         NaN
2009         NaN
2010         NaN
2011         NaN
2012         NaN
2013    0.000172
2014    0.007149
2015    0.007750
2016    0.009739
2017    0.009801
2018    0.010661
2019    0.010740
2020    0.005047
dtype: float64

## Bill Type Code

Looking at how many claims have invalid bill type codes. This occurs when the claim has a 2 character bill type code in the raw data.

In [18]:
query = '''drop table if exists qa_reporting.dw_mcrt_bill_cd;
select year, bill_type_inst || bill_type_class || bill_type_freq as bill_cd, count(*)
into qa_reporting.dw_mcrt_bill_cd
from dw_staging.mcrt_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [19]:
query = '''
select a.* 
from qa_reporting.dw_mcrt_bill_cd a
left join reference_tables.ref_bill_type_cd b
on a.bill_cd = b.bill_type_cd
where b.bill_type_cd is null
and a.bill_cd is not null
;
'''
invalid_bill_cd_df = pd.read_sql(query, con=connection)
invalid_bill_cd_df.sort_values(['year', 'bill_cd'])



Unnamed: 0,year,bill_cd,count


In [20]:
pd.DataFrame(invalid_bill_cd_df['bill_cd'].unique())

Unnamed: 0,0


In [21]:
invalid_bill_cd_sum = invalid_bill_cd_df.groupby('year').sum()
invalid_bill_cd_sum

In [22]:
bill_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrt_bill_cd group by 1', con=connection)
bill_cd_df['sum'] = bill_cd_df['sum'].astype(int)
bill_cd_df = bill_cd_df.set_index('year')
bill_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,266343773
2020,230157293
2002,86
2008,5
2003,3
2001,88
2014,273755912
2009,60
2019,263484278
2006,24


We should have a very low percentage of claim lines that have bill type codes with invalid bill type codes. If the percent if > 1%, need to investigate further

In [23]:
100. * (invalid_bill_cd_sum['count'] if invalid_bill_cd_sum.shape[0] > 0  else 0) / bill_cd_df['sum']

year
2018    0.0
2020    0.0
2002    0.0
2008    0.0
2003    0.0
2001    0.0
2014    0.0
2009    0.0
2019    0.0
2006    0.0
2011    0.0
2012    0.0
2017    0.0
2013    0.0
2005    0.0
2016    0.0
2004    0.0
2015    0.0
2007    0.0
2010    0.0
Name: sum, dtype: float64

## CPT HCPCS Code

In [24]:
query = '''drop table if exists qa_reporting.dw_mcrt_cpt_proc_counts;
select year, cpt_hcpcs_cd, count(*) as proc_count
into qa_reporting.dw_mcrt_cpt_proc_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [25]:
cpt_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrt_cpt_proc_counts;', con=connection)
cpt_proc_cd_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2015,62291,190
1,2020,KAGB1,408
2,2017,63272,21
3,2018,44361,795
4,2014,14301,2826
...,...,...,...
126792,2019,35638,24
126793,2019,27132,686
126794,2014,L6687,25
126795,2014,RMB12,29


Currently it may be hard to determine how many invalid CPT and HCPCS there in the data due to the list of codes we have for as a reference. There are two reference tables that can be used, **reference_tables.mrconso_cpt_hcpcs_hcpt** and **reference_tables.cpt_hcpc**

In [26]:
query = '''
select a.*
from qa_reporting.dw_mcrt_cpt_proc_counts a
left join (select distinct code from reference_tables.mrconso_cpt_hcpcs_hcpt) b
on a.cpt_hcpcs_cd = b.code
where b.code is null
;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2016,LC134,68
1,2014,3BHK2,3349
2,2014,LC134,66
3,2020,G9666,3317
4,2013,3CHM5,6
...,...,...,...
41880,2019,2AGMS,6
41881,2020,NEEC1,20
41882,2014,G9195,3
41883,2016,4CGK4,27


In [27]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2001          85
2002          84
2003           3
2004          13
2005          58
2006          23
2007          70
2008           4
2009          60
2010         255
2011         348
2012         799
2013     2090005
2014    66646607
2015    67119244
2016    55480691
2017    50185687
2018    43598614
2019    38591747
2020    32919686
Name: proc_count, dtype: int64

In [28]:
proc_comp_df = pd.DataFrame({'overall_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,88,85,3,2833.333
2002,86,84,2,4200.0
2003,3,3,0,inf
2004,15,13,2,650.0
2005,60,58,2,2900.0
2006,24,23,1,2300.0
2007,70,70,0,inf
2008,5,4,1,400.0
2009,60,60,0,inf
2010,260,255,5,5100.0


## Discharge Status

Checking if there are invalid discharge status codes.

Ignore lines with a NULL discharge status code. Most lines have no discharge status possibly due to raw data not providing this or other reasons.

In [39]:
query = '''drop table if exists qa_reporting.dw_mcrt_discharge_counts;
select year, discharge_status, count(*) as count
into qa_reporting.dw_mcrt_discharge_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [40]:
dschrg_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrt_discharge_counts;', con=connection)
dschrg_cd_df



Unnamed: 0,year,discharge_status,count
0,2002,63,33
1,2017,,230213536
2,2013,20,24687
3,2020,01,6515299
4,2015,87,52
...,...,...,...
387,2014,05,36850
388,2007,06,12
389,2013,,246862
390,2018,51,258976


In [41]:
dschrg_cd_df['discharge_status'].unique()

array(['63', None, '20', '01', '87', '30', '51', '71', '50', '62', '04',
       '81', '21', '83', '92', '86', '84', '07', '02', '06', '03', '64',
       '90', '41', '00', '82', '93', '05', '91', '08', '70', '42', '88',
       '40', '89', '72', '94', '69', '43', '85', '66', '95', '61', '65'],
      dtype=object)

In [42]:
dschrg_cd_df[dschrg_cd_df['discharge_status'].isna()]

Unnamed: 0,year,discharge_status,count
1,2017,,230213536
25,2001,,1
26,2010,,4
58,2005,,2
135,2006,,1
153,2014,,227976322
177,2002,,2
208,2008,,1
260,2019,,222549918
267,2015,,230072319


In [43]:
dschrg_cd_df = dschrg_cd_df[~dschrg_cd_df['discharge_status'].isna()]
dschrg_cd_df

Unnamed: 0,year,discharge_status,count
0,2002,63,33
2,2013,20,24687
3,2020,01,6515299
4,2015,87,52
5,2013,30,2365430
...,...,...,...
386,2019,21,2094
387,2014,05,36850
388,2007,06,12
390,2018,51,258976


In [44]:
query = '''
select a.*
from qa_reporting.dw_mcrt_discharge_counts a
left join reference_tables.ref_discharge_status b
on a.discharge_status = b.discharge_status
where b.discharge_status is null
and a.discharge_status is not null
;
'''

invalid_dschrg_df = pd.read_sql(query, con=connection)
invalid_dschrg_df



Unnamed: 0,year,discharge_status,count
0,2015,0,3199
1,2013,0,437
2,2019,0,2659
3,2020,0,206
4,2014,0,3046
5,2016,0,3206
6,2017,0,4428
7,2018,0,2392


In [45]:
dschrg_comp_df = pd.DataFrame({'overall_count': dschrg_cd_df.groupby('year')['count'].sum(),
                            'invalid_count': invalid_dschrg_df.groupby('year')['count'].sum(),
                            'valid_count': dschrg_cd_df.groupby('year')['count'].sum() - invalid_dschrg_df.groupby('year')['count'].sum()})
dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'valid_count'] = dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'overall_count']
dschrg_comp_df['valid_count'] =  dschrg_comp_df['valid_count'].astype(int)
dschrg_comp_df['invalid_to_valid_percent'] = 100. * dschrg_comp_df['invalid_count'] / dschrg_comp_df['valid_count']
dschrg_comp_df 

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,87,,87,
2002,84,,84,
2003,3,,3,
2004,13,,13,
2005,58,,58,
2006,23,,23,
2007,70,,70,
2008,4,,4,
2009,60,,60,
2010,256,,256,


## DRG CD

Checking DRG values in this column. There is no reference table to match the DRG values used in mcrten so cannot validate DRG codes in data.

In [46]:
query = '''drop table if exists qa_reporting.dw_mcrt_drg_counts;
select year, drg_cd, count(*) as drg_count
into qa_reporting.dw_mcrt_drg_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [47]:
drg_df = pd.read_sql('select * from qa_reporting.dw_mcrt_drg_counts;', con=connection)
drg_df



Unnamed: 0,year,drg_cd,drg_count
0,2016,471,8170
1,2013,471,94
2,2016,208,109679
3,2015,544,3321
4,2013,691,20
...,...,...,...
5941,2017,854,33906
5942,2018,987,20620
5943,2017,058,2283
5944,2016,536,36832


In [48]:
drg_df['drg_cd'].unique()

array(['471', '208', '544', '691', '686', '849', '394', '082', '556',
       '132', '542', '195', '326', '034', '295', '739', '259', '333',
       '572', '121', '165', '011', '834', '129', '916', '553', '620',
       '063', '600', '975', '190', '374', '740', '258', '642', '847',
       '596', '026', '845', '432', '745', '016', '264', '501', '168',
       '262', '728', '494', '571', '103', '191', '625', '822', '312',
       '075', '390', '303', '809', '830', '091', '472', '418', '462',
       '869', '655', '823', '671', '485', '770', '629', '414', '766',
       '821', '737', '270', '663', '902', '115', '901', '384', '514',
       '690', '369', '185', '581', '512', '217', '406', '637', '672',
       '037', '868', '505', '386', '619', '957', '695', '467', '187',
       '146', '150', '840', '153', '379', '291', '782', '872', '559',
       '585', '381', '722', '597', '603', '218', '302', '415', '644',
       '066', '866', '687', '604', '354', '820', '409', '052', '674',
       '779', '479',

In [49]:
drg_df[drg_df['drg_cd'].isna()]

Unnamed: 0,year,drg_cd,drg_count
236,2015,,261678020
386,2007,,10
559,2009,,15
634,2012,,477
695,2016,,261845399
964,2017,,260365766
1228,2006,,4
1301,2003,,3
1408,2014,,259133232
1555,2020,,218628824


In [50]:
comp = pd.merge(left=drg_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'drg_cd', 'drg_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('drg_count')

Unnamed: 0,year,drg_cd,drg_count,dw_row_count,dw_uth_clm_id_count
5930,2001,,1,88,6
5897,2011,885,2,349,42
1327,2013,887,2,3479005,253129
5938,2010,083,2,260,23
5937,2010,958,2,260,23
...,...,...,...,...,...
2329,2018,,252292549,266343773,78096488
4571,2014,,259133232,273755912,78590801
5253,2017,,260365766,275137577,80780364
1373,2015,,261678020,276242995,79304975


Many claims do not have DRG codes meaning that there will be a low percentage of drg to claim ratios. 

In [51]:
drg_sum_df = comp[~comp['drg_cd'].isna()].groupby(['year', 'dw_uth_clm_id_count'])['drg_count'].sum()
drg_sum_df = drg_sum_df.reset_index()
drg_sum_df['type_to_id'] = 1. * (drg_sum_df['drg_count'] / drg_sum_df['dw_uth_clm_id_count'])
drg_sum_df

Unnamed: 0,year,dw_uth_clm_id_count,drg_count,type_to_id
0,2001,6,87,14.5
1,2002,7,76,10.857143
2,2004,3,13,4.333333
3,2005,6,49,8.166667
4,2006,3,20,6.666667
5,2007,5,60,12.0
6,2009,7,45,6.428571
7,2010,23,158,6.869565
8,2011,42,179,4.261905
9,2012,125,332,2.656


## Provider Type

In [52]:
query = '''drop table if exists qa_reporting.dw_mcrt_provider_type_counts;
select year, provider_type, count(*) as type_count
into qa_reporting.dw_mcrt_provider_type_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [53]:
pt_df = pd.read_sql('select * from qa_reporting.dw_mcrt_provider_type_counts;', con=connection)
pt_df



Unnamed: 0,year,provider_type,type_count
0,2017,68,671251
1,2016,93,3389314
2,2017,08,11922363
3,2014,98,85913
4,2018,23,54617
...,...,...,...
705,2020,50,5858937
706,2020,85,10743
707,2016,05,1524500
708,2017,25,1411649


Most the provider specialties values in DW matches with the values Medicare uses.

In [55]:
query = '''
select a.*
from qa_reporting.dw_mcrt_provider_type_counts a
left join reference_tables.ref_provider_specialty b
on a.provider_type = b.provider_specialty_cd
where b.provider_specialty_cd is null
;
'''

invalid_pt_df = pd.read_sql(query, con=connection)
invalid_pt_df

Unnamed: 0,year,provider_type,type_count
0,2015,05,2162573
1,2017,05,1499528
2,2020,05,1282231
3,2019,05,1477092
4,2018,05,1490856
...,...,...,...
705,2020,76,2417
706,2016,76,5411
707,2019,76,2993
708,2015,76,8579


In [56]:
comp = pd.merge(left=pt_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'provider_type', 'type_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('type_count')

Unnamed: 0,year,provider_type,type_count,dw_row_count,dw_uth_clm_id_count
198,2014,54,1,273755912,78590801
682,2013,30,1,3479005,253129
683,2013,10,1,3479005,253129
684,2013,92,1,3479005,253129
686,2013,24,1,3479005,253129
...,...,...,...,...,...
345,2018,,134902338,266343773,78096488
279,2014,,135649388,273755912,78590801
653,2015,,137790115,276242995,79304975
12,2017,,139176269,275137577,80780364


Provider type is on the claim line level. Ideally we should have a provider type for almost all the claims.

In [57]:
pt_sum_df = comp[~comp['provider_type'].isna()].groupby(['year', 'dw_row_count'])['type_count'].sum()
pt_sum_df = pt_sum_df.reset_index()
pt_sum_df['type_to_id'] = 1. * (pt_sum_df['type_count'] / pt_sum_df['dw_row_count'])
pt_sum_df

Unnamed: 0,year,dw_row_count,type_count,type_to_id
0,2013,3479005,109,3.1e-05
1,2014,273755912,138106524,0.504488
2,2015,276242995,138452880,0.5012
3,2016,276518957,136536233,0.493768
4,2017,275137577,135961308,0.494158
5,2018,266343773,131441435,0.493503
6,2019,263484278,131165013,0.49781
7,2020,230157293,115089898,0.500049
