# Data Warehouse Medicare National QA - Claim Detail

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrn_claim_detail_counts;
create table qa_reporting.dw_mcrn_claim_detail_counts
(
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrn_claim_detail_counts
    (calendar_year, table_src, dw_row_count, date_generated)
    select year, 'ALL' table_id_src, count(*), current_date
    from dw_staging.mcrn_claim_detail
    group by 1
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_detail_counts b
    set dw_uth_clm_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_claim_id) as count 
        from dw_staging.mcrn_claim_detail
    group by 1 ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_detail_counts b
    set dw_src_clm_id_count = clm_count,
        dw_src_mbr_id_count = mbr_count
    from (
        select year, 'ALL' table_id_src,  count(distinct claim_id_src) as clm_count, count(distinct member_id_src) as mbr_count 
        from dw_staging.mcrn_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_detail_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_member_id) as count 
        from dw_staging.mcrn_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    query = '''    
    with clms as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.hha_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.outpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.dme_line_k
        union all
        select extract(year from clm_thru_dt::date)as year, bene_id, clm_id
        from medicare_national.inpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.bcarrier_line_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.hospice_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.snf_revenue_center_k
    ),
    clm_counts as (
        select year, count(*) row_count, count(distinct bene_id) pat_count, count(distinct clm_id) clm_count
        from clms
        group by 1
    )
    update qa_reporting.dw_mcrn_claim_detail_counts a
    set src_row_count = b.row_count,
        row_count_diff = dw_row_count - b.row_count,
        row_count_diff_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
        src_clm_count = clm_count,
        clm_count_diff = dw_uth_clm_id_count - b.clm_count,
        clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
        src_mbr_count = b.pat_count,
        mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
        mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from clm_counts b
    where a.calendar_year = b.year
    ;
    '''

    cursor.execute(query)

In [6]:
query = '''select * from qa_reporting.dw_mcrn_claim_detail_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values('calendar_year')



Unnamed: 0,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
19,1997,ALL,19,,,,3,3,,,,1,1,,,,2023-12-18
1,2000,ALL,22,,,,2,2,,,,2,2,,,,2023-12-18
7,2001,ALL,42,,,,4,4,,,,4,4,,,,2023-12-18
3,2002,ALL,89,,,,4,4,,,,4,4,,,,2023-12-18
6,2003,ALL,8,,,,3,3,,,,1,1,,,,2023-12-18
20,2004,ALL,86,,,,10,10,,,,10,10,,,,2023-12-18
17,2005,ALL,1,,,,1,1,,,,1,1,,,,2023-12-18
11,2006,ALL,58,,,,5,5,,,,5,5,,,,2023-12-18
22,2007,ALL,54,,,,4,4,,,,4,4,,,,2023-12-18
4,2008,ALL,62,,,,7,7,,,,7,7,,,,2023-12-18


In [7]:
df.sort_values('row_count_diff_percentage', ascending=False)[['calendar_year', 'table_src', 'row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']]

Unnamed: 0,calendar_year,table_src,row_count_diff_percentage,clm_count_percentage,mbr_count_percentage
5,2021,ALL,0.362851,0.143245,0.024604
2,2020,ALL,0.204545,0.016322,0.036812
10,2019,ALL,0.04754,0.010953,0.025485
8,2014,ALL,0.038215,0.033977,0.034374
0,2018,ALL,0.029162,0.021332,0.038777
15,2017,ALL,0.020307,0.006128,0.019838
21,2015,ALL,0.01312,0.003552,0.027537
18,2016,ALL,0.006836,0.003472,0.022952
1,2000,ALL,,,
3,2002,ALL,,,


## Place of Service

In [8]:
query = '''drop table if exists qa_reporting.dw_mcrn_pos;
select year, place_of_service, count(*)
into qa_reporting.dw_mcrn_pos
from dw_staging.mcrn_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [9]:
pos_df = pd.read_sql('select * from qa_reporting.dw_mcrn_pos;', con=connection)
pos_df



Unnamed: 0,year,place_of_service,count
0,2016,07,52
1,2017,16,1333
2,2019,17,8023
3,2020,54,14999
4,2017,65,193319
...,...,...,...
496,2017,17,4429
497,2014,54,18664
498,1997,1,19
499,2017,11,50651217


In [10]:
pos_df['place_of_service'].unique()

array(['07', '16', '17', '54', '65', '04', '56', '11', '49', '24', '10',
       '7', '61', '60', '09', '2', '41', '14', '1', '20', '42', '00',
       '26', '8', '55', '08', '53', '50', '34', '15', '3', '13', '21',
       '05', '02', '18', '33', '99', '62', '4', '06', '57', '19', '31',
       '58', '51', '81', '01', '12', '22', '29', '23', '03', '32', '72',
       '71', '52', '95', '77', '44', '76', '25', '27', '90', '37', '89',
       '70', '87', '30'], dtype=object)

In [11]:
query = '''
select a.* 
from qa_reporting.dw_mcrn_pos a
left join reference_tables.ref_place_of_service b
on lpad(a.place_of_service, 2, '0') = b.place_of_treatment_cd
where b.place_of_treatment_cd is null
;
'''
pos_df = pd.read_sql(query, con=connection)
pos_df.sort_values(['year', 'place_of_service'])



Unnamed: 0,year,place_of_service,count
4,2014,0,56
5,2015,0,22
6,2016,0,21
1,2017,0,19
3,2018,0,13
7,2019,0,16
2,2020,0,17
0,2021,0,14


## Revenue Code

Looking at how many claims have invalid revenue codes.

In [12]:
query = '''drop table if exists qa_reporting.dw_mcrn_revenue_cd;
select year, revenue_cd, count(*)
into qa_reporting.dw_mcrn_revenue_cd
from dw_staging.mcrn_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [13]:
query = '''
select a.* 
from qa_reporting.dw_mcrn_revenue_cd a
left join reference_tables.ref_revenue_code b
on a.revenue_cd = b.revenue_cd
where b.revenue_cd is null
and a.revenue_cd is not null
;
'''
invalid_rev_cd_df = pd.read_sql(query, con=connection)
invalid_rev_cd_df.sort_values(['year', 'revenue_cd'])



Unnamed: 0,year,revenue_cd,count
237,2010,0815,1
127,2013,0184,1
205,2013,0451,115
12,2013,0909,1
100,2013,1551,1
...,...,...,...
249,2021,0937,1
160,2021,0948,15299
115,2021,2300,1
147,2021,4020,1


In [14]:
pd.DataFrame(invalid_rev_cd_df['revenue_cd'].unique())

Unnamed: 0,0
0,0751
1,0870
2,0445
3,0012
4,0554
...,...
152,0030
153,3300
154,2333
155,0638


In [15]:
invalid_rev_cd_sum = invalid_rev_cd_df.groupby('year').sum()
invalid_rev_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2010,1
2013,118
2014,49276
2015,52068
2016,52068
2017,53182
2018,45524
2019,33621
2020,16637
2021,18963


In [16]:
rev_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrn_revenue_cd group by 1', con=connection)
rev_cd_df['sum'] = rev_cd_df['sum'].astype(int)
rev_cd_df = rev_cd_df.set_index('year')
rev_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,187692363
2000,22
2020,163317753
2002,89
2008,62
2021,168224580
2003,8
2001,42
2014,183386964
2009,68


We should have a very low percentage of claim lines that have revenue codes with invalid revenue codes. If the percent if > 1%, need to investigate further

In [17]:
100. * invalid_rev_cd_sum['count'] / rev_cd_df['sum']

year
1997         NaN
2000         NaN
2001         NaN
2002         NaN
2003         NaN
2004         NaN
2005         NaN
2006         NaN
2007         NaN
2008         NaN
2009         NaN
2010    0.487805
2011         NaN
2012         NaN
2013    0.009575
2014    0.026870
2015    0.027819
2016    0.027658
2017    0.028492
2018    0.024255
2019    0.018040
2020    0.010187
2021    0.011272
2022         NaN
dtype: float64

## Bill Type Code

Looking at how many claims have invalid bill type codes. This occurs when the claim has a 2 character bill type code in the raw data.

In [18]:
query = '''drop table if exists qa_reporting.dw_mcrn_bill_cd;
select year, bill_type_inst || bill_type_class || bill_type_freq as bill_cd, count(*)
into qa_reporting.dw_mcrn_bill_cd
from dw_staging.mcrn_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [19]:
query = '''
select a.* 
from qa_reporting.dw_mcrn_bill_cd a
left join reference_tables.ref_bill_type_cd b
on a.bill_cd = b.bill_type_cd
where b.bill_type_cd is null
and a.bill_cd is not null
;
'''
invalid_bill_cd_df = pd.read_sql(query, con=connection)
invalid_bill_cd_df.sort_values(['year', 'bill_cd'])



Unnamed: 0,year,bill_cd,count


In [20]:
pd.DataFrame(invalid_bill_cd_df['bill_cd'].unique())

Unnamed: 0,0


In [21]:
invalid_bill_cd_sum = invalid_bill_cd_df.groupby('year').sum()
invalid_bill_cd_sum

In [22]:
bill_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrn_bill_cd group by 1', con=connection)
bill_cd_df['sum'] = bill_cd_df['sum'].astype(int)
bill_cd_df = bill_cd_df.set_index('year')
bill_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,187692363
2000,22
2020,163317753
2002,89
2008,62
2021,168224580
2003,8
2001,42
2014,183386964
2009,68


We should have a very low percentage of claim lines that have bill type codes with invalid bill type codes. If the percent if > 1%, need to investigate further

In [23]:
100. * (invalid_bill_cd_sum['count'] if invalid_bill_cd_sum.shape[0] > 0  else 0) / bill_cd_df['sum']

year
2018    0.0
2000    0.0
2020    0.0
2002    0.0
2008    0.0
2021    0.0
2003    0.0
2001    0.0
2014    0.0
2009    0.0
2019    0.0
2006    0.0
2011    0.0
2012    0.0
2022    0.0
2017    0.0
2013    0.0
2005    0.0
2016    0.0
1997    0.0
2004    0.0
2007    0.0
2010    0.0
2015    0.0
Name: sum, dtype: float64

## CPT HCPCS Code

In [24]:
query = '''drop table if exists qa_reporting.dw_mcrn_cpt_proc_counts;
select year, cpt_hcpcs_cd, count(*) as proc_count
into qa_reporting.dw_mcrn_cpt_proc_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [25]:
cpt_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrn_cpt_proc_counts;', con=connection)
cpt_proc_cd_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2021,IGUF1,15
1,2014,A4269,1
2,2020,JHHC1,1
3,2017,01744,299
4,2021,L2232,55
...,...,...,...
147757,2019,RHC2D,152
147758,2015,88291,4148
147759,2017,L6895,8
147760,2015,J1205,123


Currently it may be hard to determine how many invalid CPT and HCPCS there in the data due to the list of codes we have for as a reference. There are two reference tables that can be used, **reference_tables.mrconso_cpt_hcpcs_hcpt** and **reference_tables.cpt_hcpc**

In [26]:
query = '''
select a.*
from qa_reporting.dw_mcrn_cpt_proc_counts a
left join (select distinct code from reference_tables.mrconso_cpt_hcpcs_hcpt) b
on a.cpt_hcpcs_cd = b.code
where b.code is null
;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2014,1CHPT,282
1,2016,36515,5
2,2020,KFAB1,9
3,2019,GBGF1,16
4,2020,ODJC1,8
...,...,...,...
47776,2016,PB204,2
47777,2019,3BGLS,36
47778,2016,3CGP2,45
47779,2021,AFDF1,1


In [27]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
1997          19
2000          22
2001          41
2002          89
2003           8
2004          82
2006          58
2007          53
2008          62
2009          68
2010         204
2011         471
2012         868
2013      672725
2014    42802089
2015    43691582
2016    38603580
2017    35995785
2018    32560823
2019    29375210
2020    24864186
2021    25276458
2022        1788
Name: proc_count, dtype: int64

In [28]:
proc_comp_df = pd.DataFrame({'overall_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997,19,19.0,0,inf
2000,22,22.0,0,inf
2001,42,41.0,1,4100.0
2002,89,89.0,0,inf
2003,8,8.0,0,inf
2004,86,82.0,4,2050.0
2005,1,,1,
2006,58,58.0,0,inf
2007,54,53.0,1,5300.0
2008,62,62.0,0,inf


## Discharge Status

Checking if there are invalid discharge status codes.

Ignore lines with a NULL discharge status code. Most lines have no discharge status possibly due to raw data not providing this or other reasons.

In [29]:
query = '''drop table if exists qa_reporting.dw_mcrn_discharge_counts;
select year, discharge_status, count(*) as count
into qa_reporting.dw_mcrn_discharge_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [30]:
dschrg_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrn_discharge_counts;', con=connection)
dschrg_cd_df



Unnamed: 0,year,discharge_status,count
0,2018,42,9115
1,2018,05,20215
2,2015,87,44
3,2001,01,14
4,2001,62,19
...,...,...,...
426,2010,02,55
427,2013,51,5011
428,2020,01,4680934
429,2013,,162902


In [31]:
dschrg_cd_df['discharge_status'].unique()

array(['42', '05', '87', '01', '62', '20', '41', '40', '30', '71', None,
       '82', '50', '86', '08', '91', '85', '84', '69', '70', '63', '66',
       '02', '03', '06', '89', '64', '61', '04', '93', '92', '83', '94',
       '00', '51', '95', '43', '81', '07', '88', '90', '21', '65', '72'],
      dtype=object)

In [32]:
dschrg_cd_df[dschrg_cd_df['discharge_status'].isna()]

Unnamed: 0,year,discharge_status,count
10,2001,,1
11,2005,,1
40,2007,,1
93,2012,,5
104,2010,,1
139,2019,,163297186
159,2014,,159969773
162,2015,,163424422
170,2004,,4
194,2018,,164020154


In [33]:
dschrg_cd_df = dschrg_cd_df[~dschrg_cd_df['discharge_status'].isna()]
dschrg_cd_df

Unnamed: 0,year,discharge_status,count
0,2018,42,9115
1,2018,05,20215
2,2015,87,44
3,2001,01,14
4,2001,62,19
...,...,...,...
425,2017,95,258
426,2010,02,55
427,2013,51,5011
428,2020,01,4680934


In [34]:
query = '''
select a.*
from qa_reporting.dw_mcrn_discharge_counts a
left join reference_tables.ref_discharge_status b
on a.discharge_status = b.discharge_status
where b.discharge_status is null
and a.discharge_status is not null
;
'''

invalid_dschrg_df = pd.read_sql(query, con=connection)
invalid_dschrg_df



Unnamed: 0,year,discharge_status,count
0,2013,0,56
1,2016,0,1167
2,2018,0,1288
3,2014,0,1783
4,2015,0,1027
5,2020,0,147
6,2019,0,1788
7,2017,0,1705


In [35]:
dschrg_comp_df = pd.DataFrame({'overall_count': dschrg_cd_df.groupby('year')['count'].sum(),
                            'invalid_count': invalid_dschrg_df.groupby('year')['count'].sum(),
                            'valid_count': dschrg_cd_df.groupby('year')['count'].sum() - invalid_dschrg_df.groupby('year')['count'].sum()})
dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'valid_count'] = dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'overall_count']
dschrg_comp_df['valid_count'] =  dschrg_comp_df['valid_count'].astype(int)
dschrg_comp_df['invalid_to_valid_percent'] = 100. * dschrg_comp_df['invalid_count'] / dschrg_comp_df['valid_count']
dschrg_comp_df 

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997,19,,19,
2000,22,,22,
2001,41,,41,
2002,89,,89,
2003,8,,8,
2004,82,,82,
2006,58,,58,
2007,53,,53,
2008,62,,62,
2009,68,,68,


## DRG CD

Checking DRG values in this column. There is no reference table to match the DRG values used in mcrnen so cannot validate DRG codes in data.

In [36]:
query = '''drop table if exists qa_reporting.dw_mcrn_drg_counts;
select year, drg_cd, count(*) as drg_count
into qa_reporting.dw_mcrn_drg_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [37]:
drg_df = pd.read_sql('select * from qa_reporting.dw_mcrn_drg_counts;', con=connection)
drg_df



Unnamed: 0,year,drg_cd,drg_count
0,2015,461,546
1,2020,471,4682
2,2020,083,9898
3,2016,713,5389
4,2020,192,9303
...,...,...,...
6650,2021,260,3250
6651,2017,615,1233
6652,2020,472,14061
6653,2020,870,63768


In [38]:
drg_df['drg_cd'].unique()

array(['461', '471', '083', '713', '192', '395', '607', '377', '100',
       '215', '592', '311', '006', '012', '948', '906', '934', '473',
       '463', '807', '373', '204', '835', '880', None, '472', '928',
       '233', '139', '273', '500', '904', '071', '202', '075', '882',
       '382', '572', '849', '089', '254', '263', '495', '158', '078',
       '766', '098', '800', '222', '268', '718', '125', '094', '685',
       '343', '847', '252', '177', '885', '652', '117', '813', '582',
       '032', '152', '977', '232', '872', '805', '371', '462', '717',
       '259', '241', '887', '407', '159', '747', '411', '137', '423',
       '858', '929', '786', '248', '641', '054', '434', '740', '344',
       '034', '300', '644', '566', '055', '183', '743', '741', '140',
       '422', '134', '004', '856', '981', '768', '221', '601', '093',
       '096', '454', '682', '420', '570', '814', '388', '333', '274',
       '303', '669', '596', '082', '036', '016', '289', '913', '352',
       '984', '101', 

In [39]:
drg_df[drg_df['drg_cd'].isna()]

Unnamed: 0,year,drg_cd,drg_count
25,2009,,30
181,2004,,15
237,2017,,176940187
331,2006,,12
1657,2002,,13
2429,2005,,1
2715,2008,,15
3011,2016,,178615084
3323,2015,,177507678
3629,2014,,173710381


In [40]:
comp = pd.merge(left=drg_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'drg_cd', 'drg_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('drg_count')

Unnamed: 0,year,drg_cd,drg_count,dw_row_count,dw_uth_clm_id_count
6646,2005,,1,1,1
6629,2007,,1,54,4
6597,2004,000,2,86,10
6314,2013,951,2,1232368,120639
6635,2010,881,2,205,40
...,...,...,...,...,...
2295,2017,,176940187,186657218,59674212
5837,2019,,177099799,186367654,60543423
370,2015,,177507678,187164080,58864318
3462,2018,,178178606,187692363,60197623


Many claims do not have DRG codes meaning that there will be a low percentage of drg to claim ratios. 

In [41]:
drg_sum_df = comp[~comp['drg_cd'].isna()].groupby(['year', 'dw_uth_clm_id_count'])['drg_count'].sum()
drg_sum_df = drg_sum_df.reset_index()
drg_sum_df['type_to_id'] = 1. * (drg_sum_df['drg_count'] / drg_sum_df['dw_uth_clm_id_count'])
drg_sum_df

Unnamed: 0,year,dw_uth_clm_id_count,drg_count,type_to_id
0,1997,3,19,6.333333
1,2000,2,12,6.0
2,2001,4,33,8.25
3,2002,4,76,19.0
4,2003,3,8,2.666667
5,2004,10,71,7.1
6,2006,5,46,9.2
7,2007,4,53,13.25
8,2008,7,47,6.714286
9,2009,9,38,4.222222


## Provider Type

In [42]:
query = '''drop table if exists qa_reporting.dw_mcrn_provider_type_counts;
select year, provider_type, count(*) as type_count
into qa_reporting.dw_mcrn_provider_type_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [43]:
pt_df = pd.read_sql('select * from qa_reporting.dw_mcrn_provider_type_counts;', con=connection)
pt_df



Unnamed: 0,year,provider_type,type_count
0,2019,87,5334
1,2020,50,4433706
2,2020,82,61794
3,2017,06,4025204
4,2014,A5,78
...,...,...,...
823,2018,07,1863401
824,2020,09,269366
825,2015,83,2031305
826,2020,72,321109


Most the provider specialties values in DW matches with the values Medicare uses.

In [44]:
query = '''
select a.*
from qa_reporting.dw_mcrn_provider_type_counts a
left join reference_tables.ref_provider_specialty b
on a.provider_type = b.provider_specialty_cd
where b.provider_specialty_cd is null
;
'''

invalid_pt_df = pd.read_sql(query, con=connection)
invalid_pt_df



Unnamed: 0,year,provider_type,type_count
0,2015,36,47707
1,2014,36,46269
2,2021,36,45536
3,2018,C5,431
4,2017,C5,294
...,...,...,...
823,2018,97,2288050
824,2019,10,882336
825,2014,10,922960
826,2014,97,1538420


In [45]:
comp = pd.merge(left=pt_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'provider_type', 'type_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('type_count')

Unnamed: 0,year,provider_type,type_count,dw_row_count,dw_uth_clm_id_count
615,2013,05,1,1232368,120639
616,2013,35,1,1232368,120639
498,2021,54,1,168224580,58325505
821,2005,,1,1,1
380,2014,31,2,183386964,58037856
...,...,...,...,...,...
653,2015,,89385169,187164080,58864318
538,2016,,91177315,188258241,59668370
37,2019,,91626517,186367654,60543423
300,2017,,91744010,186657218,59674212


Provider type is on the claim line level. Ideally we should have a provider type for almost all the claims.

In [46]:
pt_sum_df = comp[~comp['provider_type'].isna()].groupby(['year', 'dw_row_count'])['type_count'].sum()
pt_sum_df = pt_sum_df.reset_index()
pt_sum_df['type_to_id'] = 1. * (pt_sum_df['type_count'] / pt_sum_df['dw_row_count'])
pt_sum_df

Unnamed: 0,year,dw_row_count,type_count,type_to_id
0,2013,1232368,723,0.000587
1,2014,183386964,96203108,0.524591
2,2015,187164080,97778911,0.522423
3,2016,188258241,97080926,0.51568
4,2017,186657218,94913208,0.508489
5,2018,187692363,95155413,0.506975
6,2019,186367654,94741137,0.508356
7,2020,163317753,82153530,0.503029
8,2021,168224580,87893144,0.522475
