# Data Warehouse Medicare National QA - Claim Detail

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrn_claim_detail_counts;
create table qa_reporting.dw_mcrn_claim_detail_counts
(
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrn_claim_detail_counts
    (calendar_year, table_src, dw_row_count, date_generated)
    select year, 'ALL' table_id_src, count(*), current_date
    from dw_staging.mcrn_claim_detail
    group by 1
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_detail_counts b
    set dw_uth_clm_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_claim_id) as count 
        from dw_staging.mcrn_claim_detail
    group by 1 ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_detail_counts b
    set dw_src_clm_id_count = clm_count,
        dw_src_mbr_id_count = mbr_count
    from (
        select year, 'ALL' table_id_src,  count(distinct claim_id_src) as clm_count, count(distinct member_id_src) as mbr_count 
        from dw_staging.mcrn_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrn_claim_detail_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_member_id) as count 
        from dw_staging.mcrn_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

In [5]:
with connection.cursor() as cursor:
    query = '''    
    with clms as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.hha_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.outpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.dme_line_k
        union all
        select extract(year from clm_thru_dt::date)as year, bene_id, clm_id
        from medicare_national.inpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.bcarrier_line_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.hospice_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_national.snf_revenue_center_k
    ),
    clm_counts as (
        select year, count(*) row_count, count(distinct bene_id) pat_count, count(distinct clm_id) clm_count
        from clms
        group by 1
    )
    update qa_reporting.dw_mcrn_claim_detail_counts a
    set src_row_count = b.row_count,
        row_count_diff = dw_row_count - b.row_count,
        row_count_diff_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
        src_clm_count = clm_count,
        clm_count_diff = dw_uth_clm_id_count - b.clm_count,
        clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
        src_mbr_count = b.pat_count,
        mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
        mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from clm_counts b
    where a.calendar_year = b.year
    ;
    '''

    cursor.execute(query)

In [6]:
query = '''select * from qa_reporting.dw_mcrn_claim_detail_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values('calendar_year')



Unnamed: 0,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
17,1997,ALL,19,,,,3,3,,,,1,1,,,,2023-10-09
1,2000,ALL,22,,,,2,2,,,,2,2,,,,2023-10-09
6,2001,ALL,9,,,,2,2,,,,2,2,,,,2023-10-09
3,2002,ALL,89,,,,4,4,,,,4,4,,,,2023-10-09
5,2003,ALL,8,,,,3,3,,,,1,1,,,,2023-10-09
18,2004,ALL,86,,,,10,10,,,,10,10,,,,2023-10-09
15,2005,ALL,1,,,,1,1,,,,1,1,,,,2023-10-09
10,2006,ALL,43,,,,4,4,,,,4,4,,,,2023-10-09
20,2007,ALL,54,,,,4,4,,,,4,4,,,,2023-10-09
4,2008,ALL,62,,,,7,7,,,,7,7,,,,2023-10-09


In [7]:
df.sort_values('row_count_diff_percentage', ascending=False)[['calendar_year', 'table_src', 'row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']]

Unnamed: 0,calendar_year,table_src,row_count_diff_percentage,clm_count_percentage,mbr_count_percentage
2,2020,ALL,0.580289,0.13804,0.050522
9,2019,ALL,0.047951,0.011179,0.027421
7,2014,ALL,0.038229,0.03398,0.034374
0,2018,ALL,0.029165,0.021335,0.038777
13,2017,ALL,0.020313,0.006131,0.019838
19,2015,ALL,0.01312,0.003552,0.027537
16,2016,ALL,0.006836,0.003472,0.022952
1,2000,ALL,,,
3,2002,ALL,,,
4,2008,ALL,,,


## Place of Service

In [8]:
query = '''drop table if exists qa_reporting.dw_mcrn_pos;
select year, place_of_service, count(*)
into qa_reporting.dw_mcrn_pos
from dw_staging.mcrn_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [9]:
pos_df = pd.read_sql('select * from qa_reporting.dw_mcrn_pos;', con=connection)
pos_df



Unnamed: 0,year,place_of_service,count
0,2016,33,90171
1,2014,77,2
2,2018,11,50736605
3,2018,53,59215
4,2018,31,1492281
...,...,...,...
436,2018,61,137600
437,2019,81,13383015
438,2014,2,6129741
439,2014,00,56


In [10]:
pos_df['place_of_service'].unique()

array(['33', '77', '11', '53', '31', '01', '55', '34', '72', '07', '27',
       '09', '90', '62', '61', '12', '19', '16', '41', '3', '56', '4',
       '26', '81', '10', '51', '44', '42', '1', '71', '04', '52', '22',
       '57', '60', '00', '50', '54', '17', '15', '08', '2', '99', '13',
       '02', '06', '23', '03', '05', '29', '14', '24', '49', '32', '95',
       '8', '20', '21', '18', '65', '7', '25', '87', '30', '58', '89',
       '70', '37'], dtype=object)

In [11]:
query = '''
select a.* 
from qa_reporting.dw_mcrn_pos a
left join reference_tables.ref_place_of_service b
on lpad(a.place_of_service, 2, '0') = b.place_of_treatment_cd
where b.place_of_treatment_cd is null
;
'''
pos_df = pd.read_sql(query, con=connection)
pos_df.sort_values(['year', 'place_of_service'])



Unnamed: 0,year,place_of_service,count
3,2014,0,56
5,2015,0,22
1,2016,0,21
0,2017,0,19
4,2018,0,13
2,2019,0,16
6,2020,0,15


## Revenue Code

Looking at how many claims have invalid revenue codes.

In [12]:
query = '''drop table if exists qa_reporting.dw_mcrn_revenue_cd;
select year, revenue_cd, count(*)
into qa_reporting.dw_mcrn_revenue_cd
from dw_staging.mcrn_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [13]:
query = '''
select a.* 
from qa_reporting.dw_mcrn_revenue_cd a
left join reference_tables.ref_revenue_code b
on a.revenue_cd = b.revenue_cd
where b.revenue_cd is null
and a.revenue_cd is not null
;
'''
invalid_rev_cd_df = pd.read_sql(query, con=connection)
invalid_rev_cd_df.sort_values(['year', 'revenue_cd'])



Unnamed: 0,year,revenue_cd,count
13,2010,0815,1
90,2013,0184,1
27,2013,0451,115
210,2013,0909,1
60,2013,1551,1
...,...,...,...
167,2020,2636,1
56,2020,3224,1
184,2020,3950,1
144,2020,4120,1


In [14]:
pd.DataFrame(invalid_rev_cd_df['revenue_cd'].unique())

Unnamed: 0,0
0,0980
1,0428
2,0537
3,3259
4,0004
...,...
143,0445
144,0870
145,0012
146,0554


In [15]:
invalid_rev_cd_sum = invalid_rev_cd_df.groupby('year').sum()
invalid_rev_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2010,1
2013,118
2014,49276
2015,52068
2016,52068
2017,53182
2018,45524
2019,33621
2020,16538


In [16]:
rev_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrn_revenue_cd group by 1', con=connection)
rev_cd_df['sum'] = rev_cd_df['sum'].astype(int)
rev_cd_df = rev_cd_df.set_index('year')
rev_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,187692356
2000,22
2020,161857564
2002,89
2008,62
2003,8
2009,68
2001,9
2014,183386940
2006,43


We should have a very low percentage of claim lines that have revenue codes with invalid revenue codes. If the percent if > 1%, need to investigate further

In [17]:
100. * invalid_rev_cd_sum['count'] / rev_cd_df['sum']

year
1997         NaN
2000         NaN
2001         NaN
2002         NaN
2003         NaN
2004         NaN
2005         NaN
2006         NaN
2007         NaN
2008         NaN
2009         NaN
2010    0.487805
2011         NaN
2012         NaN
2013    0.009575
2014    0.026870
2015    0.027819
2016    0.027658
2017    0.028492
2018    0.024255
2019    0.018040
2020    0.010218
dtype: float64

## Bill Type Code

Looking at how many claims have invalid bill type codes. This occurs when the claim has a 2 character bill type code in the raw data.

In [18]:
query = '''drop table if exists qa_reporting.dw_mcrn_bill_cd;
select year, bill_type_inst || bill_type_class || bill_type_freq as bill_cd, count(*)
into qa_reporting.dw_mcrn_bill_cd
from dw_staging.mcrn_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [19]:
query = '''
select a.* 
from qa_reporting.dw_mcrn_bill_cd a
left join reference_tables.ref_bill_type_cd b
on a.bill_cd = b.bill_type_cd
where b.bill_type_cd is null
and a.bill_cd is not null
;
'''
invalid_bill_cd_df = pd.read_sql(query, con=connection)
invalid_bill_cd_df.sort_values(['year', 'bill_cd'])



Unnamed: 0,year,bill_cd,count


In [20]:
pd.DataFrame(invalid_bill_cd_df['bill_cd'].unique())

Unnamed: 0,0


In [21]:
invalid_bill_cd_sum = invalid_bill_cd_df.groupby('year').sum()
invalid_bill_cd_sum

In [22]:
bill_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrn_bill_cd group by 1', con=connection)
bill_cd_df['sum'] = bill_cd_df['sum'].astype(int)
bill_cd_df = bill_cd_df.set_index('year')
bill_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,187692356
2000,22
2020,161857564
2002,89
2003,8
2008,62
2001,9
2014,183386940
2009,68
2019,186366888


We should have a very low percentage of claim lines that have bill type codes with invalid bill type codes. If the percent if > 1%, need to investigate further

In [23]:
100. * (invalid_bill_cd_sum['count'] if invalid_bill_cd_sum.shape[0] > 0  else 0) / bill_cd_df['sum']

year
2018    0.0
2000    0.0
2020    0.0
2002    0.0
2003    0.0
2008    0.0
2001    0.0
2014    0.0
2009    0.0
2019    0.0
2006    0.0
2011    0.0
2012    0.0
2013    0.0
2004    0.0
2017    0.0
2005    0.0
2016    0.0
1997    0.0
2007    0.0
2010    0.0
2015    0.0
Name: sum, dtype: float64

## CPT HCPCS Code

In [24]:
query = '''drop table if exists qa_reporting.dw_mcrn_cpt_proc_counts;
select year, cpt_hcpcs_cd, count(*) as proc_count
into qa_reporting.dw_mcrn_cpt_proc_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [25]:
cpt_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrn_cpt_proc_counts;', con=connection)
cpt_proc_cd_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2019,A0402,12
1,2017,61313,81
2,2018,97810,6394
3,2018,69641,65
4,2018,91038,462
...,...,...,...
126387,2015,35721,43
126388,2015,LE211,9
126389,2014,36568,12
126390,2015,RHA5D,5


Currently it may be hard to determine how many invalid CPT and HCPCS there in the data due to the list of codes we have for as a reference. There are two reference tables that can be used, **reference_tables.mrconso_cpt_hcpcs_hcpt** and **reference_tables.cpt_hcpc**

In [26]:
query = '''
select a.*
from qa_reporting.dw_mcrn_cpt_proc_counts a
left join (select distinct code from reference_tables.mrconso_cpt_hcpcs_hcpt) b
on a.cpt_hcpcs_cd = b.code
where b.code is null
;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2019,JHEF0,1
1,2018,3BGM1,741
2,2017,69035,1
3,2011,,403
4,2020,,21649585
...,...,...,...
39443,2018,G8627,5
39444,2016,RHL40,6
39445,2019,M1001,63
39446,2020,GGFC0,2


In [27]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
1997          19
2000          22
2001           8
2002          89
2003           8
2004          82
2006          43
2007          53
2008          62
2009          68
2010         204
2011         471
2012         864
2013      672725
2014    42802065
2015    43691582
2016    38603580
2017    35995775
2018    32560816
2019    29374699
2020    24533965
Name: proc_count, dtype: int64

In [28]:
proc_comp_df = pd.DataFrame({'overall_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997,19,19.0,0,inf
2000,22,22.0,0,inf
2001,9,8.0,1,800.0
2002,89,89.0,0,inf
2003,8,8.0,0,inf
2004,86,82.0,4,2050.0
2005,1,,1,
2006,43,43.0,0,inf
2007,54,53.0,1,5300.0
2008,62,62.0,0,inf


## Discharge Status

Checking if there are invalid discharge status codes.

Ignore lines with a NULL discharge status code. Most lines have no discharge status possibly due to raw data not providing this or other reasons.

In [29]:
query = '''drop table if exists qa_reporting.dw_mcrn_discharge_counts;
select year, discharge_status, count(*) as count
into qa_reporting.dw_mcrn_discharge_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [30]:
dschrg_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrn_discharge_counts;', con=connection)
dschrg_cd_df



Unnamed: 0,year,discharge_status,count
0,2014,06,1970658
1,2019,43,4661
2,2015,04,131725
3,2017,41,444866
4,2017,65,35139
...,...,...,...
384,2015,88,136
385,2014,70,27790
386,2017,92,404
387,2019,84,200


In [31]:
dschrg_cd_df['discharge_status'].unique()

array(['06', '43', '04', '41', '65', '51', '83', '01', '71', '86', '61',
       '84', '95', '30', '85', '70', None, '50', '87', '90', '02', '64',
       '63', '03', '81', '82', '62', '20', '91', '21', '40', '05', '88',
       '08', '42', '72', '69', '94', '92', '07', '66', '00', '89', '93'],
      dtype=object)

In [32]:
dschrg_cd_df[dschrg_cd_df['discharge_status'].isna()]

Unnamed: 0,year,discharge_status,count
21,2011,,1
43,2012,,5
54,2014,,159969773
77,2007,,1
92,2004,,4
108,2020,,142226629
124,2013,,162902
135,2001,,1
141,2019,,163297141
148,2018,,164020154


In [33]:
dschrg_cd_df = dschrg_cd_df[~dschrg_cd_df['discharge_status'].isna()]
dschrg_cd_df

Unnamed: 0,year,discharge_status,count
0,2014,06,1970658
1,2019,43,4661
2,2015,04,131725
3,2017,41,444866
4,2017,65,35139
...,...,...,...
384,2015,88,136
385,2014,70,27790
386,2017,92,404
387,2019,84,200


In [34]:
query = '''
select a.*
from qa_reporting.dw_mcrn_discharge_counts a
left join reference_tables.ref_discharge_status b
on a.discharge_status = b.discharge_status
where b.discharge_status is null
and a.discharge_status is not null
;
'''

invalid_dschrg_df = pd.read_sql(query, con=connection)
invalid_dschrg_df



Unnamed: 0,year,discharge_status,count
0,2015,0,1027
1,2014,0,1783
2,2013,0,56
3,2020,0,147
4,2018,0,1288
5,2016,0,1167
6,2019,0,1788
7,2017,0,1705


In [35]:
dschrg_comp_df = pd.DataFrame({'overall_count': dschrg_cd_df.groupby('year')['count'].sum(),
                            'invalid_count': invalid_dschrg_df.groupby('year')['count'].sum(),
                            'valid_count': dschrg_cd_df.groupby('year')['count'].sum() - invalid_dschrg_df.groupby('year')['count'].sum()})
dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'valid_count'] = dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'overall_count']
dschrg_comp_df['valid_count'] =  dschrg_comp_df['valid_count'].astype(int)
dschrg_comp_df['invalid_to_valid_percent'] = 100. * dschrg_comp_df['invalid_count'] / dschrg_comp_df['valid_count']
dschrg_comp_df 

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997,19,,19,
2000,22,,22,
2001,8,,8,
2002,89,,89,
2003,8,,8,
2004,82,,82,
2006,43,,43,
2007,53,,53,
2008,62,,62,
2009,68,,68,


## DRG CD

Checking DRG values in this column. There is no reference table to match the DRG values used in mcrnen so cannot validate DRG codes in data.

In [36]:
query = '''drop table if exists qa_reporting.dw_mcrn_drg_counts;
select year, drg_cd, count(*) as drg_count
into qa_reporting.dw_mcrn_drg_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [37]:
drg_df = pd.read_sql('select * from qa_reporting.dw_mcrn_drg_counts;', con=connection)
drg_df



Unnamed: 0,year,drg_cd,drg_count
0,2020,232,967
1,2015,565,3936
2,2017,079,796
3,2013,041,174
4,2014,038,10385
...,...,...,...
5888,2017,200,8650
5889,2015,352,2939
5890,2020,707,2018
5891,2020,290,19


In [38]:
drg_df['drg_cd'].unique()

array(['232', '565', '079', '041', '038', '758', '516', '152', '652',
       '204', '770', '260', '192', '385', '653', '951', '862', '069',
       '597', '224', '370', '313', '356', '514', '196', '013', '908',
       '577', '441', '467', '424', '221', '760', '236', '550', '616',
       '735', '032', '964', '756', '867', '339', '621', '582', '355',
       '722', '757', '442', '955', '382', '223', '843', '283', '927',
       '216', '062', '271', '584', '025', '076', '928', '445', '545',
       '220', '135', '948', '148', '380', '291', '580', '215', '896',
       '244', '723', '269', '052', '713', '881', '626', '394', '566',
       '314', '814', '205', '248', '839', '583', '064', '461', '319',
       '458', '395', '499', '554', '146', '012', '113', None, '688',
       '897', '654', '300', '054', '393', '344', '016', '000', '913',
       '379', '001', '629', '100', '437', '572', '315', '186', '151',
       '459', '774', '876', '286', '303', '637', '761', '150', '742',
       '136', '199', 

In [39]:
drg_df[drg_df['drg_cd'].isna()]

Unnamed: 0,year,drg_cd,drg_count
101,2007,,1
930,2008,,15
956,2002,,13
975,2000,,10
1319,2005,,1
1524,2011,,264
1585,2013,,1054902
1629,2010,,111
1872,2006,,12
2126,2001,,9


In [40]:
comp = pd.merge(left=drg_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'drg_cd', 'drg_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('drg_count')

Unnamed: 0,year,drg_cd,drg_count,dw_row_count,dw_uth_clm_id_count
5884,2005,,1,1,1
5849,2007,,1,54,4
5841,2010,881,2,205,40
2368,2013,951,2,1232368,120639
5867,2004,000,2,86,10
...,...,...,...,...,...
2137,2017,,176940187,186657208,59674210
3845,2019,,177099244,186366888,60543286
1153,2015,,177507678,187164080,58864318
5443,2018,,178178606,187692356,60197621


Many claims do not have DRG codes meaning that there will be a low percentage of drg to claim ratios. 

In [41]:
drg_sum_df = comp[~comp['drg_cd'].isna()].groupby(['year', 'dw_uth_clm_id_count'])['drg_count'].sum()
drg_sum_df = drg_sum_df.reset_index()
drg_sum_df['type_to_id'] = 1. * (drg_sum_df['drg_count'] / drg_sum_df['dw_uth_clm_id_count'])
drg_sum_df

Unnamed: 0,year,dw_uth_clm_id_count,drg_count,type_to_id
0,1997,3,19,6.333333
1,2000,2,12,6.0
2,2002,4,76,19.0
3,2003,3,8,2.666667
4,2004,10,71,7.1
5,2006,4,31,7.75
6,2007,4,53,13.25
7,2008,7,47,6.714286
8,2009,9,38,4.222222
9,2010,40,94,2.35


## Provider Type

In [42]:
query = '''drop table if exists qa_reporting.dw_mcrn_provider_type_counts;
select year, provider_type, count(*) as type_count
into qa_reporting.dw_mcrn_provider_type_counts
from dw_staging.mcrn_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [43]:
pt_df = pd.read_sql('select * from qa_reporting.dw_mcrn_provider_type_counts;', con=connection)
pt_df



Unnamed: 0,year,provider_type,type_count
0,2016,66,701832
1,2020,01,288026
2,2017,C9,455
3,2018,23,85936
4,2015,23,55715
...,...,...,...
716,2017,49,968281
717,2018,C7,46559
718,2007,,54
719,2016,16,508778


Most the provider specialties values in DW matches with the values Medicare uses.

In [44]:
query = '''
select a.*
from qa_reporting.dw_mcrn_provider_type_counts a
left join reference_tables.ref_provider_specialty b
on a.provider_type = b.provider_specialty_cd
where b.provider_specialty_cd is null
;
'''

invalid_pt_df = pd.read_sql(query, con=connection)
invalid_pt_df



Unnamed: 0,year,provider_type,type_count
0,2019,24,113752
1,2014,24,116637
2,2017,24,115336
3,2015,24,116692
4,2016,24,117739
...,...,...,...
716,2017,99,141193
717,2016,99,150401
718,2018,99,132685
719,2020,99,79181


In [45]:
comp = pd.merge(left=pt_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'provider_type', 'type_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('type_count')

Unnamed: 0,year,provider_type,type_count,dw_row_count,dw_uth_clm_id_count
700,2013,35,1,1232368,120639
703,2013,05,1,1232368,120639
716,2005,,1,1,1
687,2013,50,2,1232368,120639
441,2015,58,2,187164080,58864318
...,...,...,...,...,...
452,2015,,89385169,187164080,58864318
24,2016,,91177315,188258241,59668370
562,2019,,91625751,186366888,60543286
237,2017,,91744000,186657208,59674210


Provider type is on the claim line level. Ideally we should have a provider type for almost all the claims.

In [46]:
pt_sum_df = comp[~comp['provider_type'].isna()].groupby(['year', 'dw_row_count'])['type_count'].sum()
pt_sum_df = pt_sum_df.reset_index()
pt_sum_df['type_to_id'] = 1. * (pt_sum_df['type_count'] / pt_sum_df['dw_row_count'])
pt_sum_df

Unnamed: 0,year,dw_row_count,type_count,type_to_id
0,2013,1232368,723,0.000587
1,2014,183386940,96203108,0.524591
2,2015,187164080,97778911,0.522423
3,2016,188258241,97080926,0.51568
4,2017,186657208,94913208,0.508489
5,2018,187692356,95155413,0.506975
6,2019,186366888,94741137,0.508358
7,2020,161857564,81666436,0.504557
