# Data Warehouse Medicare Texas QA - Claim Detail

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [2]:
import pandas as pd
import sys
import psycopg2
from tqdm import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [3]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

## Row Count and Claim Count

In [3]:
query = ''' drop table if exists qa_reporting.dw_mcrt_claim_detail_counts;
create table qa_reporting.dw_mcrt_claim_detail_counts
(
    calendar_year int,
    table_src text,
    dw_row_count bigint,
    src_row_count bigint,
    row_count_diff bigint,
    row_count_diff_percentage float,
    dw_uth_clm_id_count bigint,
    dw_src_clm_id_count bigint,
    src_clm_count bigint,
    clm_count_diff bigint,
    clm_count_percentage float,
    dw_uth_mbr_id_count bigint,
    dw_src_mbr_id_count bigint,
    src_mbr_count bigint,
    mbr_count_diff bigint,
    mbr_count_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [4]:
with connection.cursor() as cursor:
    query = f'''
    insert into qa_reporting.dw_mcrt_claim_detail_counts
    (calendar_year, table_src, dw_row_count, date_generated)
    select year, 'ALL' table_id_src, count(*), current_date
    from dw_staging.mcrt_claim_detail
    group by 1
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_detail_counts b
    set dw_uth_clm_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_claim_id) as count 
        from dw_staging.mcrt_claim_detail
    group by 1 ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_detail_counts b
    set dw_src_clm_id_count = clm_count,
        dw_src_mbr_id_count = mbr_count
    from (
        select year, 'ALL' table_id_src,  count(distinct claim_id_src) as clm_count, count(distinct member_id_src) as mbr_count 
        from dw_staging.mcrt_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

    query = f'''
    update qa_reporting.dw_mcrt_claim_detail_counts b
    set dw_uth_mbr_id_count = count
    from (
        select year, 'ALL' table_id_src,  count(distinct uth_member_id) as count 
        from dw_staging.mcrt_claim_detail
        group by 1
    ) a
    where a.year = b.calendar_year
    and a.table_id_src = b.table_src
    '''

    cursor.execute(query)

In [None]:
with connection.cursor() as cursor:
    query = '''    
    with clms as (
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.hha_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.outpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.dme_line_k
        union all
        select extract(year from clm_thru_dt::date)as year, bene_id, clm_id
        from medicare_texas.inpatient_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.bcarrier_line_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.hospice_revenue_center_k
        union all
        select extract(year from clm_thru_dt::date) as year, bene_id, clm_id
        from medicare_texas.snf_revenue_center_k
    ),
    clm_counts as (
        select year, count(*) row_count, count(distinct bene_id) pat_count, count(distinct clm_id) clm_count
        from clms
        group by 1
    )
    update qa_reporting.dw_mcrt_claim_detail_counts a
    set src_row_count = b.row_count,
        row_count_diff = dw_row_count - b.row_count,
        row_count_diff_percentage = 100. * abs( dw_row_count - b.row_count) / b.row_count,
        src_clm_count = clm_count,
        clm_count_diff = dw_uth_clm_id_count - b.clm_count,
        clm_count_percentage = 100. * abs(dw_uth_clm_id_count - b.clm_count) / b.clm_count,
        src_mbr_count = b.pat_count,
        mbr_count_diff = dw_uth_mbr_id_count - b.pat_count,
        mbr_count_percentage = 100. * abs(dw_uth_mbr_id_count - b.pat_count) / b.pat_count
    from clm_counts b
    where a.calendar_year = b.year
    ;
    '''

    cursor.execute(query)

In [6]:
query = '''select * from qa_reporting.dw_mcrt_claim_detail_counts;'''

df = pd.read_sql(query, con=connection)
df.sort_values('calendar_year')



Unnamed: 0,calendar_year,table_src,dw_row_count,src_row_count,row_count_diff,row_count_diff_percentage,dw_uth_clm_id_count,dw_src_clm_id_count,src_clm_count,clm_count_diff,clm_count_percentage,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,date_generated
6,2001,ALL,104,,,,7,7,,,,7,7,,,,2023-12-18
2,2002,ALL,89,,,,8,8,,,,8,8,,,,2023-12-18
5,2003,ALL,3,,,,1,1,,,,1,1,,,,2023-12-18
18,2004,ALL,15,,,,3,3,,,,3,3,,,,2023-12-18
16,2005,ALL,60,,,,6,6,,,,5,5,,,,2023-12-18
10,2006,ALL,24,,,,3,3,,,,3,3,,,,2023-12-18
20,2007,ALL,70,,,,5,5,,,,5,5,,,,2023-12-18
3,2008,ALL,5,,,,2,2,,,,2,2,,,,2023-12-18
8,2009,ALL,60,,,,7,7,,,,7,7,,,,2023-12-18
21,2010,ALL,260,,,,23,23,,,,23,23,,,,2023-12-18


In [7]:
df.sort_values('row_count_diff_percentage', ascending=False)[['calendar_year', 'table_src', 'row_count_diff_percentage', 'clm_count_percentage', 'mbr_count_percentage']]

Unnamed: 0,calendar_year,table_src,row_count_diff_percentage,clm_count_percentage,mbr_count_percentage
4,2021,ALL,0.500685,0.177649,0.024655
1,2020,ALL,0.418876,0.0041,0.032768
14,2017,ALL,0.104583,0.024215,0.071291
9,2019,ALL,0.092197,0.01563,0.136967
0,2018,ALL,0.085096,0.036802,0.042233
17,2016,ALL,0.068969,0.014497,0.030696
19,2015,ALL,0.05837,0.013773,0.031413
7,2014,ALL,0.05145,0.040988,0.032249
2,2002,ALL,,,
3,2008,ALL,,,


## Place of Service

In [8]:
query = '''drop table if exists qa_reporting.dw_mcrt_pos;
select year, place_of_service, count(*)
into qa_reporting.dw_mcrt_pos
from dw_staging.mcrt_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [9]:
pos_df = pd.read_sql('select * from qa_reporting.dw_mcrt_pos;', con=connection)
pos_df



Unnamed: 0,year,place_of_service,count
0,2015,4,43
1,2019,55,1230
2,2020,15,30115
3,2002,1,87
4,2014,06,55
...,...,...,...
487,2015,60,1134362
488,2013,13,29
489,2020,19,1223869
490,2015,51,255455


In [10]:
pos_df['place_of_service'].unique()

array(['4', '55', '15', '1', '06', '20', '08', '81', '51', '2', '04',
       '99', '31', '53', '49', '19', '25', '22', '01', '26', '17', '3',
       '58', '60', '05', '72', '11', '07', '7', '34', '14', '18', '02',
       '09', '61', '57', '21', '87', '71', '03', '24', '56', '62', '65',
       '41', '00', '91', '13', '12', '42', '52', '23', '54', '8', '33',
       '16', '32', '50', '95', '40', '10'], dtype=object)

In [11]:
query = '''
select a.* 
from qa_reporting.dw_mcrt_pos a
left join reference_tables.ref_place_of_service b
on lpad(a.place_of_service, 2, '0') = b.place_of_treatment_cd
where b.place_of_treatment_cd is null
;
'''
pos_df = pd.read_sql(query, con=connection)
pos_df.sort_values(['year', 'place_of_service'])



Unnamed: 0,year,place_of_service,count
5,2014,0,108
3,2015,0,31
6,2016,0,51
2,2017,0,22
7,2018,0,27
0,2019,0,13
4,2020,0,24
1,2021,0,28


## Revenue Code

Looking at how many claims have invalid revenue codes.

In [12]:
query = '''drop table if exists qa_reporting.dw_mcrt_revenue_cd;
select year, revenue_cd, count(*)
into qa_reporting.dw_mcrt_revenue_cd
from dw_staging.mcrt_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [13]:
query = '''
select a.* 
from qa_reporting.dw_mcrt_revenue_cd a
left join reference_tables.ref_revenue_code b
on a.revenue_cd = b.revenue_cd
where b.revenue_cd is null
and a.revenue_cd is not null
;
'''
invalid_rev_cd_df = pd.read_sql(query, con=connection)
invalid_rev_cd_df.sort_values(['year', 'revenue_cd'])



Unnamed: 0,year,revenue_cd,count
8,2013,0004,1
50,2013,0451,5
14,2014,0004,1
22,2014,0006,1
213,2014,0015,1
...,...,...,...
110,2021,0875,1
102,2021,0891,45
34,2021,0948,14268
255,2021,8200,1


In [14]:
pd.DataFrame(invalid_rev_cd_df['revenue_cd'].unique())

Unnamed: 0,0
0,0591
1,0007
2,0354
3,1571
4,0626
...,...
159,9306
160,3251
161,0181
162,3199


In [15]:
invalid_rev_cd_sum = invalid_rev_cd_df.groupby('year').sum()
invalid_rev_cd_sum

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2013,6
2014,19570
2015,21409
2016,26930
2017,26965
2018,28394
2019,28298
2020,12536
2021,14910


In [16]:
rev_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrt_revenue_cd group by 1', con=connection)
rev_cd_df['sum'] = rev_cd_df['sum'].astype(int)
rev_cd_df = rev_cd_df.set_index('year')
rev_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,266343788
2020,241930203
2002,89
2008,5
2021,241361762
2003,3
2001,104
2014,273755912
2009,60
2019,263546074


We should have a very low percentage of claim lines that have revenue codes with invalid revenue codes. If the percent if > 1%, need to investigate further

In [17]:
100. * invalid_rev_cd_sum['count'] / rev_cd_df['sum']

year
2001         NaN
2002         NaN
2003         NaN
2004         NaN
2005         NaN
2006         NaN
2007         NaN
2008         NaN
2009         NaN
2010         NaN
2011         NaN
2012         NaN
2013    0.000172
2014    0.007149
2015    0.007750
2016    0.009739
2017    0.009801
2018    0.010661
2019    0.010737
2020    0.005182
2021    0.006177
2022         NaN
dtype: float64

## Bill Type Code

Looking at how many claims have invalid bill type codes. This occurs when the claim has a 2 character bill type code in the raw data.

In [18]:
query = '''drop table if exists qa_reporting.dw_mcrt_bill_cd;
select year, bill_type_inst || bill_type_class || bill_type_freq as bill_cd, count(*)
into qa_reporting.dw_mcrt_bill_cd
from dw_staging.mcrt_claim_detail
group by 1,2
;'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [19]:
query = '''
select a.* 
from qa_reporting.dw_mcrt_bill_cd a
left join reference_tables.ref_bill_type_cd b
on a.bill_cd = b.bill_type_cd
where b.bill_type_cd is null
and a.bill_cd is not null
;
'''
invalid_bill_cd_df = pd.read_sql(query, con=connection)
invalid_bill_cd_df.sort_values(['year', 'bill_cd'])



Unnamed: 0,year,bill_cd,count


In [20]:
pd.DataFrame(invalid_bill_cd_df['bill_cd'].unique())

Unnamed: 0,0


In [21]:
invalid_bill_cd_sum = invalid_bill_cd_df.groupby('year').sum()
invalid_bill_cd_sum

In [22]:
bill_cd_df = pd.read_sql('select year, sum(count) from qa_reporting.dw_mcrt_bill_cd group by 1', con=connection)
bill_cd_df['sum'] = bill_cd_df['sum'].astype(int)
bill_cd_df = bill_cd_df.set_index('year')
bill_cd_df



Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2018,266343788
2020,241930203
2002,89
2008,5
2021,241361762
2003,3
2001,104
2014,273755912
2009,60
2019,263546074


We should have a very low percentage of claim lines that have bill type codes with invalid bill type codes. If the percent if > 1%, need to investigate further

In [23]:
100. * (invalid_bill_cd_sum['count'] if invalid_bill_cd_sum.shape[0] > 0  else 0) / bill_cd_df['sum']

year
2018    0.0
2020    0.0
2002    0.0
2008    0.0
2021    0.0
2003    0.0
2001    0.0
2014    0.0
2009    0.0
2019    0.0
2006    0.0
2011    0.0
2012    0.0
2022    0.0
2013    0.0
2005    0.0
2016    0.0
2004    0.0
2015    0.0
2007    0.0
2010    0.0
2017    0.0
Name: sum, dtype: float64

## CPT HCPCS Code

In [24]:
query = '''drop table if exists qa_reporting.dw_mcrt_cpt_proc_counts;
select year, cpt_hcpcs_cd, count(*) as proc_count
into qa_reporting.dw_mcrt_cpt_proc_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [25]:
cpt_proc_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrt_cpt_proc_counts;', con=connection)
cpt_proc_cd_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2021,A0604,1079
1,2018,33366,23
2,2021,G9675,1
3,2014,1AHM3,7
4,2015,65272,5
...,...,...,...
149008,2017,01522,25
149009,2019,RUC32,4
149010,2017,0191T,5875
149011,2014,29891,110


Currently it may be hard to determine how many invalid CPT and HCPCS there in the data due to the list of codes we have for as a reference. There are two reference tables that can be used, **reference_tables.mrconso_cpt_hcpcs_hcpt** and **reference_tables.cpt_hcpc**

In [26]:
query = '''
select a.*
from qa_reporting.dw_mcrt_cpt_proc_counts a
left join (select distinct code from reference_tables.mrconso_cpt_hcpcs_hcpt) b
on a.cpt_hcpcs_cd = b.code
where b.code is null
;
'''

invalid_proc_df = pd.read_sql(query, con=connection)
invalid_proc_df



Unnamed: 0,year,cpt_hcpcs_cd,proc_count
0,2020,GHNE1,11
1,2016,G9666,84
2,2015,27704,32
3,2017,3AHPS,303
4,2017,HC211,11
...,...,...,...
51148,2016,0302T,1
51149,2014,G9200,2379
51150,2019,ODKA0,2
51151,2015,ES204,98


In [27]:
invalid_proc_df.groupby('year')['proc_count'].sum()

year
2001         100
2002          87
2003           3
2004          13
2005          58
2006          23
2007          70
2008           4
2009          60
2010         255
2011         348
2012         826
2013     2090005
2014    66646607
2015    67119244
2016    55480691
2017    50185692
2018    43598629
2019    38607509
2020    34929782
2021    34262933
2022        2512
Name: proc_count, dtype: int64

In [28]:
proc_comp_df = pd.DataFrame({'overall_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum(),
                            'invalid_proc_count': invalid_proc_df.groupby('year')['proc_count'].sum(),
                            'valid_proc_count': cpt_proc_cd_df.groupby('year')['proc_count'].sum() - invalid_proc_df.groupby('year')['proc_count'].sum()})
proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'valid_proc_count'] = proc_comp_df.loc[proc_comp_df['valid_proc_count'].isna(),'overall_proc_count']
proc_comp_df['valid_proc_count'] =  proc_comp_df['valid_proc_count'].astype(int)
proc_comp_df['invalid_to_valid_percent'] = 100. * proc_comp_df['invalid_proc_count'] / proc_comp_df['valid_proc_count']
proc_comp_df  

Unnamed: 0_level_0,overall_proc_count,invalid_proc_count,valid_proc_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,104,100,4,2500.0
2002,89,87,2,4350.0
2003,3,3,0,inf
2004,15,13,2,650.0
2005,60,58,2,2900.0
2006,24,23,1,2300.0
2007,70,70,0,inf
2008,5,4,1,400.0
2009,60,60,0,inf
2010,260,255,5,5100.0


## Discharge Status

Checking if there are invalid discharge status codes.

Ignore lines with a NULL discharge status code. Most lines have no discharge status possibly due to raw data not providing this or other reasons.

In [29]:
query = '''drop table if exists qa_reporting.dw_mcrt_discharge_counts;
select year, discharge_status, count(*) as count
into qa_reporting.dw_mcrt_discharge_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [30]:
dschrg_cd_df = pd.read_sql('select * from qa_reporting.dw_mcrt_discharge_counts;', con=connection)
dschrg_cd_df



Unnamed: 0,year,discharge_status,count
0,2015,30,23929220
1,2016,43,7721
2,2019,62,1030200
3,2020,65,28704
4,2014,87,63
...,...,...,...
427,2015,83,12419
428,2020,63,314540
429,2019,50,449528
430,2006,30,3


In [31]:
dschrg_cd_df['discharge_status'].unique()

array(['30', '43', '62', '65', '87', '63', '01', '69', '91', '06', '61',
       '50', '84', '64', '02', None, '05', '93', '85', '20', '89', '86',
       '72', '83', '94', '90', '95', '51', '07', '92', '41', '81', '21',
       '71', '66', '03', '82', '88', '40', '08', '70', '00', '04', '42'],
      dtype=object)

In [32]:
dschrg_cd_df[dschrg_cd_df['discharge_status'].isna()]

Unnamed: 0,year,discharge_status,count
22,2011,,1
44,2001,,1
59,2014,,227976322
78,2019,,222552120
95,2021,,206206101
101,2002,,2
108,2006,,1
125,2005,,2
129,2017,,230213536
146,2015,,230072319


In [33]:
dschrg_cd_df = dschrg_cd_df[~dschrg_cd_df['discharge_status'].isna()]
dschrg_cd_df

Unnamed: 0,year,discharge_status,count
0,2015,30,23929220
1,2016,43,7721
2,2019,62,1030200
3,2020,65,28704
4,2014,87,63
...,...,...,...
427,2015,83,12419
428,2020,63,314540
429,2019,50,449528
430,2006,30,3


In [34]:
query = '''
select a.*
from qa_reporting.dw_mcrt_discharge_counts a
left join reference_tables.ref_discharge_status b
on a.discharge_status = b.discharge_status
where b.discharge_status is null
and a.discharge_status is not null
;
'''

invalid_dschrg_df = pd.read_sql(query, con=connection)
invalid_dschrg_df



Unnamed: 0,year,discharge_status,count
0,2019,0,2669
1,2015,0,3199
2,2017,0,4428
3,2014,0,3046
4,2020,0,206
5,2013,0,437
6,2018,0,2392
7,2016,0,3206


In [35]:
dschrg_comp_df = pd.DataFrame({'overall_count': dschrg_cd_df.groupby('year')['count'].sum(),
                            'invalid_count': invalid_dschrg_df.groupby('year')['count'].sum(),
                            'valid_count': dschrg_cd_df.groupby('year')['count'].sum() - invalid_dschrg_df.groupby('year')['count'].sum()})
dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'valid_count'] = dschrg_comp_df.loc[dschrg_comp_df['valid_count'].isna(),'overall_count']
dschrg_comp_df['valid_count'] =  dschrg_comp_df['valid_count'].astype(int)
dschrg_comp_df['invalid_to_valid_percent'] = 100. * dschrg_comp_df['invalid_count'] / dschrg_comp_df['valid_count']
dschrg_comp_df 

Unnamed: 0_level_0,overall_count,invalid_count,valid_count,invalid_to_valid_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,103,,103,
2002,87,,87,
2003,3,,3,
2004,13,,13,
2005,58,,58,
2006,23,,23,
2007,70,,70,
2008,4,,4,
2009,60,,60,
2010,256,,256,


## DRG CD

Checking DRG values in this column. There is no reference table to match the DRG values used in mcrten so cannot validate DRG codes in data.

In [36]:
query = '''drop table if exists qa_reporting.dw_mcrt_drg_counts;
select year, drg_cd, count(*) as drg_count
into qa_reporting.dw_mcrt_drg_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [37]:
drg_df = pd.read_sql('select * from qa_reporting.dw_mcrt_drg_counts;', con=connection)
drg_df



Unnamed: 0,year,drg_cd,drg_count
0,2020,135,404
1,2012,871,25
2,2013,289,84
3,2013,473,110
4,2021,737,2748
...,...,...,...
6704,2014,765,5693
6705,2018,823,6338
6706,2020,565,7906
6707,2021,243,27295


In [38]:
drg_df['drg_cd'].unique()

array(['135', '871', '289', '473', '737', '566', '666', '853', '724',
       '240', '842', '139', '461', '698', '823', '116', '320', '086',
       '083', '845', '869', '515', '464', '453', '492', '003', '548',
       '746', '712', '163', '455', '434', '513', '458', '301', '242',
       '570', '624', '645', '541', '168', '264', '656', '535', '042',
       '020', '287', '384', '114', '658', '919', '374', '941', '460',
       '190', '471', '099', '696', '138', '064', '314', '864', '655',
       '975', '915', '958', '715', '654', '618', '736', '595', '723',
       '638', '906', '511', '822', '675', '200', '077', '799', '092',
       '734', '445', '373', '695', '296', '387', '914', '563', '306',
       '560', '780', '683', '593', '183', '286', '053', None, '928',
       '152', '382', '343', '957', '095', '854', '007', '663', '637',
       '201', '037', '008', '627', '841', '987', '271', '215', '716',
       '237', '862', '948', '356', '498', '390', '010', '005', '248',
       '071', '884', 

In [39]:
drg_df[drg_df['drg_cd'].isna()]

Unnamed: 0,year,drg_cd,drg_count
106,2011,,170
157,2021,,230102460
186,2018,,252292549
285,2008,,5
558,2003,,3
909,2005,,11
1116,2004,,2
1218,2012,,477
1247,2013,,3188946
1393,2015,,261678020


In [40]:
comp = pd.merge(left=drg_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'drg_cd', 'drg_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('drg_count')

Unnamed: 0,year,drg_cd,drg_count,dw_row_count,dw_uth_clm_id_count
6695,2001,,1,104,7
6666,2010,958,2,260,23
6672,2011,885,2,349,42
6665,2010,083,2,260,23
1357,2013,887,2,3479005,253129
...,...,...,...,...,...
2900,2018,,252292549,266343788,78096491
2612,2014,,259133232,273755912,78590801
5138,2017,,260365766,275137582,80780365
3796,2015,,261678020,276242995,79304975


Many claims do not have DRG codes meaning that there will be a low percentage of drg to claim ratios. 

In [41]:
drg_sum_df = comp[~comp['drg_cd'].isna()].groupby(['year', 'dw_uth_clm_id_count'])['drg_count'].sum()
drg_sum_df = drg_sum_df.reset_index()
drg_sum_df['type_to_id'] = 1. * (drg_sum_df['drg_count'] / drg_sum_df['dw_uth_clm_id_count'])
drg_sum_df

Unnamed: 0,year,dw_uth_clm_id_count,drg_count,type_to_id
0,2001,7,103,14.714286
1,2002,8,79,9.875
2,2004,3,13,4.333333
3,2005,6,49,8.166667
4,2006,3,20,6.666667
5,2007,5,60,12.0
6,2009,7,45,6.428571
7,2010,23,158,6.869565
8,2011,42,179,4.261905
9,2012,126,359,2.849206


## Provider Type

In [42]:
query = '''drop table if exists qa_reporting.dw_mcrt_provider_type_counts;
select year, provider_type, count(*) as type_count
into qa_reporting.dw_mcrt_provider_type_counts
from dw_staging.mcrt_claim_detail
group by 1,2;
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [43]:
pt_df = pd.read_sql('select * from qa_reporting.dw_mcrt_provider_type_counts;', con=connection)
pt_df



Unnamed: 0,year,provider_type,type_count
0,2014,84,33544
1,2020,C7,45419
2,2019,67,133448
3,2016,48,2015595
4,2020,49,764989
...,...,...,...
811,2015,35,2277634
812,2019,35,1972702
813,2021,70,5167
814,2017,C8,4


Most the provider specialties values in DW matches with the values Medicare uses.

In [44]:
query = '''
select a.*
from qa_reporting.dw_mcrt_provider_type_counts a
left join reference_tables.ref_provider_specialty b
on a.provider_type = b.provider_specialty_cd
where b.provider_specialty_cd is null
;
'''

invalid_pt_df = pd.read_sql(query, con=connection)
invalid_pt_df



Unnamed: 0,year,provider_type,type_count
0,2020,69,22152493
1,2021,69,22726917
2,2015,69,26494025
3,2016,69,23336253
4,2014,69,26747347
...,...,...,...
811,2016,79,2154
812,2019,33,99996
813,2015,79,3271
814,2014,33,102881


In [45]:
comp = pd.merge(left=pt_df, right=df.loc[df['table_src'] == 'ALL', ['calendar_year', 'table_src', 'dw_row_count', 'dw_uth_clm_id_count']],
                left_on='year', right_on='calendar_year', how='outer')
comp = comp[['year', 'provider_type', 'type_count', 'dw_row_count', 'dw_uth_clm_id_count']]
comp.sort_values('type_count')

Unnamed: 0,year,provider_type,type_count,dw_row_count,dw_uth_clm_id_count
321,2016,C5,1,276518957,80516460
798,2013,30,1,3479005,253129
45,2014,54,1,273755912,78590801
791,2013,24,1,3479005,253129
49,2014,52,1,273755912,78590801
...,...,...,...,...,...
721,2018,,134902353,266343788,78096491
37,2014,,135649388,273755912,78590801
394,2015,,137790115,276242995,79304975
613,2017,,139176274,275137582,80780365


Provider type is on the claim line level. Ideally we should have a provider type for almost all the claims.

In [46]:
pt_sum_df = comp[~comp['provider_type'].isna()].groupby(['year', 'dw_row_count'])['type_count'].sum()
pt_sum_df = pt_sum_df.reset_index()
pt_sum_df['type_to_id'] = 1. * (pt_sum_df['type_count'] / pt_sum_df['dw_row_count'])
pt_sum_df

Unnamed: 0,year,dw_row_count,type_count,type_to_id
0,2013,3479005,109,3.1e-05
1,2014,273755912,138106524,0.504488
2,2015,276242995,138452880,0.5012
3,2016,276518957,136536233,0.493768
4,2017,275137582,135961308,0.494158
5,2018,266343788,131441435,0.493503
6,2019,263546074,131165092,0.497693
7,2020,241930203,120237488,0.496992
8,2021,241361762,125292030,0.519105
