# Data Warehouse Truven QA - Member Enrollment Yearly

Performing QA on tables in dw_staging before moving them to data_warehouse schema

## Initialization

Just loading packages that will be used and initializing connection to GP DB.

In [1]:
import pandas as pd
import sys
import psycopg2
from tqdm.notebook import tqdm
sys.path.append('H:/uth_helpers')
from db_utils import get_dsn

In [2]:
connection = psycopg2.connect(get_dsn()+' keepalives=1 keepalives_idle=30 keepalives_interval=10')
connection.autocommit = True

In [3]:
# connection.close()

## Member Counts and Member Months

This table is an aggregate of the member_enrollment_monthly table where enrollment data is aggregated to a yearly level.

Because of this, we cannot compare the row count of the raw tables with this table since enrollment tables from raw data sources are usually not on a yearly level. Instead, we can try to compare the Member Months (MM) and other columns to see if the aggregrate logic implemented is working as intended

In [26]:
query = '''drop table if exists qa_reporting.dw_truv_mbr_enrl_yearly;
create table qa_reporting.dw_truv_mbr_enrl_yearly
(
    data_source text,
    calendar_year int,
    table_src text,
    dw_row_count int,
    dw_uth_mbr_id_count int,
    dw_src_mbr_id_count int,
    src_mbr_count int,
    mbr_count_diff int,
    mbr_count_percentage float,
    dw_mm int,
    src_mm int,
    mm_diff int,
    mm_diff_percentage float,
    date_generated date
);
'''

with connection.cursor() as cursor:
    cursor.execute(query)

In [27]:
with connection.cursor() as cursor:
      query = '''
insert into qa_reporting.dw_truv_mbr_enrl_yearly
(data_source, calendar_year, table_src, dw_row_count, dw_uth_mbr_id_count, dw_src_mbr_id_count, dw_mm,date_generated)
select data_source, 
        year, 
        table_id_src, 
        count(*),
        count(distinct uth_member_id),
        count(distinct member_id_src),
        sum(total_enrolled_months),
        now()::date
  from dw_staging.truc_member_enrollment_yearly
 group by 1,2,3;
      '''

      cursor.execute(query)

      query = '''
insert into qa_reporting.dw_truv_mbr_enrl_yearly
(data_source, calendar_year, table_src, dw_row_count, dw_uth_mbr_id_count, dw_src_mbr_id_count, dw_mm,date_generated)
select data_source, 
        year, 
        table_id_src, 
        count(*),
        count(distinct uth_member_id),
        count(distinct member_id_src),
        sum(total_enrolled_months),
        now()::date
  from dw_staging.trum_member_enrollment_yearly
 group by 1,2,3;
      '''

      cursor.execute(query)


In [28]:
with connection.cursor() as cursor:
    
      query = '''
update qa_reporting.dw_truv_mbr_enrl_yearly a
set src_mbr_count = b.pat_count,
    mbr_count_diff = a.dw_src_mbr_id_count - b.pat_count,
    mbr_count_percentage = abs(a.dw_src_mbr_id_count - b.pat_count) / b.pat_count
from qa_reporting.truven_counts b
where a.calendar_year = b.year
and a.table_src = b.table_name
;
      '''

      cursor.execute(query)

      query = '''
with enrl_months as(
select 'ccaet' as table_name, year, enrolid, extract(month from dtstart) as enrolled_month
from truven.ccaet
union
select 'mdcrt' as table_name, year, enrolid, extract(month from dtstart) as enrolled_month
from truven.mdcrt
),
enrl_my as (
    select table_name, year, count(enrolled_month) as mm
    from enrl_months
    group by 1,2
)
update qa_reporting.dw_truv_mbr_enrl_yearly a
set src_mm = b.mm,
    mm_diff = a.dw_mm - b.mm,
    mm_diff_percentage = abs(a.dw_mm - b.mm) / b.mm
from enrl_my b
where a.calendar_year = b.year
and a.table_src = b.table_name
;
      '''

      cursor.execute(query)

In [30]:
df = pd.read_sql('''select * 
from qa_reporting.dw_truv_mbr_enrl_yearly 
order by mm_diff ;''', con=connection)
df



Unnamed: 0,data_source,calendar_year,table_src,dw_row_count,dw_uth_mbr_id_count,dw_src_mbr_id_count,src_mbr_count,mbr_count_diff,mbr_count_percentage,dw_mm,src_mm,mm_diff,mm_diff_percentage,date_generated
0,truc,2022,ccaet,20801067,20801067,20801067,20801067,0,0.0,162833833,162833833,0,0.0,2023-07-19
1,truc,2011,ccaet,55559154,55559154,55559154,55559154,0,0.0,564687803,564687803,0,0.0,2023-07-19
2,truc,2018,ccaet,27087740,27087740,27087740,27087740,0,0.0,275947642,275947642,0,0.0,2023-07-19
3,truc,2017,ccaet,26146275,26146275,26146275,26146275,0,0.0,271103078,271103078,0,0.0,2023-07-19
4,truc,2012,ccaet,55975628,55975628,55975628,55975628,0,0.0,567017994,567017994,0,0.0,2023-07-19
5,truc,2016,ccaet,28717738,28717738,28717738,28717738,0,0.0,296640184,296640184,0,0.0,2023-07-19
6,truc,2019,ccaet,25388778,25388778,25388778,25388778,0,0.0,257849054,257849054,0,0.0,2023-07-19
7,truc,2014,ccaet,47258528,47258528,47258528,47258528,0,0.0,475186131,475186131,0,0.0,2023-07-19
8,truc,2013,ccaet,43737217,43737217,43737217,43737217,0,0.0,442452630,442452630,0,0.0,2023-07-19
9,truc,2020,ccaet,23306734,23306734,23306734,23306734,0,0.0,241254229,241254229,0,0.0,2023-07-19


Here we look at the overall difference of member enrollment and member count. We start of by just adding the counts for each of the source tables. However, due to how the yearly table is built, it may be better to look at member counts and member months as the data source overall.

In [7]:
df.groupby('calendar_year')[['mm_diff', 'mm_diff_percentage', 'mbr_count_diff', 'mbr_count_percentage']].sum()

Unnamed: 0_level_0,mm_diff,mm_diff_percentage,mbr_count_diff,mbr_count_percentage
calendar_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,0.0,0.0,0,0.0
2012,0.0,0.0,0,0.0
2013,0.0,0.0,0,0.0
2014,0.0,0.0,0,0.0
2015,0.0,0.0,0,0.0
2016,0.0,0.0,0,0.0
2017,0.0,0.0,0,0.0
2018,0.0,0.0,0,0.0
2019,0.0,0.0,0,0.0
2020,0.0,0.0,0,0.0


Now we will look at the overall member count and member months rather than splitting them up based on the source table.

In [8]:
query = '''
with truv_enrl as (
    -- get unique members and their monthly enrollment to any program per calendar year
    select 'truc' as data_source, year, enrolid, extract(month from dtstart) as enrolled_month
    from truven.ccaet
    union
    select 'trum' as data_source,year, enrolid, extract(month from dtstart) as enrolled_month
    from truven.mdcrt
),
truv_enrl_counts as (
    select data_source, year, count(distinct enrolid) src_mbr_count, count(enrolled_month) src_mm
    from truv_enrl
    group by 1,2
),
dw_truv_enrl as (
    select data_source, year, count(uth_member_id) as dw_mbr_count, sum(total_enrolled_months) as dw_mm
    from dw_staging.truc_member_enrollment_yearly
    group by 1,2
    union
    select data_source, year, count(uth_member_id) as dw_mbr_count, sum(total_enrolled_months) as dw_mm
    from dw_staging.trum_member_enrollment_yearly
    group by 1,2
)
select a.data_source, a.year, a.dw_mbr_count, b.src_mbr_count, 
        a.dw_mbr_count - b.src_mbr_count as mbr_diff,
        a.dw_mm, b.src_mm, a.dw_mm - b.src_mm as mm_diff
from dw_truv_enrl a
join truv_enrl_counts b
on a.year = b.year
and a.data_source = b.data_source;'''

pd.read_sql(query, con=connection)



Unnamed: 0,data_source,year,dw_mbr_count,src_mbr_count,mbr_diff,dw_mm,src_mm,mm_diff
0,truc,2015,28348363,28348363,0,,289793751,
1,trum,2011,5243029,5243029,0,56639459.0,56639459,0.0
2,truc,2017,26146275,26146275,0,,271103078,
3,truc,2018,27087740,27087740,0,,275947642,
4,trum,2019,1632440,1632440,0,18237778.0,18237778,0.0
5,trum,2013,4271755,4271755,0,45238676.0,45238676,0.0
6,trum,2022,1593564,1593564,0,13405957.0,13405957,0.0
7,trum,2012,4874717,4874717,0,51474029.0,51474029,0.0
8,trum,2017,1473787,1473787,0,16283781.0,16283781,0.0
9,truc,2019,25388778,25388778,0,,257849054,


## Gender Counts

Just like with the monthly enrollment table, we need to check the counts and values of the other columns. Here we take a look at the gender_cd column

In [14]:
query = '''with truven_gen_cd as (
    select 'truc' as data_source, year, enrolid, sex
    from truven.ccaet
    union
    select 'trum' as data_source, year, enrolid, sex
    from truven.mdcrt
),
truven_gen as (
    select data_source, year, c.gender_cd, count(*) gender_count
    from truven_gen_cd m
    left outer join reference_tables.ref_gender c
    on c.data_source = 'trv'
    and c.gender_cd_src = m.sex::text
    group by 1,2,3
), dw_gen as (
    select data_source, year, gender_cd, count(*) gender_count
    from dw_staging.trum_member_enrollment_yearly
    group by 1,2
    union
    select data_source, year, gender_cd, count(*) gender_count
    from dw_staging.trum_member_enrollment_yearly
    group by 1,2
)
select a.*, b.gender_count as src_gender_count, 
        a.gender_count - b.gender_count as gender_diff, 
        100. * abs(a.gender_count - b.gender_count) / b.gender_count as gender_diff_percent
from truven_gen b
full outer join dw_gen a
on a.year = b.year
and a.gender_cd = b.gender_cd
order by year, gender_cd;
'''

pd.read_sql(query,  con=connection)



Unnamed: 0,year,gender_cd,gender_count,src_gender_count,gender_diff,gender_diff_percent
0,2011,F,31157196,31157196,0,0.0
1,2011,M,29304232,29304232,0,0.0
2,2012,F,31152383,31152383,0,0.0
3,2012,M,29363662,29363662,0,0.0
4,2013,F,24607955,24607955,0,0.0
5,2013,M,23118079,23118079,0,0.0
6,2014,F,26186789,26186789,0,0.0
7,2014,M,24678391,24678391,0,0.0
8,2015,F,15814141,15814141,0,0.0
9,2015,M,14604398,14604398,0,0.0


## Plan Type

Check if plan types are properly mapped at a yearly level. If the plan type counts do not match between the member_enrollment_yearly table and the raw enrollment table, it most likely occurred when cleaning up the plan_type column. Ideally the count difference should be <= 0.

In [9]:
query = '''with truven_enroll as (
    select 'truc' as data_source, year, enrolid, plantyp
    from truven.ccaet
    union
    select 'trum' as data_source, year, enrolid, plantyp
    from truven.mdcrt
),
truven_plans as (
    select m.data_source, year, d.plan_type, count(*) plan_count
    from truven_enroll m
    left outer join reference_tables.ref_plan_type d
    on d.data_source = 'trv'
  and d.plan_type_src::int = m.plantyp 
    group by 1,2,3
), dw_plans as (
    select data_source, year, plan_type, count(*) plan_count
    from dw_staging.truc_member_enrollment_yearly
    group by 1,2,3
    union
    select data_source, year, plan_type, count(*) plan_count
    from dw_staging.trum_member_enrollment_yearly
    group by 1,2,3
)
select a.*, b.plan_count as src_plan_count, 
        a.plan_count - b.plan_count as plan_diff, 
        100. * abs(a.plan_count - b.plan_count) / b.plan_count as plan_diff_percent
from truven_plans b
join dw_plans a
on a.year = b.year
and a.plan_type = b.plan_type
and a.data_source = b.data_source
order by year;
'''

plan_type_counts_df = pd.read_sql(query,  con=connection)
plan_type_counts_df



Unnamed: 0,data_source,year,plan_type,plan_count,src_plan_count,plan_diff,plan_diff_percent
0,truc,2011,CDHP,2133091,2142622,-9531,0.444829
1,truc,2011,HDHP,1743239,1785936,-42697,2.390735
2,truc,2011,CMP,760764,797369,-36605,4.590723
3,trum,2011,POS,146905,147809,-904,0.611600
4,truc,2011,HMO,6384573,6465608,-81035,1.253324
...,...,...,...,...,...,...,...
168,trum,2022,CMP,193809,197447,-3638,1.842520
169,trum,2022,CDHP,6161,6195,-34,0.548830
170,trum,2022,POS,8181,8280,-99,1.195652
171,truc,2022,POS,2386270,2394286,-8016,0.334797


In [13]:
plan_type_counts_df.groupby('year')['plan_diff_percent'].min(), plan_type_counts_df.groupby('year')['plan_diff_percent'].max()

(year
 2011    0.094561
 2012    0.126337
 2013    0.226725
 2014    0.307192
 2015    0.155604
 2016    0.129447
 2017    0.100513
 2018    0.059592
 2019    0.000000
 2020    0.000000
 2021    0.188186
 2022    0.128008
 Name: plan_diff_percent, dtype: float64,
 year
 2011    4.590723
 2012    4.161845
 2013    3.114153
 2014    3.727955
 2015    3.089090
 2016    2.990624
 2017    1.918633
 2018    2.014752
 2019    2.932677
 2020    8.132530
 2021    4.150608
 2022    4.882108
 Name: plan_diff_percent, dtype: float64)

In [14]:
plan_type_counts_df.sort_values('plan_diff_percent')

Unnamed: 0,data_source,year,plan_type,plan_count,src_plan_count,plan_diff,plan_diff_percent
114,trum,2019,BMM,9,9,0,0.000000
132,trum,2020,EPO,1143,1143,0,0.000000
118,trum,2019,POS,36356,36367,-11,0.030247
134,trum,2020,POS,8597,8602,-5,0.058126
104,trum,2018,POS,46958,46986,-28,0.059592
...,...,...,...,...,...,...,...
153,truc,2021,BMM,3233,3373,-140,4.150608
26,truc,2012,EPO,1065980,1112271,-46291,4.161845
2,truc,2011,CMP,760764,797369,-36605,4.590723
161,truc,2022,BMM,3429,3605,-176,4.882108


In [15]:
plan_types_agg_df = plan_type_counts_df.groupby('year')['plan_count', 'src_plan_count', 'plan_diff'].sum()
plan_types_agg_df

  plan_types_agg_df = plan_type_counts_df.groupby('year')['plan_count', 'src_plan_count', 'plan_diff'].sum()


Unnamed: 0_level_0,plan_count,src_plan_count,plan_diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011,56234058,56639923,-405865
2012,56365447,56719207,-353760
2013,47205689,47536034,-330345
2014,47291917,47769682,-477765
2015,29781940,30044207,-262267
2016,30361760,30649097,-287337
2017,27144179,27266303,-122124
2018,27496336,27618006,-121670
2019,26571547,26719852,-148305
2020,24453571,24628708,-175137


There are members that do not have a plantyp in the raw data for a given month. Let's see if the number of null plan types decrease at a yearly level

In [16]:
query = '''with truven_enroll as (
    select 'truc' as data_source, year, enrolid, plantyp
    from truven.ccaet
    where plantyp is null
    union
    select 'trum' as data_source, year, enrolid, plantyp
    from truven.mdcrt
    where plantyp is null
),
truven_plans as (
    select data_source, year, count(*) plan_count
    from truven_enroll
    group by 1,2
), dw_plans as (
    select data_source, year, count(*) plan_count
    from dw_staging.trum_member_enrollment_yearly
    where plan_type is null
    group by 1,2
    union
    select data_source, year, count(*) plan_count
    from dw_staging.truc_member_enrollment_yearly
    where plan_type is null
    group by 1,2
)
select a.*, b.plan_count as src_plan_count, 
        a.plan_count - b.plan_count as plan_diff, 
        100. * abs(a.plan_count - b.plan_count) / b.plan_count as plan_diff_percent
from truven_plans b
join dw_plans a
on a.year = b.year
and a.data_source = b.data_source
order by year;
'''

df = pd.read_sql(query,  con=connection)
df



Unnamed: 0,data_source,year,plan_count,src_plan_count,plan_diff,plan_diff_percent
0,trum,2011,230670,232211,-1541,0.663621
1,truc,2011,4337455,4414774,-77319,1.751369
2,trum,2012,199717,201468,-1751,0.869121
3,truc,2012,4285181,4320736,-35555,0.822892
4,truc,2013,728104,740882,-12778,1.724701
5,trum,2013,75179,75534,-355,0.469987
6,truc,2014,3787841,3826044,-38203,0.998499
7,trum,2014,47600,47911,-311,0.64912
8,trum,2015,23451,23723,-272,1.146567
9,truc,2015,742605,830666,-88061,10.601252


## BUS_CD

In [17]:
query = '''with truven_enroll as (
    select 'truc' as data_source,  year, enrolid, case when medadv = '1' then 'MA' else 'COM' end as bus_cd
    from truven.ccaet
    union
    select 'trum' as data_source, year, enrolid, case when medadv = '1' then 'MA' else 'MS' end as bus_cd
    from truven.mdcrt
),
truven_plans as (
    select data_source, year, bus_cd, count(*) plan_count
    from truven_enroll
    group by 1,2,3
), dw_plans as (
    select data_source, year, bus_cd, count(*) plan_count
    from dw_staging.truc_member_enrollment_yearly
    group by 1,2,3
    union
    select data_source, year, bus_cd, count(*) plan_count
    from dw_staging.trum_member_enrollment_yearly
    group by 1,2,3
)
select a.*, b.plan_count as src_plan_count, 
        a.plan_count - b.plan_count as plan_diff, 
        100. * abs(a.plan_count - b.plan_count) / b.plan_count as plan_diff_percent
from truven_plans b
full join dw_plans a
on a.year = b.year
and a.bus_cd = b.bus_cd
and a.data_source = b.data_source
order by year;
'''

df = pd.read_sql(query,  con=connection)
df



Unnamed: 0,data_source,year,bus_cd,plan_count,src_plan_count,plan_diff,plan_diff_percent
0,trum,2011,MS,5243029,5243029,0,0.0
1,truc,2011,COM,55559154,55559154,0,0.0
2,truc,2012,COM,55975628,55975628,0,0.0
3,trum,2012,MS,4874717,4874717,0,0.0
4,truc,2013,COM,43737217,43737217,0,0.0
5,trum,2013,MS,4271755,4271755,0,0.0
6,trum,2014,MS,3868830,3868830,0,0.0
7,truc,2014,COM,47258528,47258528,0,0.0
8,trum,2015,MS,2199633,2199633,0,0.0
9,truc,2015,COM,28348363,28348363,0,0.0


In [18]:
df = df.groupby('year')['plan_count', 'src_plan_count', 'plan_diff'].sum()
df['percent_diff'] = 100. * abs(df['plan_count'] - df['src_plan_count']) / df['src_plan_count']
df

  df = df.groupby('year')['plan_count', 'src_plan_count', 'plan_diff'].sum()


Unnamed: 0_level_0,plan_count,src_plan_count,plan_diff,percent_diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,60802183,60802183,0,0.0
2012,60850345,60850345,0,0.0
2013,48008972,48008972,0,0.0
2014,51127358,51127358,0,0.0
2015,30547996,30547996,0,0.0
2016,30856457,30856457,0,0.0
2017,27620062,27620062,0,0.0
2018,28218353,28218353,0,0.0
2019,27021218,27026699,-5481,0.02028
2020,25009767,25016511,-6744,0.026958


## State

In [19]:
query = '''with truven_enroll as (
    select 'truc' as data_source, year, enrolid, egeoloc
    from truven.ccaet
    union
    select 'trum' as data_source, year, enrolid, egeoloc
    from truven.mdcrt
),
truven_plans as (
    select data_source, year::int, case when length(s.abbr) > 2 then '' when m.egeoloc is null then 'null' else s.abbr  end as state, count(*) state_count
    from truven_enroll m
    left outer join reference_tables.ref_truven_state_codes s 
    on m.egeoloc=s.truven_code
    group by 1,2,3
), dw_plans as (
    select data_source, year, case when state is null then 'null' else state end as state, count(*) state_count
    from dw_staging.truc_member_enrollment_yearly
    group by 1,2,3
    union
    select data_source, year, case when state is null then 'null' else state end as state, count(*) state_count
    from dw_staging.trum_member_enrollment_yearly
    group by 1,2,3
)
select a.*, b.state_count as src_state_count, 
        a.state_count - b.state_count as plan_diff, 
        100. * abs(a.state_count - b.state_count) / b.state_count as plan_diff_percent
from truven_plans b
full join dw_plans a
on a.year = b.year
and a.state = b.state
and a.data_source = b.data_source
order by year;
'''

df = pd.read_sql(query,  con=connection)
df



Unnamed: 0,data_source,year,state,state_count,src_state_count,plan_diff,plan_diff_percent
0,truc,2011,CA,6050362,6071691,-21329,0.351286
1,truc,2011,AR,275980,278939,-2959,1.060805
2,truc,2011,MI,1524527,1534759,-10232,0.666684
3,trum,2011,MI,386341,388609,-2268,0.583620
4,truc,2011,PR,15856,16055,-199,1.239489
...,...,...,...,...,...,...,...
1267,trum,2022,NJ,24226,24448,-222,0.908050
1268,trum,2022,MN,3816,3877,-61,1.573381
1269,truc,2022,NJ,580550,588508,-7958,1.352233
1270,truc,2022,NM,41289,42205,-916,2.170359


We see that there is a large difference in state values assigned using DW logic compared to raw data. Ideally the difference should show that there are more values in the raw data compared to the values in DW due, especially for state values of NULL and ''. When this happens, it means that the member now has a valid state value which can be used to group these member in future projects.

In [20]:
df.sort_values('plan_diff_percent', ascending=False)

Unnamed: 0,data_source,year,state,state_count,src_state_count,plan_diff,plan_diff_percent
412,truc,2014,,1112700,1541140,-428440,27.800200
1207,trum,2022,ID,3,4,-1,25.000000
1019,truc,2020,PR,665,877,-212,24.173318
797,truc,2018,PR,848,1088,-240,22.058824
193,truc,2012,,1192425,1525493,-333068,21.833466
...,...,...,...,...,...,...,...
406,trum,2014,LA,133239,133422,-183,0.137159
287,trum,2013,LA,133839,134009,-170,0.126857
476,trum,2015,LA,109369,109464,-95,0.086787
542,trum,2016,LA,110923,111000,-77,0.069369


In [21]:
df['state'].unique()

array(['CA', 'AR', 'MI', 'PR', 'PA', 'AL', 'MN', 'AZ', 'CO', 'NV', 'ID',
       'SC', 'MA', 'NC', 'NY', '', 'DC', 'SD', 'HI', 'NJ', 'KS', 'UT',
       'WI', 'DE', 'RI', 'FL', 'NE', 'KY', 'AK', 'VA', 'TX', 'NM', 'OR',
       'ND', 'MT', 'ME', 'WV', 'CT', 'MS', 'IL', 'VT', 'WY', 'IN', 'WA',
       'MO', 'NH', 'LA', 'OH', 'TN', 'GA', 'MD', 'OK', 'IA', 'null'],
      dtype=object)

In [22]:
df[df['state'] == '']

Unnamed: 0,data_source,year,state,state_count,src_state_count,plan_diff,plan_diff_percent
17,truc,2011,,1661011,1783095,-122084,6.846747
21,trum,2011,,121195,133778,-12583,9.405881
168,trum,2012,,141729,161825,-20096,12.418353
193,truc,2012,,1192425,1525493,-333068,21.833466
274,trum,2013,,80231,99161,-18930,19.090166
282,truc,2013,,1197652,1531795,-334143,21.81382
355,trum,2014,,82177,98319,-16142,16.417986
412,truc,2014,,1112700,1541140,-428440,27.8002
425,truc,2015,,85822,103218,-17396,16.85365
453,trum,2015,,2634,3079,-445,14.452744


In [23]:
query = '''with truven_enroll as (
    select 'truc' as data_source, year, enrolid, egeoloc
    from truven.ccaet
    union
    select 'trum' as data_source, year, enrolid, egeoloc
    from truven.mdcrt
)
    select data_source, year, case when length(s.abbr) > 2 then '' else s.abbr end as state, count(*) state_count
    from truven_enroll m
    left outer join reference_tables.ref_truven_state_codes s 
    on m.egeoloc=s.truven_code
    group by 1,2,3'''

temp = pd.read_sql(query,  con=connection)



In [24]:
temp

Unnamed: 0,data_source,year,state,state_count
0,truc,2014.0,DE,186096
1,trum,2019.0,UT,948
2,truc,2017.0,AK,19816
3,truc,2020.0,NJ,673804
4,truc,2020.0,VA,808050
...,...,...,...,...
1267,trum,2014.0,,98319
1268,trum,2017.0,MD,6910
1269,trum,2018.0,AK,157
1270,trum,2019.0,FL,122781


In [25]:
temp[temp['state'].isin([None])]

Unnamed: 0,data_source,year,state,state_count
53,truc,2021.0,,10865
152,trum,2022.0,,196
534,trum,2021.0,,201
948,truc,2022.0,,11007
