In [31]:
#@title
# How to update the data:
# 1. You may need to copy this colab so you have your own version.
# 2. Update the history constants below to have the latest data's suffix.
# 3. Update the date history constants below to be the last case date included in the data.
# 4. The scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
#    Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full update of
#    the paper, including text and tables, a lot of that is done in commented-out PrintSummaryStats() statements.

import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
# Show floats in displayed DataFrames with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Table name suffixes, one per CDC dataset release (YYYYMMDD), oldest first.
# The five *_HISTORY lists in this cell are parallel: entry i of every list
# describes release i (they are indexed together and each list's last entry is
# taken as "current"), so they must stay the same length and in the same order.
DATE_HISTORY = [
    '20200529',
    '20200627',
    '20200731',
    '20200831',
    '20200930',
    '20201031',
    '20201204',
    '20201231',
    '20210131',
    '20210228',
    '20210331',
    '20210430',
    '20210607',
    '20210621',
    '20210706',
    '20210719',
]

# Cases up to these dates for querying NYT case counts.
# Each entry is a BigQuery DATE(...) expression that is substituted verbatim
# into the `date = %s` filter of the NYT query templates.
DATE_UP_TO_HISTORY = [
    'DATE(2020, 05, 14)',
    'DATE(2020, 06, 12)',
    'DATE(2020, 07, 16)',
    'DATE(2020, 08, 17)',
    'DATE(2020, 09, 15)',
    'DATE(2020, 10, 16)',
    'DATE(2020, 11, 19)',
    'DATE(2020, 12, 16)',
    'DATE(2021, 01, 16)',
    'DATE(2021, 02, 13)',
    'DATE(2021, 03, 16)',
    'DATE(2021, 04, 15)',
    'DATE(2021, 05, 21)',
    'DATE(2021, 06, 07)',
    'DATE(2021, 06, 22)',
    'DATE(2021, 07, 05)',
]

# Cases up to these dates for charts.
# Human-readable form of DATE_UP_TO_HISTORY, used in chart titles.
DATE_UP_TO_DISPLAY_NAME_HISTORY = [
    'May 14',
    'Jun 12',
    'Jul 16',
    'Aug 17',
    'Sep 15',
    'Oct 16',
    'Nov 19',
    'Dec 16',
    'Jan 16',
    'Feb 13',
    'Mar 16',
    'Apr 15',
    'May 21',
    'Jun 07',
    'Jun 22',
    'Jul 05',
]

# Dates for Altair line charts; the 1st of the month formatted to the previous day.
# CreateLineChart bins these with yearmonth(date), so only the year and month
# of each entry reach the x axis.
# NOTE(review): '2021-06-28' appears twice (releases 20210621 and 20210706), so
# those two releases share one x position on the line chart - confirm intended.
DATE_YEAR_HISTORY = [
    '2020-05-28',
    '2020-06-28',
    '2020-07-28',
    '2020-08-28',
    '2020-09-28',
    '2020-10-28',
    '2020-11-28',
    '2020-12-28',
    '2021-01-28',
    '2021-02-28',
    '2021-03-28',
    '2021-04-28',
    '2021-05-28',
    '2021-06-28',
    '2021-06-28',
    '2021-07-28',
]

# Dates for Altair bar charts, which were less particular than the line charts.
# FieldAnalysis uses entry i as the bar label for release i.
# NOTE(review): 'Jun 2021' appears twice, so two releases get the same bar
# label and their rows land in a single bar - confirm intended.
DATE_YEAR_DISPLAY_NAME_HISTORY = [
    'May 2020',
    'Jun 2020',
    'Jul 2020',
    'Aug 2020',
    'Sep 2020',
    'Oct 2020',
    'Nov 2020',
    'Dec 2020',
    'Jan 2021',
    'Feb 2021',
    'Mar 2021',
    'Apr 2021',
    'May 2021',
    'Jun 2021',
    'Jun 2021',
    'Jul 2021',
]

# Project and table names.
PROJECT_ID = 'msm-secure-data-1b'

def FullTableName(date):
  """Returns the backtick-quoted BigQuery name of the CDC table for one release.

  Args:
    date: Release suffix of the table (an entry of DATE_HISTORY).

  Returns:
    The fully qualified table name, wrapped in backticks so it can be dropped
    straight into a query string.
  """
  qualified_name = '`%s.ndunlap_secure.cdc_restricted_access_%s`'
  return qualified_name % (PROJECT_ID, date)

# The history constants are parallel lists that get indexed together (entry i
# of each describes release i, and the last entry of each is the current
# release). Fail fast if an update touched only some of them; otherwise the
# newest CDC table would silently be paired with an older NYT cut-off date.
_HISTORY_LENGTHS = {
    'DATE_HISTORY': len(DATE_HISTORY),
    'DATE_UP_TO_HISTORY': len(DATE_UP_TO_HISTORY),
    'DATE_UP_TO_DISPLAY_NAME_HISTORY': len(DATE_UP_TO_DISPLAY_NAME_HISTORY),
    'DATE_YEAR_HISTORY': len(DATE_YEAR_HISTORY),
    'DATE_YEAR_DISPLAY_NAME_HISTORY': len(DATE_YEAR_DISPLAY_NAME_HISTORY),
}
if len(set(_HISTORY_LENGTHS.values())) != 1:
  raise ValueError(
      'The *_HISTORY constants must all have the same number of entries: %s'
      % _HISTORY_LENGTHS)

# One fully qualified CDC table per release; the last one is the current release.
CDC_TABLE_HISTORY = [FullTableName(date) for date in DATE_HISTORY]
CDC_TABLE = CDC_TABLE_HISTORY[-1]

# Dates in different formats.
# Both describe the NYT cut-off date that goes with CDC_TABLE.
DATE = DATE_UP_TO_HISTORY[-1]
DATE_DISPLAY_NAME = DATE_UP_TO_DISPLAY_NAME_HISTORY[-1]

# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
# Passed as scale_max to the chart helpers, where it is the upper bound of both
# scatterplot axes; points beyond it fall outside the plotted domain.
TOTAL_CASES_SCALE_MAX = 4000000

# Chart settings.
# Heights and widths handed to Altair's .properties(height=..., width=...).
SCATTER_HEIGHT = 300
SCATTER_WIDTH = 300
MAP_HEIGHT = 300
MAP_WIDTH = 450
# TopoJSON features of the vega_datasets US 10m map: state and county shapes.
US_STATES_TOPO = alt.topo_feature(data.us_10m.url, 'states')
# NOTE(review): the trailing "#" makes this URL differ from the states one while
# pointing at the same file; presumably a workaround so Altair/Vega treats the
# two features as separate data sources - confirm before removing.
US_COUNTIES_TOPO = alt.topo_feature(data.us_10m.url+"#", 'counties')

# Territory abbreviations as used in the CDC data, and the matching territory
# names as used in the NYT data. The two tuples are in the same order.
TERRITORIES = ('PR', 'GU', 'VI', 'MP', 'AS')
NYT_TERRITORIES = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')

# Postal abbreviation -> state FIPS code, one entry per state/DC/territory.
# The previous literal listed AS, GU, PR and VI twice (with codes 3, 14, 43 and
# 52 first); Python keeps the last value, so only 60, 66, 72 and 78 were ever
# in effect. The shadowed entries are dropped here.
_STATE_ABBREVIATION_TO_FIPS = {
    'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12,
    'GA': 13, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23,
    'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34,
    'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'RI': 44, 'SC': 45,
    'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56,
    'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78,
}
# Extra spellings that should resolve to an existing FIPS code.
_ALTERNATE_ABBREVIATION_TO_FIPS = {'USVI': 78, 'NYC': 36}

# Lookup used to turn state abbreviations into FIPS codes (aliases included).
STATES_TO_FIPS = {**_STATE_ABBREVIATION_TO_FIPS, **_ALTERNATE_ABBREVIATION_TO_FIPS}
# Reverse lookup. Built from the primary abbreviations only, so 36 maps to 'NY'
# and 78 to 'VI' instead of being overwritten by the 'NYC' and 'USVI' aliases.
FIPS_TO_STATES = {fips: abbreviation for abbreviation, fips in _STATE_ABBREVIATION_TO_FIPS.items()}
# Maps each race_ethnicity_combined value to a per-group case-count column name.
# 'Missing' and 'Unknown' deliberately share one column.
RACE_ETHNICITY_COMBINED_MAP = dict([
    ('Asian, Non-Hispanic', 'asian_cases'),
    ('Black, Non-Hispanic', 'black_cases'),
    ('White, Non-Hispanic', 'white_cases'),
    ('American Indian/Alaska Native, Non-Hispanic', 'aian_cases'),
    ('Hispanic/Latino', 'hispanic_cases'),
    ('Multiple/Other, Non-Hispanic', 'other_cases'),
    ('Native Hawaiian/Other Pacific Islander, Non-Hispanic', 'nhpi_cases'),
    ('Missing', 'unknown_cases'),
    ('Unknown', 'unknown_cases'),
    ('NA', 'na_cases'),
])

# The values that do not count as a known race/ethnicity, and the column each
# of them is collapsed into.
_NOT_KNOWN_RACE_ETHNICITY_COLUMNS = {
    'Missing': 'cdc_unknown_cases',
    'Unknown': 'cdc_unknown_cases',
    'NA': 'cdc_na_cases',
}

# Same keys as RACE_ETHNICITY_COMBINED_MAP, but collapsed to three columns:
# known, unknown (Missing/Unknown) and NA.
RACE_ETHNICITY_COMBINED_KNOWN_MAP = {
    category: _NOT_KNOWN_RACE_ETHNICITY_COLUMNS.get(category, 'cdc_known_cases')
    for category in RACE_ETHNICITY_COMBINED_MAP
}

In [32]:
#@title

# Query templates. Each *_QUERY_STR has a single %s placeholder that is filled
# with Python %-formatting: a DATE(...) expression for the NYT templates, a
# backtick-quoted table name for the CDC templates.

# Total NYT confirmed cases over all rows of us_states on one date.
NYT_US_QUERY_STR = ('''
SELECT
  sum(confirmed_cases) as nyt_cases,
  FROM `bigquery-public-data.covid19_nyt.us_states`
  WHERE
  date = %s
''')

# NYT confirmed cases and deaths per state on one date; rows without a FIPS
# code are excluded.
NYT_STATES_QUERY_STR = ('''
SELECT
  state_name,
  state_fips_code,
  confirmed_cases as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''')

# Per-state NYT query for the current release's cut-off date.
NYT_STATES_QUERY = NYT_STATES_QUERY_STR % DATE

# Number of CDC case rows per res_state.
CDC_STATES_QUERY_STR = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_state
''')

# Per-state CDC query against the current release's table.
CDC_STATES_QUERY = CDC_STATES_QUERY_STR % CDC_TABLE

# Number of CDC case rows per (res_state, race_ethnicity_combined) pair.
CDC_STATES_RACE_QUERY_STR = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_state,
   race_ethnicity_combined
''')

# Per-state, per-race/ethnicity CDC query against the current release's table.
CDC_STATES_RACE_QUERY = CDC_STATES_RACE_QUERY_STR % CDC_TABLE

# SQL expression that derives a combined race/ethnicity value from the separate
# race and ethnicity columns. FieldAnalysis splices it into its query in place
# of reading a race_ethnicity_combined column. Note that for non-Hispanic rows
# it yields the bare race value.
RACE_ETHNICITY_COMBINED_STR = '''
  CASE
    WHEN ethnicity = "Non-Hispanic/Latino" AND race != "Missing" THEN race
    WHEN ethnicity = "Hispanic/Latino" THEN ethnicity
    WHEN ethnicity = "NA" THEN "NA"
    WHEN ethnicity = "Missing" AND race = "Missing" THEN "Missing"
    ELSE "Unknown"
  END
'''

In [33]:
#@title
# This either works for a list of fields and one table or a list of tables and one field. 
def FieldAnalysis(project_id, table_list, field_list, title, calculate_race_ethnicity=False):
  """Measures how complete CDC fields are and charts the breakdown.

  Runs in one of two modes, picked by the length of `field_list`:
    * more than one field: every field is analysed in `table_list[0]`;
    * exactly one field: that field is analysed in every table of `table_list`,
      and bar i is labelled with DATE_YEAR_DISPLAY_NAME_HISTORY[i].

  For each field/table pair the rows are grouped by field value and split into
  four buckets: 'Missing' (values 'Missing', '', 'OTH', 'nul' and SQL NULL),
  'Unknown', 'NA' (shown as 'Suppressed' in the chart) and everything else
  ('Known').

  Args:
    project_id: BigQuery project the queries are billed to.
    table_list: Backtick-quoted table names.
    field_list: Column names to analyse.
    title: Chart title.
    calculate_race_ethnicity: If True, 'race_ethnicity_combined' is derived
      with RACE_ETHNICITY_COMBINED_STR instead of read as a column. The first
      release, CDC_TABLE_HISTORY[0], is always handled this way (presumably
      because that table has no such column - TODO confirm).

  Returns:
    A tuple (total_count_history, field_known_history, chart). The two lists
    hold, per table, the total row count and the fraction of rows with a known
    value; both stay empty in multi-field mode. `chart` is the Altair bar chart.

  Raises:
    ValueError: If `table_list` or `field_list` is empty.
  """
  if not table_list or not field_list:
    raise ValueError('FieldAnalysis needs at least one table and one field.')

  compare_fields = len(field_list) > 1
  if compare_fields:
    list_for_iteration = field_list
    table = table_list[0]
  else:
    list_for_iteration = table_list
    field = field_list[0]

  # Index labels that all count as a missing value ('Null' stands for SQL NULL).
  missing_labels = ['Missing', 'Null', '', 'OTH', 'nul']
  field_known_history = []
  total_count_history = []
  field_series = []
  value_series = []
  percent_series = []
  cases_series = []
  chart_denominator = 1000000

  field_display_name = {
    'cdc_case_earliest_dt': 'CDC earliest case date',
    'current_status': 'Case status',
    'case_month': 'Case month',
    'res_state': 'State',
    'res_county': 'County',
    'sex': 'Sex',
    'age_group': 'Age',
    'race': 'Race',
    'ethnicity': 'Ethnicity',
    'race_ethnicity_combined': 'Race/Ethnicity',
  }

  # Placeholders are (field, table, field) in both templates.
  column_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    GROUP BY
      %s
    ''')
  derived_race_ethnicity_query = ('''
      SELECT ''' + RACE_ETHNICITY_COMBINED_STR + ''' 
        as %s,
        count(*) as cases
      FROM
        %s
      GROUP BY
        %s
      ''')

  for i, item in enumerate(list_for_iteration):
    if compare_fields:
      field = item
    else:
      table = item
    # Compare the *table* with the first release. Comparing `item` (as before)
    # could never match in multi-field mode, where `item` is a field name.
    derive_race_ethnicity = field == 'race_ethnicity_combined' and (
        calculate_race_ethnicity or table == CDC_TABLE_HISTORY[0])
    field_unknowns_query = (
        derived_race_ethnicity_query if derive_race_ethnicity else column_query)
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df = field_unknowns_df.set_index(field)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    cases = field_unknowns_df.cases
    total_count = cases.sum()
    missing_count = cases[cases.index.isin(missing_labels)].sum()
    unknown_count = cases[cases.index.isin(['Unknown'])].sum()
    na_count = cases[cases.index.isin(['NA'])].sum()
    known_count = total_count - (missing_count + unknown_count + na_count)

    if compare_fields:
      field_series.extend([field_display_name.get(field, field)] * 4)
    else:
      field_series.extend([DATE_YEAR_DISPLAY_NAME_HISTORY[i]] * 4)
      field_known_history.append(known_count / total_count)
      total_count_history.append(total_count)
    # Keep the three series below in the same bucket order; 'NA' is displayed
    # as 'Suppressed'.
    value_series.extend(['Known', 'Suppressed', 'Unknown', 'Missing'])
    bucket_counts = [known_count, na_count, unknown_count, missing_count]
    percent_series.extend([count / total_count for count in bucket_counts])
    cases_series.extend([count / chart_denominator for count in bucket_counts])

  # Build the chart data once, after every field/table has been counted.
  bars_df = pd.DataFrame.from_dict({'field': field_series,
                                    'value': value_series,
                                    'percent': percent_series,
                                    'cases': cases_series})
  chart = alt.Chart(bars_df).mark_bar().encode(
      x=alt.X('percent:Q', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field:N', title='Field', sort=field_list),
      color=alt.Color('value:N', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
      ]
  ).properties(title=title)
  return total_count_history, field_known_history, chart

def CreateNYTStateDataframe(query, include_territories=False):
  """Loads NYT per-state case counts, indexed by integer state FIPS code.

  Args:
    query: BigQuery SQL returning at least `state_name` and `state_fips_code`.
    include_territories: If False, rows whose `state_name` is one of
      NYT_TERRITORIES are dropped.

  Returns:
    The query result with `state_fips_code` cast to int and used as the index.
  """
  nyt_states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  if not include_territories:
    nyt_states_df = nyt_states_df[~nyt_states_df.state_name.isin(NYT_TERRITORIES)]
  # assign() returns a new frame, so the filtered slice above is never written
  # to (which is what triggered pandas' SettingWithCopyWarning).
  nyt_states_df = nyt_states_df.assign(
      state_fips_code=nyt_states_df.state_fips_code.astype(int))
  return nyt_states_df.set_index('state_fips_code')

def CreateCDCStateDataframe(query):
  """Loads CDC per-state case counts, indexed by integer state FIPS code.

  Rows whose `res_state` is 'Unknown', 'NA', 'Missing' or 'OCONUS' are dropped
  and `res_state` is renamed to `state`. The FIPS code comes from
  STATES_TO_FIPS; a state value that is not in the mapping is used as is when
  it parses as a number, and becomes -1 otherwise.

  Args:
    query: BigQuery SQL returning at least a `res_state` column.

  Returns:
    The cleaned query result indexed by `state_fips_code`.
  """
  states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  placeholder_states = ['Unknown', 'NA', 'Missing', 'OCONUS']
  # Every step below returns a new frame instead of mutating the filtered
  # slice in place, which avoids pandas' SettingWithCopyWarning.
  states_df = states_df[~states_df.res_state.isin(placeholder_states)]
  states_df = states_df.rename(columns={'res_state': 'state'})
  states_df = states_df.assign(state_fips_code=states_df.state)
  states_df = states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
  fips_codes = pd.to_numeric(
      states_df.state_fips_code, errors='coerce').fillna(-1).astype(int)
  return states_df.assign(state_fips_code=fips_codes).set_index('state_fips_code')

def CreateCDCStateRaceDataframe(query, cases_field_prefix, include_territories=False):
  """Builds a per-state table of race/ethnicity completeness for CDC cases.

  The query result is pivoted so each state is one row with one column per
  RACE_ETHNICITY_COMBINED_KNOWN_MAP bucket plus a row total, and the derived
  `<prefix>known_cases`, `<prefix>known_or_na_cases` and matching `_percent`
  columns (fractions rounded to 4 decimals) are added.

  Args:
    query: BigQuery SQL returning `res_state`, `race_ethnicity_combined` and
      a `<prefix>cases` count column.
    cases_field_prefix: Prefix of the count column and of the derived columns.
      The bucket names in RACE_ETHNICITY_COMBINED_KNOWN_MAP are hard-coded
      with 'cdc_', so that is the only prefix the derived columns line up with.
    include_territories: If False, rows whose state is in TERRITORIES are dropped.

  Returns:
    A DataFrame indexed by `state_fips_code` with one row per remaining state.
  """
  states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  placeholder_states = ['Unknown', 'NA', 'Missing', 'OCONUS']
  states_df = states_df[~states_df.res_state.isin(placeholder_states)]

  # Each step returns a new frame so the filtered slice is never mutated.
  states_df = states_df.assign(
      race_ethnicity_combined=states_df.race_ethnicity_combined.astype('string').str.strip())
  states_df = states_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_KNOWN_MAP})
  states_df = states_df.rename(columns={'res_state': 'state'})

  cases_field = cases_field_prefix + 'cases'
  # 'sum' (the string) rather than the builtin: pandas 2.x warns about builtin
  # callables as aggfunc.
  crosstab_df = pd.crosstab(states_df['state'],
                            states_df.race_ethnicity_combined,
                            values=states_df[cases_field],
                            aggfunc='sum',
                            margins=True,
                            margins_name=cases_field
  )
  # Have to reset_index() to go from pandas multi-index to single index.
  crosstab_df = crosstab_df.reset_index()
  # The last row is the column-totals margin, not a state.
  crosstab_df = crosstab_df.drop(index=len(crosstab_df) - 1)
  # A release without any 'NA' (or without any 'Unknown'/'Missing') rows yields
  # no such column; add it empty so the arithmetic below still works.
  for bucket in ('cdc_na_cases', 'cdc_unknown_cases'):
    if bucket not in crosstab_df.columns:
      crosstab_df[bucket] = float('nan')

  total_cases = crosstab_df[cases_field]
  na_cases = crosstab_df['cdc_na_cases'].fillna(0)
  unknown_cases = crosstab_df['cdc_unknown_cases'].fillna(0)
  crosstab_df[cases_field_prefix + 'known_cases'] = total_cases - na_cases - unknown_cases
  crosstab_df[cases_field_prefix + 'known_or_na_cases'] = total_cases - unknown_cases
  crosstab_df[cases_field_prefix + 'known_cases_percent'] = (
      crosstab_df[cases_field_prefix + 'known_cases'] / total_cases).round(4)
  crosstab_df[cases_field_prefix + 'known_or_na_cases_percent'] = (
      crosstab_df[cases_field_prefix + 'known_or_na_cases'] / total_cases).round(4)
  crosstab_df['state_fips_code'] = crosstab_df.state
  crosstab_df = crosstab_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})

  # Remove territories and missing states for calculating summary stats.
  excluded_states = ['NA', 'Missing', 'Unknown']
  if not include_territories:
    excluded_states.extend(TERRITORIES)
  crosstab_df = crosstab_df[~crosstab_df.state.isin(excluded_states)]
  return crosstab_df.set_index('state_fips_code')

def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  """Builds an interactive scatterplot of two metrics, colored by a third.

  Args:
    chart_df: DataFrame with the geography column ('state', or 'state_county'
      when `geo` is 'county') and the three columns named in `fields_dict`.
    fields_dict: Dict with keys 'x', 'y' and 'percent'; each value is a dict
      with the column 'name', a tooltip 'format' and a display 'title'.
    title: Chart title.
    scale_max: Upper bound of both axes and of the reference line.
    height: Chart height.
    width: Chart width.
    geo: 'county' for county data; anything else is treated as state data.
    metric_type: 'ratio' (diverging color scale over [0, 2], y = x reference
      line) or 'percent' (color scale over [0, 1], reference rules at 0.5).

  Returns:
    The layered, interactive Altair chart.

  Raises:
    ValueError: If `metric_type` is neither 'ratio' nor 'percent'.
  """
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'
  else:
    # Previously this fell through and failed later with an UnboundLocalError.
    raise ValueError(
        "metric_type must be 'ratio' or 'percent', got %r" % (metric_type,))

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  # Note: the bare `plot.interactive()` call that used to sit here was removed.
  # interactive() returns a new chart and the result was discarded, so it had
  # no effect; the layered chart is made interactive below.

  if metric_type == 'ratio':
    # y = x reference line: points on it have equal x and y values.
    line = pd.DataFrame({
        'x': [0, scale_max],
        'y': [0, scale_max],
    })
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  else:
    # Horizontal and vertical rules at 50%.
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interactive for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  """Builds a US choropleth map colored by the 'percent' metric.

  Args:
    chart_df: DataFrame with the geography column ('state' or 'state_county'),
      the FIPS column ('state_fips_code' or 'county_fips') that is matched
      against the map feature ids, and the columns named in `fields_dict`.
    fields_dict: Dict with keys 'x', 'y' and 'percent'; each value is a dict
      with the column 'name', a tooltip 'format' and a display 'title'.
    title: Chart title.
    scale_max: Not used by the map; kept so the signature matches
      CreateScatterPlot.
    height: Chart height.
    width: Chart width.
    geo: 'county' for a county map; anything else gives a state map.
    metric_type: 'ratio' (diverging color scale over [0, 2]) or 'percent'
      (color scale over [0, 1]).

  Returns:
    A layered Altair chart: a silver base map, the colored shapes, and a white
    state outline on top.

  Raises:
    ValueError: If `metric_type` is neither 'ratio' nor 'percent'.
  """
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = US_STATES_TOPO
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = US_COUNTIES_TOPO

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
  else:
    # Previously this fell through and failed later with an UnboundLocalError.
    raise ValueError(
        "metric_type must be 'ratio' or 'percent', got %r" % (metric_type,))

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  # Columns copied from chart_df onto each map shape by the lookup below.
  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  # Unfilled white state borders, drawn on top of the colored shapes.
  states_outline = alt.Chart(US_STATES_TOPO).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  # Silver base layer, drawn underneath the colored shapes.
  states_fill = alt.Chart(US_STATES_TOPO).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  """Places the scatterplot and the map for one metric side by side.

  Both charts get the same data, field descriptions and title. The map reuses
  `scatter_height` as its height (there is no separate map height argument),
  and the legend gradient is made 50 shorter than that height.

  Returns:
    The horizontally concatenated, configured Altair chart.
  """
  scatter_chart = CreateScatterPlot(
      chart_df, fields_dict, title, total_cases_scale_max,
      scatter_height, scatter_width, geo, metric_type)
  map_chart = CreateMap(
      chart_df, fields_dict, title, total_cases_scale_max,
      scatter_height, map_width, geo, metric_type)

  side_by_side = scatter_chart | map_chart
  without_view_border = side_by_side.configure_view(strokeWidth=0)
  with_grey_strokes = without_view_border.configure_mark(stroke='grey')
  legend_length = scatter_height - 50
  return with_grey_strokes.configure_legend(gradientLength=legend_length)

def CreateLineChart(title, dates, race_ethnicity_known_history, overall_nyt_percent, overall_composite):
  """Builds a line chart of three completeness metrics over dataset releases.

  Args:
    title: Chart title.
    dates: One date string per release; binned by year and month on the x axis.
    race_ethnicity_known_history: Per-release share of cases with race/ethnicity.
    overall_nyt_percent: Per-release CDC case count as a share of the NYT count.
    overall_composite: Per-release composite of the two shares above.

  Returns:
    A layered Altair chart: the lines, invisible hover targets, and the points
    and value labels that show up for the hovered date.
  """
  release_count = len(dates)
  metric_names = ('Percent with Race/Ethnicity',
                  'Percent of NYT total case counts',
                  'Composite percent')
  metric_column = []
  for metric_name in metric_names:
    metric_column += [metric_name] * release_count
  # Long format: one row per (release, metric), metrics in metric_names order.
  line_chart_df = pd.DataFrame({
      'date': dates * 3,
      'metric': metric_column,
      'value': race_ethnicity_known_history + overall_nyt_percent + overall_composite,
  })

  legend_order = ['Percent of NYT total case counts',
                  'Percent with Race/Ethnicity',
                  'Composite percent',
                 ]
  line = alt.Chart(line_chart_df).mark_line(point=True).encode(
      x=alt.X('yearmonth(date):O', title='', axis=alt.Axis(labelAngle=0)),
      y=alt.Y('value:Q', title='', axis=alt.Axis(format='%')),
      color=alt.Color('metric', title='', scale=alt.Scale(scheme='browns'),
                      sort=legend_order),
  )

  # Single selection that follows the mouse to the nearest date.
  nearest = alt.selection(type='single', nearest=True, on='mouseover',
                          fields=['date'], empty='none')

  # Invisible points spanning the chart; they carry the selection and so tell
  # us which x value the cursor is closest to.
  selectors = alt.Chart(line_chart_df).mark_point().encode(
      x='yearmonth(date):O',
      opacity=alt.value(0),
  ).add_selection(
      nearest
  )

  # Points on the lines, visible only for the selected date.
  points = line.mark_point().encode(
      opacity=alt.condition(nearest, alt.value(1), alt.value(0))
  )

  # Value labels next to the points, shown only for the selected date.
  text = line.mark_text(align='left', dx=7, dy=-7).encode(
      text=alt.condition(nearest, 'value:Q', alt.value(' '), format='.0%')
  )

  # Combine the four layers into one chart.
  layers = alt.layer(line, selectors, points, text)
  line_chart = layers.properties(
      title=title,
      width=550,
      height=300
  )
  return line_chart

def PrintSummaryStats(chart_df, field='percent'):
  """Prints how the values of one ratio column are spread around 1.0.

  Prints the number and share of rows within 15% and within 50% of 1.0 (bounds
  included), the number of rows below 0.5 and above 1.5, and then the column's
  describe() summary.

  Args:
    chart_df: DataFrame holding the column.
    field: Name of the ratio column.
  """
  values = chart_df[field]
  row_count = len(chart_df)
  bands = (
      ('between +/-15%: ', .85, 1.15),
      ('between +/-50%: ', .50, 1.50),
  )
  for label, lower, upper in bands:
    rows_in_band = int(((values >= lower) & (values <= upper)).sum())
    print(label, rows_in_band, round(rows_in_band / row_count, 2))
  print('< than .50: ', int((values < .5).sum()))
  print('> than 1.50: ', int((values > 1.5).sum()))
  print(values.describe())

In [34]:
#@title
# Completeness of race/ethnicity in every CDC release: total case count and
# known share per release, plus the bar chart of the breakdown.
total_count_history, race_ethnicity_known_history, historical_bar_chart = FieldAnalysis(
    PROJECT_ID,
    CDC_TABLE_HISTORY,
    ['race_ethnicity_combined'],
    'Race/Ethnicity Completeness by Dataset Release Date',
)

# CDC case count of each release as a share of the NYT total on the matching
# cut-off date.
overall_nyt_percent = []
for release_index in range(len(CDC_TABLE_HISTORY)):
  nyt_total_query = NYT_US_QUERY_STR % DATE_UP_TO_HISTORY[release_index]
  nyt_us = pd.io.gbq.read_gbq(nyt_total_query, project_id=PROJECT_ID)
  overall_nyt_percent.append(total_count_history[release_index] / nyt_us.nyt_cases[0])

# Composite: share of the NYT total that is in the CDC data with a known
# race/ethnicity.
overall_composite = [
    known_share * nyt_share
    for known_share, nyt_share in zip(race_ethnicity_known_history, overall_nyt_percent)
]

In [35]:
#@title
# Plot the three per-release completeness series against the release dates.
CreateLineChart('CDC Restricted Access Dataset Completeness by Release Date',
                DATE_YEAR_HISTORY,
                race_ethnicity_known_history,
                overall_nyt_percent,
                overall_composite
).display()

In [36]:
#@title
# Show the per-release race/ethnicity completeness bar chart returned by FieldAnalysis.
historical_bar_chart.display()

In [37]:
#@title
# Per-state case counts from both sources (territories excluded on the NYT side).
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY)

# Keep only states present in both sources, matched on the FIPS code, and turn
# the FIPS index back into a regular column.
cdc_nyt_states_df = cdc_states_df.join(
    nyt_states_df,
    on="state_fips_code",
    how='inner',
    lsuffix='_left',
    rsuffix='_right',
).reset_index()
# Ratio of CDC to NYT cases, to four decimals.
cdc_nyt_states_df['percent'] = (
    cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases
).round(4)

In [39]:
#@title
# Column name, tooltip number format and display title of the three plotted
# metrics: NYT cases on x, CDC cases on y, and their ratio as the color.
cdc_nyt_state_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_state_title = 'Ratio of CDC to NYT Cases by State up to %s' % DATE_DISPLAY_NAME

# Scatterplot and state map of the CDC-to-NYT case ratio, side by side.
CreateScatterPlotAndMap(
    cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_nyt_states_df)

In [40]:
#@title
# Per-state race/ethnicity completeness of the CDC data (territories excluded).
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_')
# Turn the state_fips_code index into a column; CreateMap and the join below
# both refer to it by name.
cdc_states_race_df.reset_index(inplace=True)

cdc_race_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}

# Left map: share of CDC cases that have a known race/ethnicity.
cdc_states_race_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_states_race_map = CreateMap(
    cdc_states_race_df, cdc_race_fields_dict, cdc_states_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)

# Add the NYT counts. nyt_states_df comes from an earlier cell and is indexed
# by state_fips_code.
cdc_nyt_states_race_df = cdc_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_race_df.reset_index(inplace=True)

# CDC cases as a share of NYT cases (two decimals here), capped at 100% so a
# state where CDC reports more cases than NYT cannot push the composite above
# its share of known race/ethnicity.
cdc_nyt_states_race_df['percent'] = round(cdc_nyt_states_race_df.cdc_cases / cdc_nyt_states_race_df.nyt_cases, 2)
cdc_nyt_states_race_df['cases_max_100_percent'] = cdc_nyt_states_race_df.percent.clip(upper=1)
# Composite: capped share of NYT cases times share with known race/ethnicity.
cdc_nyt_states_race_df['composite_percent'] = cdc_nyt_states_race_df.cases_max_100_percent * cdc_nyt_states_race_df.cdc_known_cases_percent

composite_fields_dict = {
    'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC percent of NYT total cases'},
    'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC percent of NYT total with race/ethnicity'},
}
composite_title = 'CDC Percent of NYT Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME

# Right map: the composite percent. The scale_max argument (1) is not used by
# CreateMap.
state_composite_map = CreateMap(
    cdc_nyt_states_race_df, composite_fields_dict, composite_title, 1, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)

# Show the two maps side by side.
(cdc_states_race_map | state_composite_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_states_race_df, 'cdc_known_cases_percent')

In [41]:
#@title
# List the states from the lowest to the highest CDC-to-NYT case ratio.
cdc_nyt_states_df.sort_values(by='percent')

Unnamed: 0,state_fips_code,state,cdc_cases,state_name,nyt_cases,nyt_deaths,percent
22,18,18,364,Indiana,758078,13863,0.0
49,56,WY,1334,Wyoming,62445,747,0.02
2,48,TX,81300,Texas,2999137,52634,0.03
8,22,LA,19847,Louisiana,482560,10757,0.04
50,54,WV,9256,West Virginia,164149,2899,0.06
45,29,MO,60216,Missouri,639225,9949,0.09
48,28,MS,52761,Mississippi,322186,7419,0.16
1,38,ND,24171,North Dakota,110776,1559,0.22
24,21,KY,281383,Kentucky,466278,7280,0.6
27,26,MI,655225,Michigan,998975,20995,0.66


In [None]:
#@title
# Summary table for the territories. The frames are rebuilt here with
# territories included. Note that this overwrites cdc_states_df, nyt_states_df,
# cdc_nyt_states_df and cdc_states_race_df from the state-level cells, and
# re-indexes the latter two by state abbreviation.
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY, include_territories=True)

cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
cdc_nyt_states_df.set_index('state', inplace=True)

cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_', include_territories=True)
cdc_states_race_df.set_index('state', inplace=True)

row_names = [
    'CDC cases',
    'NYT cases',
    '(CDC as a % of NYT cases)',
    'Cases with race/ethnicity',
    '(% with race/ethnicity)',
    '(composite % of NYT total with race/ethnicity)'
]
# Every territory except the last entry of TERRITORIES ('AS').
# NOTE(review): presumably American Samoa is left out because it has no row in
# one of the sources - confirm.
table_territories = TERRITORIES[:-1]
# NYT_TERRITORIES is in the same order as TERRITORIES, so the column labels are
# taken from it rather than hard-coded by position.
table_territory_names = NYT_TERRITORIES[:len(table_territories)]

cdc_cases = [float(cdc_nyt_states_df.loc[territory].cdc_cases) for territory in table_territories]
nyt_cases = [float(cdc_nyt_states_df.loc[territory].nyt_cases) for territory in table_territories]
nyt_percent = [str(round(cdc_nyt_states_df.loc[territory].percent * 100)) + '%' for territory in table_territories]
race_ethnicity_cases = [cdc_states_race_df.loc[territory].cdc_known_cases for territory in table_territories]
race_ethnicity_percent = [str(round(cdc_states_race_df.loc[territory].cdc_known_cases_percent * 100)) + '%' for territory in table_territories]
# The CDC share of NYT cases is capped at 100% before it is multiplied, which
# matches how the state-level composite percent is computed for the maps.
composite_percent = [
    str(round(min(cdc_nyt_states_df.loc[territory].percent, 1) *
              cdc_states_race_df.loc[territory].cdc_known_cases_percent * 100)) + '%'
    for territory in table_territories
]

# One tuple per territory, with values in row_names order.
territories_data = list(zip(cdc_cases,
                            nyt_cases,
                            nyt_percent,
                            race_ethnicity_cases,
                            race_ethnicity_percent,
                            composite_percent))
table_data = dict(zip(table_territory_names, territories_data))
territories_df = pd.DataFrame(table_data, index=row_names)

# Show the counts without decimals. This changes the notebook-wide float
# format for every cell that runs after this one.
pd.options.display.float_format = '{:,.0f}'.format
territories_df.head(n=10)

Unnamed: 0,Puerto Rico,Guam,Virgin Islands,Northern Mariana Islands
CDC cases,27023,7393,2620,84
NYT cases,170957,9100,3308,176
(CDC as a % of NYT cases),16%,81%,79%,48%
Cases with race/ethnicity,4661,5430,1934,0
(% with race/ethnicity),17%,73%,74%,0%
(composite % of NYT total with race/ethnicity),3%,60%,58%,0%
