In [54]:
import pandas as pd
import numpy as np
# from datetime import datetime
# import requests

import altair as alt
from vega_datasets import data
import plotly.express as px

### Read Data

In [14]:
# Data URLs
us_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv"
states_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
regions_url = "https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv"

In [15]:
# Read Data
us = pd.read_csv(us_url)
states = pd.read_csv(states_url)
regions = pd.read_csv(regions_url)

### Process Data

In [25]:
saved_data_pth = "data"

Region Data

In [16]:
regions_extra = pd.DataFrame({
    'State': ["Puerto Rico", "Virgin Islands", "Guam", "Northern Mariana Islands"],
    'State.Code': ["RR", "VI", "GU", "MP"],
    'Region': ["Other", "Other", "Other", "Other"],
    'Division': ["None", "None", "None", "None"]
})

regions = pd.concat([regions, regions_extra])
regions.head()

Unnamed: 0,State,State Code,Region,Division,State.Code
0,Alaska,AK,West,Pacific,
1,Alabama,AL,South,East South Central,
2,Arkansas,AR,South,West South Central,
3,Arizona,AZ,West,Mountain,
4,California,CA,West,Pacific,


In [23]:
# Save data
regions.to_csv(f"{saved_data_pth}/regions.csv", index=False)

State Data

In [17]:
# Change the date column from character format to date format and state column from character to factor format
states['date'] = pd.to_datetime(states['date'])

# Check to see if we calculate the right daily cases and daily deaths
states['daily_cases'] = states.groupby('state')['cases'].diff().fillna(states['cases'])
states['daily_deaths'] = states.groupby('state')['deaths'].diff().fillna(states['deaths'])
states.head()

Unnamed: 0,date,state,fips,cases,deaths,daily_cases,daily_deaths
0,2020-01-21,Washington,53,1,0,1.0,0.0
1,2020-01-22,Washington,53,1,0,0.0,0.0
2,2020-01-23,Washington,53,1,0,0.0,0.0
3,2020-01-24,Illinois,17,1,0,1.0,0.0
4,2020-01-24,Washington,53,1,0,0.0,0.0


In [18]:
# Merge the Region column to states dataframe
states = pd.merge(states, regions[['State', 'Region']], left_on='state', right_on='State', how='left')
states['Region'] = pd.Categorical(states['Region'], categories=["Northeast", "Midwest", "West", "South", "Other"], ordered=True)
states.drop(columns=["State"], inplace=True)
states.head()

Unnamed: 0,date,state,fips,cases,deaths,daily_cases,daily_deaths,Region
0,2020-01-21,Washington,53,1,0,1.0,0.0,West
1,2020-01-22,Washington,53,1,0,0.0,0.0,West
2,2020-01-23,Washington,53,1,0,0.0,0.0,West
3,2020-01-24,Illinois,17,1,0,1.0,0.0,Midwest
4,2020-01-24,Washington,53,1,0,0.0,0.0,West


In [20]:
# Rename variables
states = states.rename(columns={'state': 'State', 'date': 'Date', 'cases': 'Total_Cases', 'deaths': 'Total_Deaths'})
states.head()

Unnamed: 0,Date,State,fips,Total_Cases,Total_Deaths,daily_cases,daily_deaths,Region
0,2020-01-21,Washington,53,1,0,1.0,0.0,West
1,2020-01-22,Washington,53,1,0,0.0,0.0,West
2,2020-01-23,Washington,53,1,0,0.0,0.0,West
3,2020-01-24,Illinois,17,1,0,1.0,0.0,Midwest
4,2020-01-24,Washington,53,1,0,0.0,0.0,West


In [26]:
# Save data
states.to_csv(f"{saved_data_pth}/states.csv", index=False)

US Data

In [19]:
# Change the date column from character format to date format
us['date'] = pd.to_datetime(us['date'])

# Calculate the daily cases and daily deaths
us['daily_cases'] = np.concatenate(([us['cases'].iloc[0]], np.diff(us['cases'])))
us['daily_deaths'] = np.concatenate(([us['deaths'].iloc[0]], np.diff(us['deaths'])))
us.head()

Unnamed: 0,date,cases,deaths,daily_cases,daily_deaths
0,2020-01-21,1,0,1,0
1,2020-01-22,1,0,0,0
2,2020-01-23,1,0,0,0
3,2020-01-24,2,0,1,0
4,2020-01-25,3,0,1,0


In [27]:
# Save data
us.to_csv(f"{saved_data_pth}/us.csv", index=False)

### Data Exploration

#### Heatmap

In [33]:
states['year'] = states['Date'].dt.year
states.head()

Unnamed: 0,Date,State,fips,Total_Cases,Total_Deaths,daily_cases,daily_deaths,Region,year
0,2020-01-21,Washington,53,1,0,1.0,0.0,West,2020
1,2020-01-22,Washington,53,1,0,0.0,0.0,West,2020
2,2020-01-23,Washington,53,1,0,0.0,0.0,West,2020
3,2020-01-24,Illinois,17,1,0,1.0,0.0,Midwest,2020
4,2020-01-24,Washington,53,1,0,0.0,0.0,West,2020


In [37]:
# Convert numbers from scientific notation to normal notation
pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [40]:
states_by_yr_avg = states.groupby(['year', 'State'])[['Total_Cases']].mean().reset_index()
states_by_yr_avg.head()

Unnamed: 0,year,State,Total_Cases
0,2020,Alabama,109646.258503
1,2020,Alaska,9675.00678
2,2020,Arizona,138261.225806
3,2020,Arkansas,62183.675676
4,2020,California,511605.777778


In [42]:
alt.themes.enable("dark")

heatmap = alt.Chart(states_by_yr_avg).mark_rect().encode(
        y=alt.Y('year:O', axis=alt.Axis(title="Year", titleFontSize=16, titlePadding=15, titleFontWeight=900, labelAngle=0)),
        x=alt.X('State:O', axis=alt.Axis(title="States", titleFontSize=16, titlePadding=15, titleFontWeight=900)),
        color=alt.Color('max(Total_Cases):Q',
                         legend=alt.Legend(title=" "),
                         scale=alt.Scale(scheme="blueorange")),
        stroke=alt.value('black'),
        strokeWidth=alt.value(0.25),
    ).properties(width=900
    ).configure_axis(
    labelFontSize=12,
    titleFontSize=12
    )

heatmap

#### Choropleth

In [46]:
states_by_yr_avg_2021 = states_by_yr_avg[states_by_yr_avg['year'] == 2021]
states_by_yr_avg_2021.head()

Unnamed: 0,year,State,Total_Cases
55,2021,Alabama,624065.241096
56,2021,Alaska,87545.010959
57,2021,American Samoa,5.574257
58,2021,Arizona,960222.969863
59,2021,Arkansas,396678.117808


In [51]:
states

UrlData({
  format: TopoDataFormat({
    feature: 'states',
    type: 'topojson'
  }),
  url: 'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json'
})

In [53]:
alt.themes.enable("dark")

states = alt.topo_feature(data.us_10m.url, 'states')

alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('Total_Cases:Q', scale=alt.Scale(scheme='blues')),   # scale=color_scale
    stroke=alt.value('#154360')
).transform_lookup(
    lookup='State',
    from_=alt.LookupData(states_by_yr_avg_2021, 'id', list(states_by_yr_avg_2021.columns))
).properties(
    width=500,
    height=300
).project(
    type='albersUsa'
)

In [57]:
choropleth = px.choropleth(states_by_yr_avg_2021, locations='State', color='Total_Cases', locationmode="USA-states",
                               color_continuous_scale='blues',
                               range_color=(0, max(states_by_yr_avg_2021.Total_Cases)),
                               scope="usa",
                               labels={'Total_Cases':'Total Cases'}
                              )
choropleth.update_layout(
        template='plotly_dark',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        margin=dict(l=0, r=0, t=0, b=0),
        height=350
    )

choropleth

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed