In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# Install a conda package in the current Jupyter kernel
#import sys
#!conda install --yes --prefix {sys.prefix} plotly==4.8.2

import plotly as py

# Location of the county-level COVID-19 CSV inside the Kaggle input directory.
COVID_CSV_PATH = "/kaggle/input/us-counties-covid-19-dataset/us-counties.csv"

# Raw frame; later cells read its `state`, `county`, `date`, `cases` and `deaths` columns.
df = pd.read_csv(COVID_CSV_PATH)

In [5]:
# Total cases per state, summed over the counties within each state.
#
# Elsewhere in this notebook a county's total is taken as the maximum of its
# `cases` values, so a state's total is the sum of those per-county peaks.
# Taking the max over every row of a state (the previous behaviour) only
# returned that state's single largest county figure, not a state-wide total.
county_peaks = df.groupby(['state', 'county'])['cases'].max()
hold = county_peaks.groupby(level='state').sum().to_frame('total-cases')
cases = hold.nlargest(20, ['total-cases'])

# Bar chart of the 20 states with the largest totals.
py.plot(x = 'total-cases', y = cases.index,
        data_frame = cases,
        kind = 'bar',
        title='Top 20 states with highest number of cases (COVID-19)')


My initial thought was that it makes sense for the most densely populated states to have the largest number of cases. But I began to wonder what the same ranking would look like for the population density of US states. So I pulled a dataset on the population of each state, added a column for the average area of the state, and another column for the density. 

After plotting, I realized this wasn't much of an indicator of COVID-19 spread, as the density ranking doesn't exactly agree with the spread. Perhaps a better indicator would be the density of counties in each state.  

I think it would be better to visualize the cases by county and work with counties from here on out.

In [7]:
# # Get data on population density of each US state and compare top 20 results to the COVID-19 cases
# popdens = pd.read_csv('C:/Users/Shinichi/Documents/Jupyter Notebook/pop-density.csv', thousands=',')
# dens_df = popdens.drop(
#     columns=['Unnamed: 5','Unnamed: 6','Unnamed: 7','Unnamed: 8','Unnamed: 9','Unnamed: 10','Unnamed: 11','Unnamed: 12']
#                        , axis=1)
# dens_df.head(14)
# dens_df = dens_df.dropna(subset=['State','Density'])

# # Dropped the pop density of District columbia, as it was not within our dataset. Heavily skewed data
# # This could create some problems in accuracy
# dens_df = dens_df.drop(index=8)

# # Plotting top 20 population densities in US states
# density_top = dens_df.sort_values(['Density'], ascending=False).head(21)
# py.plot(x='Density', y=density_top.State, 
#           data_frame=density_top, kind='bar', 
#           title='Top 20 population dense states (SQ mile)')

In [8]:
# Largest `cases` value recorded for each county name.
# NOTE(review): the key is the county name alone, so same-named counties in
# different states (and every state's 'Unknown' rows) share one entry — confirm
# whether grouping by (county, state) is wanted before relying on the labels.
hold2 = df.groupby('county')['cases'].max().to_frame('total-cases')
hold3 = hold2.nlargest(20, ['total-cases'])

# Bar chart of the 20 county names with the highest counts.
py.plot(
    data_frame=hold3,
    kind='bar',
    x='total-cases',
    y=hold3.index,
    title='Top 20 COVID-19 cases by county',
)

Here I wanted to see the deaths among COVID-19 positive individuals. So I aggregated the data again, this time for total deaths in each county. 
To my sheer surprise, NYC's total death count far exceeds any other county's: almost 50 times that of the next leading death count, in Cook, Illinois.

In [9]:
# Largest `deaths` value recorded for each county name.
# NOTE(review): keyed by county name only, so same-named counties in different
# states are merged into one entry — confirm this is intended.
df2 = df.groupby('county')['deaths'].max().to_frame('total-deaths')
df3 = df2.nlargest(20, ['total-deaths'])

# Bar chart of the 20 county names with the highest death counts.
py.plot(
    data_frame=df3,
    kind='bar',
    x='total-deaths',
    y=df3.index,
    title='Top 20 COVID-19 deaths by county',
)

Having a visual for the total death count is nice, but I want to see the relative values in each county. So I made a new column containing a 'percent' death, which I defined as (total deaths / total cases).

This should reveal a better understanding of the mortality rate of the virus in each county. 

In [10]:
# Deaths per recorded case for each county name: total deaths / total cases.
# The value is stored as a fraction; it is not multiplied by 100.
# The division aligns df2 and hold2 on their shared county-name index.
df2['percent-death-cases'] = df2['total-deaths'] / hold2['total-cases']

# Keep only the counties that appear in the top-20 death ranking (df3).
# A single boolean mask replaces the earlier row-by-row drop loop, which
# rebuilt the whole frame once per dropped county. `.copy()` detaches the
# result so that re-running this cell can assign the column without a
# SettingWithCopyWarning.
df2 = df2.loc[df2.index.isin(df3.index)].copy()

# Rank the remaining counties from the largest ratio to the smallest.
df_percent_l = df2.nlargest(20, ['percent-death-cases'])

# Plotting
py.plot(x='percent-death-cases', y=df_percent_l.index,
       data_frame = df_percent_l,
       kind = 'bar',
       title = 'Top 20 percent deaths by county'
       )

There is a data point labelled 'Unknown' in the `county` column, which is a bit confusing because we don't know where these counties are, and it skews the data. I found out that this entry covers 50 states, so the value above is a deaths-per-cases ratio combined across multiple regions. I think it is best to set this data aside and drop it until the unknown counties can be accounted for.

Below I did just that: dropped the unknowns and re-plotted the data. We can see that the largest deaths/cases value is in Wayne, Michigan, while our previous #1 for cases (NYC) falls below the top 3 by 'death rate'. 

It would be interesting to see the state of public health in these regions and how well the protocols that are in place are effectively reducing (or increasing) the spread rate, or perhaps even the state of public cooperation with community health concerns.

In [11]:
# States that have rows filed under the placeholder county name 'Unknown'.
unk_county = df[df['county'] == 'Unknown'].groupby('state').state.unique()
# Only a cell's last expression is displayed, so the count is printed explicitly.
print(f"States with an 'Unknown' county entry: {len(unk_county)}")

# Remove the 'Unknown' bucket before ranking. errors='ignore' keeps the cell
# re-runnable and avoids a KeyError when 'Unknown' is not in the index.
df2 = df2.drop(index='Unknown', errors='ignore')
df_percent_l = df2.nlargest(20, ['percent-death-cases'])
py.plot(x='percent-death-cases', y=df_percent_l.index,
       data_frame = df_percent_l,
       kind = 'bar',
       title = 'Top 20 deaths per cases by county'
       )

I wanted to see how the cases in the US are behaving through a time scale. Below is a simple method grouping by date and plot of the total number of cases in a day.

The plot is fairly linear after April, so thankfully the initial prediction of the cases/outbreak being exponential is not true. However, the bad news is that the number of cases throughout the US is still increasing. We can even see a slight increase in the rate of covid cases in June. A little frightening..

In [12]:
# Per-date aggregates of `cases` over all rows: the sum (plotted below) and the
# largest single-row value.
df_date = (
    df.groupby('date')['cases']
    .agg(['sum', 'max'])
    .rename(columns={'sum': 'total-cases-in-day'})
)

# Line chart of the summed case count against the date.
py.plot(
    data_frame=df_date,
    kind='line',
    x=df_date.index,
    y='total-cases-in-day',
    title='Spread of COVID-19 cases across the US from 01/21/20 to 07/04/20',
)

Now that rates of change have piqued my interest, I want to figure out how fast the number of cases is 'moving': essentially, the 'velocity' of the spread plotted against time. This will also allow us to see how fast the spread is accelerating day by day, month by month.

Below is a simple plot showing how these cases have been increasing (on a daily basis) throughout late January to early July. 

As for the rises and dips in the plot: each point is the difference between the case count on a given day and the case count on the previous day, so rises are days where the rate of new cases increased and dips are days where it decreased. 

This plot is very 'noisy' with a fluctuating pattern. It could be presented a bit smoother if we increase the time scale to weeks, or months instead of days. 

In [13]:
# First difference of the per-date aggregates: each row minus the previous row
# (the first row has no predecessor and is NaN).
df_diff = df_date - df_date.shift(1)
df_diff.columns = ['Change-in-cases-by-day', 'diff-max']

# Line chart of the day-over-day change in the summed case count.
py.plot(
    data_frame=df_diff,
    kind='line',
    x=df_diff.index,
    y='Change-in-cases-by-day',
    title='Rate of change of COVID-19 cases across the US from 01/21/20 to 07/04/20',
)

In [14]:
# Second difference: the day-over-day change of the daily change computed above.
df_acc = df_diff.diff().rename(
    columns={'Change-in-cases-by-day': 'acceleration of spread'}
)

# Line chart of that second difference against the date.
py.plot(
    data_frame=df_acc,
    kind='line',
    x=df_acc.index,
    y='acceleration of spread',
    title='Acceleration of COVID-19 cases across the US from 01/21/20 to 07/04/20',
)

To make the visuals a little more legible, I plotted the distribution of COVID-19 counts on a month-to-month scale. Each 'bin' resets, i.e. has the previous month's count removed from the current month's total. This is another visual representation of how COVID-19 is behaving across the US.

In [None]:
# New cases per calendar month, derived from the running daily totals.
dates = pd.DatetimeIndex(df_date.index)
# Month-number column kept on df_date exactly as this cell has always added it.
df_date['month'] = dates.month

# Highest running total seen in each month. Grouping by year-month period
# (rather than the bare month number) keeps e.g. January of two different
# years apart if the dataset spans more than one year.
month_end_totals = (
    df_date.groupby(dates.to_period('M'))['total-cases-in-day'].max()
)

# Month-over-month increase. diff() leaves the first month as NaN because it
# has no predecessor; that month's own total is its new-case count. (The
# previous `fillna(7)` discarded its result and hard-coded that first value.)
new_cases = month_end_totals.diff().fillna(month_end_totals)

df_month_diff = new_cases.to_frame('new-cases-in-month')
# Labels are derived from the data, so the cell keeps working when the
# dataset covers a different number of months than the original seven.
df_month_diff.index = df_month_diff.index.strftime('%b %Y')

py.plot(x = df_month_diff.index,
         y = 'new-cases-in-month',
         data_frame = df_month_diff,
         kind = 'bar',
         title = 'New COVID-19 cases each month'
        )