# COVID-19 Data Analysis

In [407]:
from datetime import date as dt, timedelta
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [409]:
# CSSE's UID / ISO / FIPS lookup table (going by its file name), read straight
# from the GitHub raw URL.
fips_lookup_file = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv'
)
fips_lookup_df = pd.read_csv(filepath_or_buffer=fips_lookup_file)

# Alternative column spellings -> the underscore-style names used in the rest
# of the notebook. Applied to every daily report as it is loaded.
column_map = {
    'Province/State': 'Province_State',
    'Country/Region': 'Country_Region',
    'Last Update':    'Last_Update',
    'Latitude':       'Lat',
    'Longitude':      'Long_',
}

In [540]:
# URL prefix of the daily-report folder; individual reports are fetched as
# <data_path><MM-DD-YYYY>.csv by the loading cell.
data_path = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_daily_reports/'
)

# Inclusive window of report dates to request; the upper bound is the day the
# notebook is executed.
start_date = dt(year=2020, month=1, day=21)
end_date = dt.today()

In [541]:
# Download every daily report between start_date and end_date (inclusive) and
# stack them into one long frame. Each row gets a `date` column recording which
# report it came from; column names are harmonised through `column_map`.
daily_frames = []
for date in pd.date_range(start_date, end_date):
    # Plain string formatting instead of os.path.join: this is a URL, not a
    # filesystem path, so the separator must always be '/'.
    file = f"{data_path.rstrip('/')}/{date:%m-%d-%Y}.csv"
    try:
        temp_df = pd.read_csv(file)
    except OSError:
        # A report that does not exist surfaces as urllib's HTTPError, which is
        # an OSError subclass (as is FileNotFoundError for local paths). Skip
        # that day. Anything else (parse errors, KeyboardInterrupt, ...) is a
        # real problem and is allowed to propagate instead of being mislabelled
        # as "not found".
        print(f'{file} not found')
        continue
    temp_df = temp_df.rename(columns=column_map).assign(date=date)
    daily_frames.append(temp_df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously loaded rows on every iteration. The original per-report row index
# is kept as-is. Fall back to an empty frame when no report could be loaded,
# because pd.concat raises on an empty list.
covid_df = pd.concat(daily_frames) if daily_frames else pd.DataFrame()

https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/01-21-2020.csv not found
https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/04-07-2020.csv not found


In [544]:
# For each cumulative metric, add a column holding its fractional change
# relative to the previous row of the same Combined_Key.
grouped_df = covid_df.groupby('Combined_Key')
for source_col, rate_col in (
    ('Confirmed', 'Confirmed_Rate'),
    ('Deaths', 'Death_Rate'),
    ('Recovered', 'Recovery_Rate'),
):
    covid_df[rate_col] = grouped_df[source_col].pct_change()

In [545]:
# Full history restricted to rows whose Country_Region is exactly 'US'.
covid_us_df = covid_df.loc[covid_df['Country_Region'].eq('US')]

In [546]:
# Most recent report date present in the data, the rows belonging to that
# report, and the US-only subset of those rows.
latest = covid_df['date'].max()
latest_covid_df = covid_df.loc[covid_df['date'].eq(latest)]
latest_covid_us_df = latest_covid_df.loc[latest_covid_df['Country_Region'].eq('US')]

In [567]:
# Columns shown in the summary tables below.
output_cols = [
    'Combined_Key',
    'Confirmed', 'Deaths', 'Recovered',
    'Confirmed_Rate', 'Death_Rate', 'Recovery_Rate',
    'Lat', 'Long_',
]
# Same columns with the report date in front, for time-series views.
ts_cols = ['date', *output_cols]

## Top Hotspots in the US

In [568]:
# The 15 US rows with the highest confirmed count in the latest report.
top_hotspots_df = (
    latest_covid_us_df
    .sort_values(by='Confirmed', ascending=False)
    .head(15)
)
top_hotspots_df.loc[:, output_cols]

Unnamed: 0,Combined_Key,Confirmed,Deaths,Recovered,Confirmed_Rate,Death_Rate,Recovery_Rate,Lat,Long_
1636,"New York City, New York, US",72181.0,3485.0,0.0,0.068541,0.54477,,40.767273,-73.971526
1618,"Nassau, New York, US",15616.0,162.0,0.0,0.084595,0.0,,40.740665,-73.589419
2441,"Westchester, New York, US",14294.0,211.0,0.0,0.041609,2.149254,,41.162784,-73.757417
2146,"Suffolk, New York, US",14185.0,199.0,0.0,0.080433,0.137143,,40.883201,-72.801217
513,"Cook, Illinois, US",8728.0,209.0,0.0,0.086383,0.123656,,41.841448,-87.816588
2419,"Wayne,Michigan,US",8270.0,346.0,0.0,0.100027,0.180887,,42.278056,-83.378611
153,"Bergen, New Jersey, US",6862.0,200.0,0.0,0.1091,0.058201,,40.960109,-74.071643
1347,"Los Angeles, California, US",6377.0,147.0,0.0,0.070865,0.113636,,34.308284,-118.228241
1944,"Rockland, New York, US",5703.0,119.0,0.0,0.070785,0.724638,,41.150279,-74.025605
1706,"Orleans, Louisiana, US",4565.0,171.0,0.0,0.122725,0.062112,,30.06972,-89.926603


In [583]:
# Bubble map of the latest US report: one marker per row, placed at its
# Lat/Long_ and sized (by area) from Confirmed / 100; hover text is the
# Combined_Key.
fig = go.Figure()
fig.add_trace(
    go.Scattergeo(
        name='Confirmed',
        locationmode='USA-states',
        lat=latest_covid_us_df['Lat'],
        lon=latest_covid_us_df['Long_'],
        text=latest_covid_us_df['Combined_Key'],
        marker={
            'size': latest_covid_us_df['Confirmed'] / 100,
            'sizemode': 'area',
        },
    )
)
fig.update_layout(template='ggplot2', geo={'scope': 'usa'})
fig.show(renderer='iframe_connected')

## Top Hotspots Worldwide

In [550]:
# Top 15 rows of the latest report across all countries, by confirmed count.
(
    latest_covid_df
    .sort_values(by='Confirmed', ascending=False)
    .head(15)
    .loc[:, output_cols]
)

Unnamed: 0,Combined_Key,Confirmed,Deaths,Recovered,Confirmed_Rate,Death_Rate,Recovery_Rate
2781,Spain,136675.0,13341.0,40437.0,0.038201,0.055375,0.061896
2709,Italy,132547.0,16523.0,22837.0,0.02791,0.040033,0.046848
2690,Germany,103374.0,1810.0,28700.0,0.03247,0.142677,0.0
2686,France,98010.0,8911.0,17250.0,0.055699,0.10312,0.065933
1636,"New York City, New York, US",72181.0,3485.0,0.0,0.068541,0.54477,
2575,"Hubei, China",67803.0,3212.0,64014.0,0.0,0.000623,0.001079
2705,Iran,60500.0,3739.0,24236.0,0.039055,0.037746,0.22801
2799,United Kingdom,51608.0,5373.0,135.0,0.07953,0.088974,0.0
2795,Turkey,30217.0,649.0,1326.0,0.116295,0.130662,0.272553
2786,Switzerland,21657.0,765.0,8056.0,0.026398,0.06993,0.255807


## Top Hotspots in New York

In [551]:
# Top 15 rows of the latest report whose Province_State is 'New York',
# by confirmed count.
(
    latest_covid_df
    .loc[latest_covid_df['Province_State'].eq('New York')]
    .sort_values(by='Confirmed', ascending=False)
    .head(15)
    .loc[:, output_cols]
)

Unnamed: 0,Combined_Key,Confirmed,Deaths,Recovered,Confirmed_Rate,Death_Rate,Recovery_Rate
1636,"New York City, New York, US",72181.0,3485.0,0.0,0.068541,0.54477,
1618,"Nassau, New York, US",15616.0,162.0,0.0,0.084595,0.0,
2441,"Westchester, New York, US",14294.0,211.0,0.0,0.041609,2.149254,
2146,"Suffolk, New York, US",14185.0,199.0,0.0,0.080433,0.137143,
1944,"Rockland, New York, US",5703.0,119.0,0.0,0.070785,0.724638,
1700,"Orange, New York, US",3533.0,76.0,0.0,0.108566,0.433962,
675,"Dutchess, New York, US",1189.0,9.0,0.0,0.103993,0.125,
710,"Erie, New York, US",1148.0,30.0,0.0,0.084042,0.111111,
1554,"Monroe, New York, US",574.0,26.0,0.0,0.047445,0.368421,
2250,"Ulster, New York, US",382.0,4.0,0.0,0.067039,0.0,


## Top Counties in each State

In [552]:
# States to display from the per-state ranking built below.
states = ['New York', 'Washington', 'Texas']

# Rank every row of the latest US report within its Province_State
# (1 = highest Confirmed), keep the top five per state, and order the result
# by state and rank. The helper `rank` column is dropped by the final column
# selection.
top_counties_df = (
    latest_covid_us_df
    .assign(
        rank=lambda frame: (
            frame
            .sort_values(by='Confirmed', ascending=False)
            .groupby('Province_State')
            .cumcount()
            .add(1)
        )
    )
    .query('rank <= 5')
    .sort_values(by=['Province_State', 'rank'])
    .loc[:, ['Province_State', *output_cols]]
)
top_counties_df.loc[top_counties_df['Province_State'].isin(states)]

Unnamed: 0,Province_State,Combined_Key,Confirmed,Deaths,Recovered,Confirmed_Rate,Death_Rate,Recovery_Rate
1636,New York,"New York City, New York, US",72181.0,3485.0,0.0,0.068541,0.54477,
1618,New York,"Nassau, New York, US",15616.0,162.0,0.0,0.084595,0.0,
2441,New York,"Westchester, New York, US",14294.0,211.0,0.0,0.041609,2.149254,
2146,New York,"Suffolk, New York, US",14185.0,199.0,0.0,0.080433,0.137143,
1944,New York,"Rockland, New York, US",5703.0,119.0,0.0,0.070785,0.724638,
956,Texas,"Harris,Texas,US",1809.0,22.0,0.0,0.296774,0.1,
583,Texas,"Dallas, Texas, US",1155.0,18.0,0.0,0.038669,0.0,
2228,Texas,"Travis, Texas, US",502.0,6.0,0.0,0.03719,0.0,
2184,Texas,"Tarrant, Texas, US",452.0,13.0,0.0,0.08134,0.181818,
162,Texas,"Bexar, Texas, US",410.0,12.0,0.0,0.067708,0.090909,


## Hotspot Analysis

In [608]:
# Combined_Key of each top US hotspot, in the same (descending Confirmed) order
# as top_hotspots_df.
top_hotspots = top_hotspots_df['Combined_Key'].to_list()

### Growth Rate amongst hotspots

In [613]:
# Confirmed_Rate over time for the first five entries of top_hotspots,
# one line per location.
fig = go.Figure()
for place in top_hotspots[:5]:
    place_df = covid_us_df.loc[covid_us_df['Combined_Key'].eq(place)]
    fig.add_trace(
        go.Scatter(
            name=place,
            mode='lines',
            x=place_df['date'],
            y=place_df['Confirmed_Rate'],
        )
    )

fig.update_layout(template='ggplot2', autosize=False)
fig.show(renderer='iframe_connected')

### Confirmed Cases amongst hotspots

In [611]:
# Cumulative Confirmed count over time for the first five entries of
# top_hotspots, one line per location.
fig = go.Figure()
for place in top_hotspots[:5]:
    place_df = covid_us_df.loc[covid_us_df['Combined_Key'].eq(place)]
    fig.add_trace(
        go.Scatter(
            name=place,
            mode='lines',
            x=place_df['date'],
            y=place_df['Confirmed'],
        )
    )

fig.update_layout(template='ggplot2', autosize=False)
fig.show(renderer='iframe_connected')