# Analysis of the COVID-19 Data Set published by Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
### The data is collected and distributed daily.
### Anyone can retrieve the full data set from this [GitHub](https://github.com/CSSEGISandData/COVID-19) site.
### The primary data file types are:
* Summary Files (Province/State, Country/Region, Last Update, Confirmed, Deaths, Recovered, Latitude, Longitude)
* Time Series Files (Province/State, Country/Region, Lat, Long, <dates>) for Confirmed, Deaths, and Recovered counts

In [1]:
# Initialization
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib as p
import time as t
import scipy.stats as st

# Summary Date to study -- change this to analyze a different daily report
study_date = dt.date(year=2020, month=3, day=13)

# Directory holding your local clone of the JHU CSSE Data Set -- edit for your machine
dir_covid_19 = p.PureWindowsPath(
    r"C:\Users\norma\HDD_Documents\BootCamp\Projects\Project1\CoronaVirus\COVID-19"
)
# Relative Windows-style path to a world population CSV
world_path = p.PureWindowsPath(
    r"Resources\world_pop_csv.csv"
)

## Summary Data
### Load the Summary File for the Study Date

In [2]:
# Daily Summary Statistics
# The daily report for the study date is looked up as <mm-dd-yyyy>.csv inside the cloned data set.
mmddyyyy_study_date = study_date.strftime("%m-%d-%Y")
daily_report_filename = p.Path(dir_covid_19) / f"csse_covid_19_data/csse_covid_19_daily_reports/{mmddyyyy_study_date}.csv"
# read_csv raises FileNotFoundError when dir_covid_19 is wrong or no report exists for study_date.
ss = pd.read_csv(daily_report_filename)

FileNotFoundError: [Errno 2] File C:\Users\norma\HDD_Documents\BootCamp\Projects\Project1\CoronaVirus\COVID-19\csse_covid_19_data\csse_covid_19_daily_reports\03-13-2020.csv does not exist: 'C:\\Users\\norma\\HDD_Documents\\BootCamp\\Projects\\Project1\\CoronaVirus\\COVID-19\\csse_covid_19_data\\csse_covid_19_daily_reports\\03-13-2020.csv'

In [None]:
# Display the summary rows whose Country/Region is Italy
ss.loc[ss["Country/Region"] == "Italy"]

### Aggregate the Counts by Country

In [None]:
# Collapse the summary to one row per country by summing the three count columns
ss_country = ss.pivot_table(
    index="Country/Region",
    values=["Confirmed", "Deaths", "Recovered"],
    aggfunc="sum",
)
ss_country

## Time Series Data
### Load a time series of Confirmed case counts
###### (The same logic could be used for Death or Recovered counts.)

In [None]:
# Time Series -- Confirmed
time_series_confirmed_filename = p.Path(dir_covid_19) / "csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
tsc = pd.read_csv(time_series_confirmed_filename)
# Sum the per-date counts of every province into one row per country.
# The non-count columns (Province/State text, Lat, Long) are dropped *before* summing:
# pandas 2.x no longer discards text columns silently during a "sum" aggregation, so
# leaving Province/State in would add a non-date row after the transpose below.
tsc_country = (
    tsc.drop(columns=["Province/State", "Lat", "Long"], errors="ignore")
    .groupby("Country/Region")
    .sum()
)
# Transpose the DataFrame, because Pandas likes the dates in the index
tsc_country = tsc_country.T

In [None]:
# After the transpose the row labels are the old column headings: "mm/dd/yy" text.
# Text labels do not sort chronologically, so parse every label into a Timestamp;
# Pandas then treats the index as a DateTimeIndex (an index with extra features).
tsc_country.index = list(map(pd.Timestamp, tsc_country.index))
# Re-order the rows to follow the sorted Timestamp labels
tsc_country = tsc_country.loc[tsc_country.index.sort_values(), :]
tsc_country

### Align the trends
* Create a DataFrame with Relative Date Counts.
* Remove the leading zero counts, to align the trends to start with the first non-zero value.

In [None]:
# Create a DataFrame full of NaN, to hold relative-day trends of infection for each country.
# It has the same shape and the same country columns as tsc_country, with a default 0..n-1 row index.
npNaN = np.empty(tsc_country.shape)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
npNaN[:] = np.nan
rel_country = pd.DataFrame(npNaN, columns=tsc_country.columns.values)

In [None]:
# Shift the trends up, to start on relative day zero
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
for (the_column, the_data) in tsc_country.items():
    # Drop only the leading ('f' = front) zero counts, so the trend starts at the first non-zero value
    trend = np.trim_zeros(the_data.to_numpy(), 'f')
    if len(trend) == 0:
        # Nothing but zeros for this country: leave its column as NaN
        continue
    # Write with one positional indexer. The chained form rel_country[col][a:b] = ...
    # may assign into a temporary copy and leave rel_country unchanged.
    rel_country.iloc[:len(trend), rel_country.columns.get_loc(the_column)] = trend

# Forward slash matches the other output paths and avoids the invalid "\d" escape sequence
rel_country.to_csv("output/diff_T_df.csv", index=False)
rel_country

In [None]:
# Row-to-row (relative day to relative day) difference of the aligned counts, per country column
diff_confirm_df = rel_country.diff(periods=1, axis=0)
diff_confirm_df

In [None]:
# Column-wise mean of the daily differences, reshaped into a two-column table
avg_change_df = diff_confirm_df.mean(axis=0).reset_index()
avg_change_df.columns = ["country", "average"]
# The row index is written to the CSV as well (index=True)
avg_change_df.to_csv("output/avg_change.csv", index=True)
avg_change_df

In [None]:
# Load the world population table.
# The path uses a forward slash so it resolves on Windows and on POSIX systems alike
# (a backslash is a path separator only on Windows).
world_pop_df = pd.read_csv("Resources/world_pop_utf.csv", encoding='UTF-8')

# Save a copy alongside the other generated outputs
world_pop_df.to_csv("output/world_pop.csv", index=False)
world_pop_df

In [None]:
countries_of_interest = ['China','Italy','Korea, South','Iran','France','US']
# world_pop_df has "South Korea" and "United States", while the JHU data uses "Korea, South" and "US".
# Rename by value instead of by hard-coded row label: the labels depend on how many countries
# the data set contains on a given day, and assigning to a label that does not exist would
# silently append a new row.
jhu_to_world_pop_names = {'US': 'United States', 'Korea, South': 'South Korea'}
avg_change_df['country'] = avg_change_df['country'].replace(jhu_to_world_pop_names)
# Display the renamed rows to confirm the change
avg_change_df[avg_change_df['country'].isin(list(jhu_to_world_pop_names.values()))]


In [None]:
# Join the per-country averages with the population table on the country name
scatterPlt_df = avg_change_df.merge(world_pop_df, left_on='country', right_on='Country ')
# The right-hand join key is not needed once the tables are merged
scatterPlt_df = scatterPlt_df.drop(columns=['Country '])
scatterPlt_df.head()

In [None]:
# Inspect the column dtypes before normalizing the population.
# This cell only displays the dtypes; the normalization attempt below is commented out.

# scatterPlt_df['normalize'] = (scatterPlt_df["world Population 2020"]*100000)/100
scatterPlt_df.dtypes
# scatterPlt_df.head()

In [None]:
# The population column holds text; remove the comma separators and convert it to float
scatterPlt_df['world Population 2020'] = (
    scatterPlt_df['world Population 2020']
    .str.replace(',', '')
    .astype(float)
)
scatterPlt_df

In [None]:
# Display the column dtypes again, after the population column conversion above
scatterPlt_df.dtypes

In [None]:
# 'normalize' is the population divided by one million
scatterPlt_df['normalize'] = scatterPlt_df["world Population 2020"] / 1000000
# Print the rows for two named countries, then every row averaging more than 500
for spot_check_country in ('India', 'China'):
    print(scatterPlt_df[scatterPlt_df['country'] == spot_check_country])
print(scatterPlt_df[scatterPlt_df['average'] > 500.0])

In [None]:
# Scatter Plot x = Country Population (millions), y = Average Confirmed cases per day

# Fit the regression on the (population, average) pairs row by row, i.e. country by country.
# The two columns must NOT be sorted independently: that would pair the smallest population
# with the smallest average and so on, so the fit and the correlation would describe
# rank-matched values rather than real countries.
# Rows with a missing value are dropped because a single NaN turns the whole fit into NaN.
regression_df = scatterPlt_df[['normalize', 'average']].dropna()
x_values = regression_df['normalize']
y_values = regression_df['average']
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
# Evaluate the fitted line across the plotted population range
x_regression = np.arange(14,1500.0,0.5)
regress_values = x_regression * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

fig1 = plt.figure(1, figsize=(15,9))
# Both axes are logarithmic
plt.xlim(.2e1,1.5e3)
plt.xscale("log")
plt.ylim(1e-1,1.7e3)
plt.yscale("log")
plt.scatter(scatterPlt_df['normalize'],scatterPlt_df['average'])
plt.plot(x_regression,regress_values,"r-")

# Annotation positions are hard-coded data coordinates (population in millions, average cases per day)
plt.annotate(f"Pearson's Correlation is {rvalue:.2f}", (120,50), color="red")
plt.annotate(line_eq,(120,70),color="red")
plt.annotate("India",(1100,2.8))
plt.annotate("China",(1110,1300))
plt.annotate("Iran",(89,580))
plt.annotate("Italy",(64,700))

# Reference rows, presumably used to place the Iran and Italy labels
# (row label, country, average, population, population in millions):
#43    Iran   598.777778           8.399295e+07    83.992949
#47   Italy   684.869565           6.046183e+07    60.461826


plt.title("Average Cases Per Day v. Country Population")
plt.ylabel("Average Cases Per Day (logrithmic)")
plt.xlabel("Country Population (mm, logrithmic)")

plt.savefig("./CasesPerDayForPopulation.png")

In [None]:
import seaborn as sns


In [None]:
country_interest = ['China','Italy','Iran','South Korea']

# Keep only the rows for the countries of interest
country_interest_df = scatterPlt_df.loc[scatterPlt_df['country'].isin(country_interest)]
# Name the columns by keyword: seaborn 0.12+ accepts only `data` positionally, so passing
# the x and y vectors as positional arguments raises a TypeError there.
# x = average cases per day, y = population in millions, one colour per country.
sns.scatterplot(data=country_interest_df, x='average', y='normalize', hue='country')
plt.legend()

In [None]:


# diff_confirm_df still carries the JHU column names ('Korea, South', 'US'), while
# country_interest uses the population-table names. Rename first; otherwise reindex
# would create an all-NaN 'South Korea' column and its box would be empty.
diff_confirm_df = diff_confirm_df.rename(columns={'Korea, South': 'South Korea', 'US': 'United States'})
# Keep only the countries of interest, in the listed order
diff_confirm_df = diff_confirm_df.reindex(country_interest, axis=1)

# The "_ = " catches the return value of the call, so it doesn't distract from the figure
_ = sns.boxplot(data=diff_confirm_df)
plt.title("Daily Growth Rate for Countries of Interest")
plt.ylabel("Daily Growth")