# Data Analysis - covid19

* Data download
* Data wrangling
* Data plot

### Remarks:
* convert the `dateRep` column to pandas datetime format
* set `dateRep` as the index

In [None]:
 # Show which interpreter runs this notebook: executable path, version string, version tuple.
 import sys
 print(sys.executable)
 print(sys.version)
 print(sys.version_info)

# Print the interpreter version once more, this time via the platform module.
from platform import python_version

print(python_version())


import numpy as np
import pandas as pd
import os
import datetime
import sys
#from urllib.error import URLerror
import urllib.request
from urllib.error import HTTPError
from colorama import Fore

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


## Requesting & preprocessing Data

1. Data Download
2. Data import
3. Data wrangling


In [None]:
# ----------------------- Download file -----------------------
# Download URL format (the trailing date changes per file; the spelling
# "disbtribution" is part of the URL exactly as the helpers below build it):
# https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-03-22.xlsx

# Fixed end date used as the upper bound of the date slices in the plotting cells.
# NOTE: this is hard-coded; the download helpers below use the current system date instead.
today = '2020-06-29'

def check_directory(cwd):
    """Report whether the working directory can be used for downloads.

    Args:
        cwd: Path of the directory to check.

    Returns:
        bool: True if the path exists and the current process may read,
        write and enter (execute/search) it; False otherwise.
    """
    # Access modes are bit flags and must be combined with `|`. Chaining them
    # with boolean `or` yields only the first non-zero flag (os.F_OK is 0), so
    # just os.X_OK would be tested. Existence is implied by any mode check.
    return os.access(cwd, os.R_OK | os.W_OK | os.X_OK)

def check_existence(file_name):
    """Tell whether `file_name` refers to an existing regular file.

    Args:
        file_name: Path to test, absolute or relative to the working directory.

    Returns:
        bool: The result of ``os.path.isfile`` for the given path.
    """
    is_regular_file = os.path.isfile(file_name)
    return is_regular_file


def get_link_and_filename():
    """Build the ECDC download URL and local file name for the current date.

    The date part is ``datetime.datetime.now()`` formatted as YYYY-MM-DD.

    Returns:
        tuple[str, str]: ``(download URL, file name)``.
    """
    date_part = datetime.datetime.now().strftime("%Y-%m-%d")
    extension = '.xlsx'
    url_prefix = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-'
    name_prefix = "COVID-19-geographic-disbtribution-worldwide-"
    return (
        f"{url_prefix}{date_part}{extension}",
        f"{name_prefix}{date_part}{extension}",
    )

def get_link_and_filename_yesterday():
    """Build the ECDC download URL and local file name for the previous day.

    The date part is ``datetime.datetime.now()`` minus one day, formatted as
    YYYY-MM-DD. Both resulting strings are also printed, one per line.

    Returns:
        tuple[str, str]: ``(download URL, file name)``.
    """
    one_day = datetime.timedelta(days=1)
    date_part = (datetime.datetime.now() - one_day).strftime("%Y-%m-%d")
    extension = '.xlsx'
    url_prefix = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-'
    name_prefix = "COVID-19-geographic-disbtribution-worldwide-"

    link_str = f"{url_prefix}{date_part}{extension}"
    file_name = f"{name_prefix}{date_part}{extension}"
    for item in (link_str, file_name):
        print(item)
    return link_str, file_name




def file_yesterday():
    """Return the local file name used for the previous day's data file.

    Returns:
        str: ``COVID-19-geographic-disbtribution-worldwide-<YYYY-MM-DD>.xlsx``
        where the date is ``datetime.datetime.now()`` minus one day.
    """
    previous_day = datetime.datetime.now() - datetime.timedelta(days=1)
    date_part = previous_day.strftime("%Y-%m-%d")
    return "COVID-19-geographic-disbtribution-worldwide-" + date_part + ".xlsx"
    
# Debug output: the (link, file name) pair for yesterday's data file.
yesterday_link_and_name = get_link_and_filename_yesterday()
print(yesterday_link_and_name)
    
def _report_download_error(err):
    """Print a download failure message for a URLError or HTTPError."""
    # Only HTTPError carries `.code`; a plain URLError (e.g. DNS failure or
    # refused connection) only has `.reason`, so fall back to that.
    detail = getattr(err, "code", None)
    if detail is None:
        detail = getattr(err, "reason", err)
    print(f"{Fore.RED}Error occured (maybe data not online yet) "
          + Fore.BLUE + f"ERROR CODE: {detail}")


def file_download():
    """Download the current ECDC data file, falling back to yesterday's file.

    Nothing is downloaded when the file for the current date already exists in
    the working directory. If the download for the current date fails,
    yesterday's file is fetched instead (unless it is already present).
    Problems are reported on stdout; URL/HTTP errors are not propagated.

    Returns:
        None
    """
    cwd = os.getcwd()
    link, file_name = get_link_and_filename()
    link_yest, file_yest = get_link_and_filename_yesterday()

    if not check_directory(cwd):
        # Previously this case was skipped without any feedback.
        print(f"{Fore.RED}Work directory ( {cwd} ) is not accessible")
        return
    print(f"Work directory ( {cwd} ) is OK")

    if check_existence(file_name):
        print(f"File ( {file_name} ) already exists")
        return

    try:
        urllib.request.urlretrieve(link, file_name)
        return
    except urllib.error.URLError as err:
        _report_download_error(err)

    # Fallback: the file for the current date may not be published yet.
    if check_existence(file_yest):
        print(f"File ( {file_yest} ) already exists")
        return
    try:
        urllib.request.urlretrieve(link_yest, file_yest)
        print(f"Downloaded yesterday's file ( {file_yest} ) instead")
    except urllib.error.URLError as err:
        _report_download_error(err)
                
                
                    

        
# Fetch the data file. file_download() determines link and file name itself,
# so the former bare get_link_and_filename() call (result discarded) is dropped.
file_download()

In [None]:
# ----------------------- Read file / create pandas dataframe -----------------------

cwd = os.getcwd()
link, file_name = get_link_and_filename()

# Prefer the file for the current date; fall back to yesterday's file otherwise.
data_file = file_name if check_existence(file_name) else file_yesterday()
if not check_existence(data_file):
    # Fail here with a clear message instead of a NameError on `xlsx` further down.
    raise FileNotFoundError(
        f"No data file found: neither {file_name} nor {data_file} exists in {cwd}"
    )

# instance on xlsx file
xlsx = pd.ExcelFile(data_file)

# read instance to pandas dataframe
df = pd.read_excel(xlsx, parse_dates=True)

# convert date to correct pd datetime format
df['dateRep'] = pd.to_datetime(df['dateRep'])

# set date as index
df.set_index('dateRep', inplace=True)

# change order for timeseries: sorting returns a new frame unless done in place,
# so sort in place to actually reorder `df` ('dateRep' is the index at this point)
df.sort_index(inplace=True)

# show the sorted frame as cell output
df

In [None]:
# ----------------------- Data wrangling -----------------------
# not necessary yet !?!

# check data
index = df.index
columns = df.columns
values = df.values

# `display(type)` would only show the builtin `type`; show the frame's type instead
display(type(df))
display(columns)
display(index)
display(values)
# display(type(df['DateRep']))
# df['DateRep'].dtype

## Plotting Data

* TODO:
    * implement date selection and autofill
    * wrap repeated functionality in functions
    * Show various statistics

### Plotting Timeseries

### Plotting cumulative sum 

In [None]:
# ----------------------- Plot Austria / Italy -----------------------

# TODO:
#  - choose date and fill in everywhere automatically
#  - Automatically show statistics

# Columns shown in every plot of this cell
plot_cols = ['cases', 'deaths']

# Per-country frames in chronological order
df_aut = df.loc[df['countriesAndTerritories'] == 'Austria'].sort_index(ascending=True)
df_it = df.loc[df['countriesAndTerritories'] == 'Italy'].sort_index(ascending=True)
df_aut_march = df_aut.loc['2020-03-01':today]
df_it_march = df_it.loc['2020-03-01':today]

# show some data
# Running totals over the whole Austrian series, then restricted to rows whose
# `month` column is > 1. Only the needed column is accumulated instead of
# running cumsum() over every column of the frame.
aut_cumsum_cases = df_aut['cases'].cumsum().loc[df_aut['month'] > 1]
aut_cumsum_deaths = df_aut['deaths'].cumsum().loc[df_aut['month'] > 1]
# print("Cases in Austria: ", aut_cumsum_cases)
# print("Deaths in Austria: ", aut_cumsum_deaths)

# Plots: daily values and cumulative sums, one figure each.
# The cumulative sums are taken over the plotted columns only.
for frame, title in (
    (df_aut_march[plot_cols], "Austria - daily data"),
    (df_it_march[plot_cols], "Italy - daily data"),
    (df_aut_march[plot_cols].cumsum(), "Austria - cumulative sum "),
    (df_it_march[plot_cols].cumsum(), "Italy - cumulative sum "),
):
    ax = frame.plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
    ax.set_xlabel("Date")
    ax.set_ylabel("Count")
    ax.set_title(title)

In [None]:
# Columns shown in every plot of this cell
plot_cols = ['cases', 'deaths']

#  USA-Data
df_usa = df.loc[df['countriesAndTerritories'] == 'United_States_of_America']
df_usa = df_usa.sort_index(ascending=True)
# Use the notebook-wide `today` end date instead of repeating the literal date
df_usa_all = df_usa.loc['2020-03-01':today]

# Plot USA
ax = df_usa_all.plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
ax.grid(True)
ax.grid(which='minor', axis='both', linestyle='--')
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("US - daily data")

# Accumulate only the plotted columns rather than every column of the frame
ax = df_usa_all[plot_cols].cumsum().plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("US - cumulative sum ")


# Sweden-Data
df_swe = df.loc[df['countriesAndTerritories'] == 'Sweden']
df_swe = df_swe.sort_index(ascending=True)
df_swe_all = df_swe.loc['2020-03-01':today]

# Sweden Plot
ax = df_swe_all.plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Sweden - daily data")

ax = df_swe_all[plot_cols].cumsum().plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Sweden - cumulative sum ")


In [None]:
# Show the file name expected for yesterday's data file.
yesterday_file_name = file_yesterday()
print(yesterday_file_name)

In [None]:
# Daily case counts for Austria, restricted to rows whose `month` column is > 1
cases = df_aut.loc[df_aut['month'] > 1, 'cases']
for c in cases:
    print(c)

print()

print("Cum Sum: ")
cumsum = cases.cumsum()
print(type(cumsum))
# The Series is indexed by date, so select the 51st entry by position
# explicitly instead of relying on an integer key.
if cumsum.size > 50:
    print(cumsum.iloc[50])

for i in cumsum:
    print(i)

# Number of entries in the running total (the former counter started at 1 and
# therefore reported one more than the actual number of entries).
count = cumsum.size
display("count: " + str(count))


In [None]:
#count = 0
# print(cumsum.size)
# print(cumsum[cumsum.size-1])
#print()

# Day-over-day relative growth (in percent) of the running total.
# Start at the second entry: with i == 0 the "previous" position i-1 would be
# -1, i.e. the last entry of the series. Positions are addressed via .iloc
# because the Series is indexed by date.
for i in range(1, cumsum.size):
    a = cumsum.iloc[i - 1]
    b = cumsum.iloc[i]
    # Growth relative to a total of 0 is undefined; report NaN instead of dividing by zero.
    relativeGrowth = (b - a) / a * 100 if a != 0 else float('nan')
    print(a, b, relativeGrowth)


In [None]:
# Germany
plot_cols = ['cases', 'deaths']

df_ger = df.loc[df['countriesAndTerritories'] == 'Germany']
df_ger = df_ger.sort_index(ascending=True)
# Use the notebook-wide `today` end date instead of repeating the literal date
df_ger_today = df_ger.loc['2020-02-01':today]

ax = df_ger_today.plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Germany - daily data")

# Accumulate only the plotted columns rather than every column of the frame
ax = df_ger_today[plot_cols].cumsum().plot(y=plot_cols, grid=True, marker=11, figsize=(14, 9))
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Germany - cumulative sum ")