# One full run through

- Run through the large data set
- Ensure full run with one click

In [3]:
import os
# When the current working directory is the 'notebooks' folder, move one level
# up (presumably so the relative 'data/...' paths used further down resolve
# from the project root - confirm against the repository layout).
if os.path.split(os.getcwd())[-1] == 'notebooks':
    os.chdir("../")

# Bare expression: evaluates to a message containing the last component of the
# current working directory.
'Your base path is at: ' + os.path.split(os.getcwd())[-1]

'Your base path is at: ads_covid-19'

## 1. Update all data

- close loop towards business delivery
- process should be a one click delivery
- move to large dataset

In [4]:
# %load src/data/get_data.py # Load the file directly!
# Open files
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

# Access websites
import requests
# Access json files
import json

def get_john_hopkins():
    ''' Update the local clone of the Johns Hopkins COVID-19 repository.

    Runs '/usr/bin/git pull' inside 'data/raw/COVID-19', waits for the
    command to finish and prints what it wrote to stderr and stdout.

    Raises:
        FileNotFoundError: if the git executable or the working directory
            does not exist.
    '''
    # The command is passed as an argument list, so no shell is involved in
    # launching git. stdout/stderr are captured as bytes.
    git_pull = subprocess.run(['/usr/bin/git', 'pull'],
                              cwd = os.path.dirname('data/raw/COVID-19/'),
                              stdout = subprocess.PIPE,
                              stderr = subprocess.PIPE)

    # git also reports regular fetch progress on stderr, so the 'Error' line
    # is not necessarily a failure.
    print("Error: " + str(git_pull.stderr))
    print("out: " + str(git_pull.stdout))


def get_current_data_germany():
    ''' Download the RKI district data and store it as a CSV file.

    Queries the ArcGIS 'RKI_Landkreisdaten' feature service, collects the
    'attributes' dictionary of every returned feature into a DataFrame,
    writes it to 'data/raw/NPGEO/GER_state_data.csv' (separator ';') and
    prints the number of rows.

    Raises:
        requests.exceptions.RequestException: on a timeout, connection problem
            or HTTP error status.
    '''
    # A timeout keeps the one-click run from hanging on an unresponsive server.
    data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                        timeout = 30)
    # Fail early on an HTTP error status instead of trying to parse an error page.
    data.raise_for_status()

    json_object = json.loads(data.content)
    # One row per feature; only the attribute dictionary is kept.
    full_list = [each_dict['attributes'] for each_dict in json_object['features']]

    pd_full_list = pd.DataFrame(full_list)
    pd_full_list.to_csv('data/raw/NPGEO/GER_state_data.csv',sep=';')
    print('Number of region rows:' +str(pd_full_list.shape[0]))

# Refresh both raw data sources (Johns Hopkins git clone, RKI download) when
# this code is executed as the main program.
if __name__ == '__main__':
    get_john_hopkins()
    get_current_data_germany()


Error: b'From https://github.com/CSSEGISandData/COVID-19\n   8bd11fbe..6e9880c5  master              -> origin/master\n * [new branch]        2638-Fix-State-FIPS -> origin/2638-Fix-State-FIPS\n   b534cb8b..81026de3  web-data            -> origin/web-data\n'
out: b'Updating 8bd11fbe..6e9880c5\nFast-forward\n README.md                                          |   11 +-\n csse_covid_19_data/README.md                       |   19 +-\n csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv   |  883 +--\n .../csse_covid_19_daily_reports/05-29-2020.csv     | 3523 +++++++++++\n .../csse_covid_19_daily_reports/05-30-2020.csv     | 3526 +++++++++++\n .../csse_covid_19_daily_reports/05-31-2020.csv     | 3527 +++++++++++\n .../csse_covid_19_daily_reports/06-01-2020.csv     | 3638 +++++++++++\n .../csse_covid_19_daily_reports/06-02-2020.csv     | 3642 +++++++++++\n .../csse_covid_19_daily_reports/06-03-2020.csv     | 3645 +++++++++++\n .../csse_covid_19_daily_reports/06-04-2020.csv     | 3646 +++++++++++

## Process pipeline

- process the dataset (e.g. Johns Hopkins)

In [5]:
# %load src/data/process_JH_data.py
# Process the data
import pandas as pd
import numpy as np

from datetime import datetime

# Relational dataset : like a key:values pair
def store_relational_JH_data():
    ''' Turn the Johns Hopkins confirmed-cases table into a relational table.

    Reads the global time series CSV (one column per date), renames the
    region columns to 'country' and 'state', fills missing states with 'no',
    drops 'Lat'/'Long' and reshapes the table to one row per
    (date, state, country) with the value in 'confirmed'. The result is
    written to 'data/processed/COVID_relational_confirmed.csv' (separator
    ';', no index) and the number of stored rows is printed.
    '''
    source_csv = 'data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

    wide_table = (pd.read_csv(source_csv)
                  .rename(columns = {'Country/Region' : 'country',
                                     'Province/State' : 'state'})
                  .drop(['Lat', 'Long'], axis = 1))
    wide_table['state'] = wide_table['state'].fillna('no')

    # After the transpose the dates form the row index and (state, country)
    # the columns; stacking both column levels yields one value per row.
    dates_as_rows = wide_table.set_index(['state', 'country']).T
    long_table = dates_as_rows.stack(level = [0,1]).reset_index()
    long_table = long_table.rename(columns = {'level_0' : 'date',
                                              0 : 'confirmed'})

    long_table['date'] = long_table.date.astype('datetime64[ns]')

    long_table.to_csv('data/processed/COVID_relational_confirmed.csv',sep = ';',index = False)
    print(' Number of rows stored: ' + str(long_table.shape[0]))

# Rebuild the relational confirmed-cases file when this code is executed as
# the main program.
if __name__ == '__main__':
    store_relational_JH_data()


 Number of rows stored: 37772


## Filter and Doubling Rate Calculation

- bring in the regression function, doubling rate function etc.

In [6]:
# %load src/features/build_features.py
# %load src/features/build_features.py
# Linear regression models
import numpy as np
import pandas as pd
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept = True)

from scipy import signal

def get_doubling_time_via_regression(in_array):
    ''' Approximate the doubling time from three consecutive values.

    A straight line is fitted with the module-level regressor 'reg' through
    the three values placed at x = -1, 0, 1, and intercept/slope of that line
    is returned. Because the slope is taken from reg.coef_, the result is an
    array (e.g. [2.] for the input [2, 4, 6]).

    in_array must hold exactly three values; this is checked with an assert.
    '''
    assert len(in_array) == 3

    day_offsets = np.arange(-1,2).reshape(-1,1)
    reg.fit(day_offsets, np.array(in_array))

    return reg.intercept_/reg.coef_

# Whenever the Python interpreter reads a source file, it does two things:
# it sets a few special variables like __name__, and then
# it executes all of the code found in the file.
# When the file is run as the main program, it is as if the interpreter
# inserted this assignment at the top of the module:
# __name__ = "__main__"
#if __name__ == '__main__':
    #test_data = np.array([2,4,6])
    #result = get_doubling_time_via_regression(test_data)
    #print('The test slope is: ' + str(result))

def savgol_filter(df_input, column ='confirmed', window = 5):
    ''' Smooth one column of a data frame with a Savitzky-Golay filter.

    Parameters:
        df_input: data frame that contains `column`
        column:   name of the column to filter
        window:   window length handed to scipy.signal.savgol_filter

    Returns:
        A copy of df_input with the additional column '<column>_filtered'.
        df_input itself is left unchanged.
    '''
    degree = 1 # polynomial order of the filter
    # Work on a copy so the caller's frame is not modified as a side effect
    df_result = df_input.copy()

    # Missing values are treated as 0 for the filter
    filter_in = df_input[column].fillna(0)

    result = signal.savgol_filter(np.array(filter_in),
                                 window,
                                 degree)
    df_result[str(column+'_filtered')] = result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Approximate the doubling time with a rolling regression.

    Slides a window of three rows over df_input[col] and applies
    get_doubling_time_via_regression to every complete window; positions
    without a complete window yield NaN.
    '''
    window_size = 3
    windows = df_input[col].rolling(window = window_size,
                                    min_periods = window_size)
    return windows.apply(get_doubling_time_via_regression, raw = False)

def calc_filtered_data(df_input, filter_on = 'confirmed'):
    ''' Add a Savitzky-Golay filtered version of one column.

    The column `filter_on` is filtered separately for every
    (state, country) group with savgol_filter and joined back by row index.

    Parameters:
        df_input:  data frame with the columns 'state', 'country' and `filter_on`
        filter_on: name of the column to filter

    Returns:
        A new data frame: df_input plus the column '<filter_on>_filtered'.
    '''
    must_contain = {'state', 'country', filter_on}
    # Asserting whether state, country and the filtered column are included in df_input
    assert must_contain.issubset(set(df_input.columns)), \
        'calc_filtered_data: df_input must contain the columns ' + str(sorted(must_contain))

    df_output = df_input.copy()
    # - group_keys=False keeps the original row index on the result, which the
    #   index-based merge below relies on.
    # - column=filter_on forwards the requested column; without it the filter
    #   would always look for 'confirmed'.
    pd_filtered_result = df_output[['state', 'country', filter_on]] \
        .groupby(['state', 'country'], group_keys = False) \
        .apply(savgol_filter, column = filter_on)
    df_output = pd.merge(df_output, pd_filtered_result[[str(filter_on + '_filtered')]], left_index = True, right_index = True, how = 'left')
    return df_output

def calc_doubling_rate(df_input, filter_on = 'confirmed'):
    ''' Add the rolling doubling rate of one column as '<filter_on>_DR'.

    rolling_reg is applied to `filter_on` for every (state, country) group
    and the result is joined back to df_input via the original row index.
    df_input must contain the columns 'state', 'country' and `filter_on`.
    '''
    required_columns = set(['state', 'country', filter_on])
    assert required_columns.issubset(set(df_input.columns)), 'comment after a comma is accepted'

    dr_column = filter_on + '_DR'

    # After reset_index the third index level ('level_2') carries the row
    # index of df_input; it is renamed to 'index' and used as the merge key.
    per_group = df_input.groupby(['state', 'country']).apply(rolling_reg, filter_on)
    per_group = per_group.reset_index().rename(columns = {filter_on : dr_column,
                                                          'level_2' : 'index'})

    merged = pd.merge(df_input, per_group[['index', dr_column]], left_index = True, right_on = ['index'], how = 'left')
    return merged.drop(columns = ['index'])

if __name__ == '__main__':
    # Quick check of the regression helper on a perfectly linear series
    test_data_reg = np.array([2,4,6])
    result = get_doubling_time_via_regression(test_data_reg)
    print('The test slope is: ' + str(result))

    pd_JH_data = pd.read_csv('data/processed/COVID_relational_confirmed.csv', sep = ';', parse_dates = [0])
    pd_JH_data = pd_JH_data.sort_values('date', ascending = True).copy()

    # Add 'confirmed_filtered', then the doubling rate of the raw column
    pd_result_larg = calc_filtered_data(pd_JH_data)
    pd_result_larg = calc_doubling_rate(pd_result_larg)
    # Second pass: doubling rate of the filtered column ('confirmed_filtered_DR')
    pd_result_larg = calc_doubling_rate(pd_result_larg, 'confirmed_filtered')

    # Keep the filtered doubling rate only where more than 100 cases are confirmed.
    # np.nan is used because the alias np.NaN no longer exists in NumPy 2.0.
    mask = pd_result_larg['confirmed'] > 100
    pd_result_larg['confirmed_filtered_DR'] = pd_result_larg['confirmed_filtered_DR'].where(mask, other = np.nan)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv', sep = ';', index = False)
    print(pd_result_larg[pd_result_larg['country'] == 'Germany'].tail())


The test slope is: [2.]
            date state  country  confirmed  confirmed_filtered  confirmed_DR  \
20585 2020-06-07    no  Germany   185750.0            185747.8    448.849072   
20586 2020-06-08    no  Germany   186109.0            186067.4    563.792615   
20587 2020-06-09    no  Germany   186506.0            186315.6    492.385362   
20588 2020-06-10    no  Germany   186522.0            186545.1    902.561743   
20589 2020-06-11    no  Germany   186691.0            186774.6   2017.005405   

       confirmed_filtered_DR  
20585             460.825626  
20586             511.340125  
20587             655.313843  
20588             780.026656  
20589             812.832680  


## Visual Board

- dashboard with the full list of countries (>=100)
- all features (e.g. filtered data, doubling rate) from the data need to be shown in the dashboard
- the graph starts at 100 confirmed cases

In [8]:
# %load ../src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

import plotly.graph_objects as go

import os
# Get current working directory
print(os.getcwd())
df_input_large = pd.read_csv('data/processed/COVID_final_set.csv', sep=';')

fig = go.Figure()
app = dash.Dash()
app.layout = html.Div([
    # Introductory text of the dashboard, written in Markdown
    dcc.Markdown('''
# Applied Data Science on COVID-19 data
Goals of the project:
- teach data science by applying a cross industry standard process
- automated data gathering, data transformations, filtering
- apply machine learning to approximate the doubling Timeline
- static deployment of the dashboard


'''),

    dcc.Markdown('''
## Select different countries for visualization
'''),

    # Country selection: one entry per distinct country in the data set
    dcc.Dropdown(
        id='country_drop_down',
        options=[{'label': country_name, 'value': country_name}
                 for country_name in df_input_large['country'].unique()],
        value=['US', 'Germany', 'Italy'],  # pre-selected countries
        multi=True,  # several countries can be chosen at once
    ),

    dcc.Markdown('''
## Features
'''),

    # Feature selection: exactly one column of the data set is plotted
    dcc.Dropdown(
        id='doubling_time',
        options=[
            {'label': 'Timeline Confirmed', 'value': 'confirmed'},
            {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
            {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
            {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
        ],
        value='confirmed',
        multi=False,
    ),

    # Graph that the callback fills; starts with the empty figure
    dcc.Graph(figure=fig, id='main_window_slope'),
])

@app.callback(
    Output('main_window_slope', 'figure'), # property -> figure
    [Input('country_drop_down', 'value'), # property -> value
     Input('doubling_time', 'value')])
def update_figure(country_list, show_doubling):
    ''' Build the figure for the selected countries and the selected feature.

    Parameters:
        country_list:  values of the country drop-down (list of country names)
        show_doubling: value of the feature drop-down, i.e. the name of the
                       column of df_input_large that is plotted

    Returns:
        A dictionary with the keys 'data' (one trace per country) and
        'layout', used as the 'figure' property of the graph.
    '''
    # Both doubling-rate columns end in '_DR'. A substring test for
    # 'confirmed_DR' would not match 'confirmed_filtered_DR'.
    is_doubling_rate = show_doubling.endswith('_DR')

    if is_doubling_rate:
        my_yaxis = {'type' : "log",
                    'title' : 'Approximated doubling rate over 3 days (larger numbers are better)'
                    }
    else:
        my_yaxis = {'type' : "log",
                    'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
                    }

    traces = []
    # An emptied selection may arrive as None; treat it like an empty list
    for each in country_list or []:
        df_plot = df_input_large[df_input_large['country'] == each]

        # Aggregate the states of a country per date. Only the plotted numeric
        # column is aggregated, so the text column 'state' is not involved.
        per_date = df_plot.groupby(['country', 'date'])[[show_doubling]]
        if is_doubling_rate:
            # doubling rates of several states are averaged, not added up
            df_plot = per_date.mean().reset_index()
        else:
            # confirmed cases of several states add up to the country total
            df_plot = per_date.sum().reset_index()

        traces.append(dict(x = df_plot.date,
                           y = df_plot[show_doubling],
                           mode = 'markers+lines',
                           opacity = 0.9,
                           name = each,
                          )
                       )
    return {
        'data' : traces,
        'layout' : dict(
            width = 1280,
            height = 720,

            xaxis={'title':'Timeline',
                   'tickangle':-45,
                   'nticks':20,
                   'tickfont':dict(size=14,color="#7f7f7f"),
                  },
            yaxis = my_yaxis
        )
    }


if __name__ == '__main__':
    # debug=True starts the server in debug mode. The automatic reloader is
    # switched off with use_reloader=False (presumably because the app is
    # started from a notebook - confirm), so code changes need a manual restart.
    app.run_server(debug=True, use_reloader=False)


/Users/vimstan/Desktop/Data Science/Vimstan_Covid19/ads_covid-19
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 066-136-227
Debugger PIN: 066-136-227
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
