## One Run full walkthrough - SIR virus spread model for COVID around the world

* Do the full walkthrough on the large data set
* Refactor the source code and bring it to the individual scripts
* Ensure a full run with one click

# 1 Update all data

In [1]:
# %load C:\Users\Asus\ads_covid-19\src\data\get_data.py
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

if os.path.split(os.getcwd())[-1]=='notebooks':
    os.chidr("../")

def get_johns_hopkins():
    git_pull = subprocess.Popen("git pull" ,
                                cwd = os.path.dirname('C:/Users/Asus/ads_covid-19/data/raw/COVID-19/'),
                                shell = True,
                                stdout = subprocess.PIPE,
                                stderr = subprocess.PIPE)
    (out, error) = git_pull.communicate()


    print("Error : " + str(error))
    print("out : " + str(out))


def get_current_data_germany():
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')
    
    # 400 regions/ Landkreise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')
    json_object=json.loads(data.content)
    full_list=[]
    for pos,each_dict in enumerate (json_object['features'][:]):
        full_list.append(each_dict['attributes'])
        
        pd_full_list=pd.DataFrame(full_list)
        pd_full_list.to_csv('C:/Users/Asus/ads_covid-19/data/raw/NPGEO/GER_state_data.csv',sep=':')
        #print('Number of regions rows: '+str(pd_full_list.shape[0]))

if __name__ == '__main__':
    get_johns_hopkins()
    get_current_data_germany()
    

Error : b''
out : b'Already up to date.\n'


# 2 Process Pipeline

In [2]:
# %load C:\Users\Asus\ads_covid-19\src\data\process_JH_data_SIR.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_large_flat_JH_data():
    ''' Transforms the raw COVID data into a large flat table structure
    
    '''
    
    datapath='C:/Users/Asus/ads_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(datapath)
    
    time_index=pd_raw.columns[4:]
    pd_flat_table=pd.DataFrame({'date':time_index})
    
    country_list=pd_raw['Country/Region'].unique()
    
    for country in country_list:
        pd_flat_table[country]=np.array(pd_raw[pd_raw['Country/Region']==country].iloc[:,4::].sum(axis=0))
        
        
    time_idx=[datetime.strptime( each, "%m/%d/%y") for each in pd_flat_table.date]
    time_str=[each.strftime('%Y-%m-%d') for each in time_idx]
    
    pd_flat_table['date']=time_index
    
    pd_flat_table.to_csv('C:/Users/Asus/ads_covid-19/data/processed/COVID_large_flat_table.csv',sep=';',index=False )
    print('Latest date is'+str(max(pd_flat_table.date)))
    print(' Number of rows stored: '+str(pd_flat_table.shape[0]))

if __name__ == '__main__':
    store_large_flat_JH_data()



Latest date is9/9/20
 Number of rows stored: 237


# 3 SIR calculation

In [3]:
# %load C:\Users\Asus\ads_covid-19\src\features\build_features_SIR.py
import pandas as pd
import numpy as np
from scipy import optimize
from scipy import integrate

df_large_flat=pd.read_csv('C:/Users/Asus/ads_covid-19/data/processed/COVID_large_flat_table.csv',sep=';').iloc[80:]

df_list = df_large_flat.columns
df_list = list(df_list)

# Functions for SIR model
def SIR_model_t(SIR,t,beta,gamma):
    ''' Simple SIR model
        S: susceptible population
        t: time step, mandatory for integral.odeint
        I: infected people
        R: recovered people
        beta: 
        
        overall condition is that the sum of changes (differnces) sum up to 0
        dS+dI+dR=0
        S+I+R= N (constant size of population)
    
    '''
    
    S,I,R=SIR
    dS_dt=-beta*S*I/N0          #S*I is the 
    dI_dt=beta*S*I/N0-gamma*I
    dR_dt=gamma*I
    return dS_dt,dI_dt,dR_dt

#Function defined for optimize curve fit

def fit_odeint(x, beta, gamma):
    '''
    helper function for the integration
    '''
    return integrate.odeint(SIR_model_t, (S0, I0, R0), t, args=(beta, gamma))[:,1] # we only would like to get dI

#Fitting parameter for SIR model
for each in df_list[1:]:
    ydata = np.array(df_large_flat[each])
    t=np.arange(len(ydata))
    
    N0 = 10000000 #max susceptible population 
    I0=ydata[0]
    S0=N0-I0
    R0=0

    popt, pcov = optimize.curve_fit(fit_odeint, t, ydata, maxfev = 1000)
    perr = np.sqrt(np.diag(pcov))

    # get the final fitted curve
    fitted=fit_odeint(t, *popt).reshape(-1,1)
    df_large_flat[each +'_fitted'] = fitted 
    
df_large_flat.to_csv('C:/Users/Asus/ads_covid-19/data/processed/COVID_large_fitted_table.csv', sep=';',index=False)



# 4 Visual Board

In [4]:
# %load C:\Users\Asus\ads_covid-19\src\visualization\visualize_SIR.py
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input,Output

import os
print(os.getcwd())
df_input_large=pd.read_csv('C:/Users/Asus/ads_covid-19/data/processed/COVID_large_flat_table.csv',sep=';',parse_dates=[0])
df_input_large=df_input_large.sort_values('date',ascending=True)
df_input_SIR=pd.read_csv('C:/Users/Asus/ads_covid-19/data/processed/COVID_large_fitted_table.csv',sep=';')
df_input_SIR=df_input_SIR.sort_values('date',ascending=True)

fig=go.Figure()
app=dash.Dash()

app.layout=html.Div([
        dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    ## Select a Country for Visualization of a Simulated SIR Curve
    '''),

        
        dcc.Dropdown( id='country_drop_down_simulated_list',
                     options=[{'label':each,'value':each} for each in df_input_large.columns[1:]],
                     value='Germany',
                     multi=False),
    
    #Manipulating the values of beta ,gamma, t_initial, t_intro_measures,t_hold,t_relax to achieve the simulated curve

    dcc.Markdown('''
        ## Vary the different values to reshape the SIR curve(Enter a number and press Enter)
        '''),
    
    html.Label(["Infection rate in days, when no measure introduced",             
    dcc.Input(
    id='t_initial',
    type='number',
    value=28,debounce=True)]),

    html.Br(),
    html.Br(),

    html.Label(["Infection rate in days, when measure introduced",             
    dcc.Input(
    id='t_intro_measures',
    type='number',
    value=14,debounce=True)]),

    html.Br(),
    html.Br(),

    html.Label(["Infection rate in days, when measure sustained/held",             
    dcc.Input(
    id='t_hold',
    type='number',
    value=21,debounce=True)]),

    html.Br(),
    html.Br(),

    html.Label(["Infection rate in days, when measure relaxed/removed",             
    dcc.Input(
    id='t_relax',
    type='number',
    value=21,debounce=True)]),
    

    html.Br(),
    html.Br(),
    
    html.Label(["Beta max",             
    dcc.Input(
    id='beta_max',
    type='number',
    value=0.4,debounce=True)]),

    html.Br(),
    html.Br(),
    
    html.Label(["Beta min",
    dcc.Input(
    id='beta_min',
    type='number',
    value=0.1,debounce=True)]),

    html.Br(),
    html.Br(),
    
    html.Label(["Gamma",             
    dcc.Input(
    id='gamma',
    type='number',
    value=0.1,debounce=True)]),

    html.Br(),
    html.Br(),
    
    dcc.Graph(figure=fig, id='SIR_simulated', animate=False,),
    
    dcc.Markdown('''
    ## Select a Country for Visualization of a Fitted SIR Curve
    '''),
    
    dcc.Dropdown( id='country_drop_down_fitted_list',
                     options=[{'label':each,'value':each} for each in df_input_SIR.columns[1:]],
                     value='Germany',
                     multi=False),
    
    dcc.Graph(id='SIR_fitted', animate=False,)
    
        ])
        
    
@app.callback(
    Output('SIR_simulated', 'figure'),
    [Input('country_drop_down_simulated_list', 'value'),
    Input('t_initial','value'),
    Input('t_intro_measures','value'),
    Input('t_hold','value'),
    Input('t_relax','value'),
    Input('beta_max','value'),
    Input('beta_min','value'),
    Input('gamma','value')])
    
def update_figure(country,t_initial, t_intro_measures, t_hold, t_relax, beta_max, beta_min, gamma):
    ydata=df_input_large[country][df_input_large[country]>=30]
    xdata=np.arange(len(ydata))
    N0=10000000
    I0=30
    S0=N0-I0
    R0=0
    gamma    
    SIR=np.array([S0,I0,R0])
    
    t_initial
    t_intro_measures
    t_hold
    t_relax
    beta_max
    beta_min
    propagation_rates=pd.DataFrame(columns={'susceptible':S0,'infected':I0,'recovered':R0})
    pd_beta=np.concatenate((np.array(t_initial*[beta_max]),
                       np.linspace(beta_max,beta_min,t_intro_measures),
                       np.array(t_hold*[beta_min]),
                       np.linspace(beta_min,beta_max,t_relax),
                       ))
    
    def SIR_model(SIR,beta,gamma):
        'SIR model for simulatin spread'
        'S: Susceptible population'
        'I: Infected popuation'
        'R: Recovered population'
        'S+I+R=N (remains constant)'
        'dS+dI+dR=0 model has to satisfy this condition at all time'
        S,I,R=SIR
        dS_dt=-beta*S*I/N0
        dI_dt=beta*S*I/N0-gamma*I
        dR_dt=gamma*I
        return ([dS_dt,dI_dt,dR_dt])
    
    for each_beta in pd_beta:
        new_delta_vec=SIR_model(SIR,each_beta,gamma)
        SIR=SIR+new_delta_vec
        propagation_rates=propagation_rates.append({'susceptible':SIR[0],'infected':SIR[1],'recovered':SIR[2]},ignore_index=True) 
    
    fig=go.Figure()
    fig.add_trace(go.Bar(x=xdata,
                        y=ydata,
                         marker_color='red',
                         name='Confirmed Cases'                
                        ))
    
    fig.add_trace(go.Scatter(x=xdata,
                            y=propagation_rates.infected,
                            mode='lines',
                            marker_color='blue',
                            name='Simulated curve'))
    
    fig.update_layout(shapes=[
                            dict(type='rect',xref='x',yref='paper',x0=0,y0=0,x1=t_initial,y1=1,fillcolor="midnightblue",opacity=0.3,layer="below"),
                            dict(type='rect',xref='x',yref='paper',x0=t_initial,y0=0,x1=t_initial+t_intro_measures,y1=1,fillcolor="midnightblue",opacity=0.4,layer="below"),
                            dict(type='rect',xref='x',yref='paper',x0=t_initial+t_intro_measures,y0=0,x1=t_initial+t_intro_measures+t_hold,y1=1,fillcolor="midnightblue",opacity=0.5,layer='below'),
                            dict(type='rect',xref='x',yref='paper',x0=t_initial+t_intro_measures+t_hold,y0=0,x1=t_initial+t_intro_measures+t_hold+t_relax,y1=1,fillcolor="midnightblue",opacity=0.6,layer='below')
                            ],
                    title='SIR Simulation Scenario',
                    title_x=0.5,
                    xaxis=dict(title='Timeline',
                               titlefont_size=16),
                    yaxis=dict(title='Confirmed infected people (source johns hopkins csse, log-scale)',
                               type='log',
                                titlefont_size=16,
                              ),
                    width=1600,
                    height=900,
                     )
    return fig

@app.callback(
    Output('SIR_fitted', 'figure'),
    [Input('country_drop_down_fitted_list', 'value')])
    
    
def SIR_figure(country_list):
    df_SIR= df_input_SIR
    
    for n in df_SIR[1:]:
        data = []
        trace = go.Scatter(x=df_SIR.date,
                           y=df_SIR[country_list],
                           mode='lines+markers',
                           name = country_list)
        data.append(trace)
        
        trace_fit = go.Scatter(x=df_SIR.date,
                                  y=df_SIR[country_list +'_fitted'], 
                                  mode='lines+markers',
                                  name=country_list+'_fitted')
        data.append(trace_fit)
        
        
            
    return {'data': data,
            'layout' : dict(
                width=1600,
                height=900,
                title= 'SIR Fitted Curve',
                xaxis={'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },
                yaxis={'type':"log"
                      }
                
            )
        } 

if __name__ == '__main__':
    app.run_server(debug=True,use_reloader=False)

D:\MS in Germany\CVT\Study\SS20\Enterprise_Data_Science
Running on http://127.0.0.1:8050/
Debugger PIN: 267-348-341
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
