# 1) Update all data

In [None]:
# %load ../src/data/get_data.py
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json
import git

# # Data Understanding
# 
# ## Data Sources
# * RKI, webscraping https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
# * John Hopkins (GIT) https://github.com/CSSEGISandData/COVID-19.git
# * Rest API to retrieve covid data from NPGEO https://npgeo-corona-npgeo-de.hub.arcgis.com/
# 

# ###  John Hopkins Source

def get_johns_hopkins():
    ''' Get data by a git pull request, the source code has to be pulled first
        Result is stored in the predifined csv structure
    '''
    # git_pull = subprocess.Popen( "/usr/bin/git pull" ,
    #                      cwd = os.path.dirname( 'data/raw/COVID-19/' ),
    #                      shell = True,
    #                      stdout = subprocess.PIPE,
    #                      stderr = subprocess.PIPE )
    # (out, error) = git_pull.communicate()


    # print("Error : " + str(error))
    # print("out : " + str(out))

    g = git.cmd.Git(r'C:\Users\SURAJ BHAT\Desktop\eds-covid19\data\raw\COVID-19')
    msg = g.pull()
    print(msg)


# ## REST API CALLS


## data request for Germany

def get_current_data_germany():
    ''' Get current data from germany, attention API endpoint not too stable
        Result data frame is stored as pd.DataFrame

    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    json_object=json.loads(data.content)
    full_list=[]
    
    for pos,each_dict in enumerate (json_object['features'][:]):
        full_list.append(each_dict['attributes'])

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv(r'C:\Users\SURAJ BHAT\Desktop\eds-covid19\data\raw\NPGEO\GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

if __name__ == "__main__":
    get_johns_hopkins()
    get_current_data_germany()


# 2) Process Pipeline

In [1]:
# %load ../src/data/Process_JohnHopkins_data.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data_confirmed():
    ''' Transformes the COVID data in a relational data set

    '''

    data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))

def store_relational_JH_data_deaths():
    ''' Transformes the COVID data in a relational data set

    '''

    data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'deaths'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('../data/processed/COVID_relational_deaths.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))

def store_relational_JH_data_recovered():
    ''' Transformes the COVID data in a relational data set

    '''

    data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'recovered'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('../data/processed/COVID_relational_recovered.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
 

def store_confimed_data_for_sir():
    data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw = pd.read_csv(data_path)
    pd_raw = pd_raw.drop(['Lat','Long','Province/State'],axis=1)
    pd_raw = pd_raw.rename(columns={'Country/Region':'country'})
    pd_flat_table = pd_raw.set_index('country') \
                    .T \
                    .stack(level=[0]) \
                    .reset_index() \
                    .rename(columns={'level_0':'date',
                                    0:'confirmed'}
                                    )
    pd_flat_table['date'] = pd_flat_table.date.astype('datetime64[ns]')
    pd_flat_table = pd.pivot_table(pd_flat_table, values='confirmed', index='date', columns='country', aggfunc=np.sum, fill_value=0).reset_index()
    pd_flat_table.to_csv('../data/processed/COVID_full_flat_table.csv',sep=';',index = False)
    #print(pd_flat_table.tail())
    print('Number of rows stored - Full Flat Table: '+str(pd_flat_table.shape[0]))

if __name__ == '__main__':

    store_relational_JH_data_confirmed()
    store_relational_JH_data_deaths()
    store_relational_JH_data_recovered()
    store_confimed_data_for_sir()

 Number of rows stored: 62510
 Number of rows stored: 62510
 Number of rows stored: 59455
Number of rows stored - Full Flat Table: 235


# 3) Filter and Doubling Rate Calculation

In [5]:
# %load ../src/features/build_features.py
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        Parameters:
        ----------
        in_array : pandas.series

        Returns:
        ----------
        Doubling rate: double
    '''

    y = np.array(in_array)
    #print(y)
    X = np.arange(-1,2).reshape(-1, 1)
    #print(X)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_


    return intercept/slope


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function 
        it ensures that the data structure is kept'''
    window=5, 
    degree=1
    df_result=df_input
    
    filter_in=df_input[column].fillna(0) # attention with the neutral element here
    
    result=signal.savgol_filter(np.array(filter_in),
                           5, # window size used for filtering
                           1)
    df_result[column+'_filtered']=result
    return df_result


def rolling_reg(df_input,col='confirmed'):
    ''' input has to be a data frame'''
    ''' return is single series (mandatory for group by apply)'''
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)
    return result

def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    pd_filtered_result=df_output[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter)#.reset_index()

    #print('--+++ after group by apply')
    #print(pd_filtered_result[pd_filtered_result['country']=='Germany'].tail())

    #df_output=pd.merge(df_output,pd_filtered_result[['index',str(filter_on+'_filtered')]],on=['index'],how='left')
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    #print(df_output[df_output['country']=='Germany'].tail())
    return df_output.copy()

def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])


    return df_output


if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))
    pd_JH_data=pd.read_csv('..\data\processed\COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    #test_structure=pd_JH_data[((pd_JH_data['country']=='US')|
    #                  (pd_JH_data['country']=='Germany'))]

    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')
    print(pd_result_larg.head())

    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.NaN)

the test slope is: [2.]
            date    state       country  confirmed  confirmed_filtered  \
0     2020-01-22  Alberta        Canada        0.0                 0.0   
39715 2020-01-22       no  Korea, South        1.0                 0.8   
39950 2020-01-22       no        Kosovo        0.0                 0.0   
40185 2020-01-22       no        Kuwait        0.0                 0.0   
40420 2020-01-22       no    Kyrgyzstan        0.0                 0.0   

       confirmed_DR  confirmed_filtered_DR  
0               NaN                    NaN  
39715           NaN                    NaN  
39950           NaN                    NaN  
40185           NaN                    NaN  
40420           NaN                    NaN  


# 4) SIR MODEL

In [3]:
# %load ../src/models/SIR model.py
import pandas as pd
import numpy as np
from scipy import optimize
from scipy import integrate
import warnings

'''
    Default paramter initializations
'''
N0 = 5000000
I0 = 20   #Infected population
S0 = N0 - I0    #Suspected population
R0 = 0          #Recovered population
beta = 0.4      #Rate of infection
gamma = 0.1     #Rate of recovery
t = 0
def Handle_SIR_Modelling(ydata):
    global t
    t = np.arange(len(ydata))
    global I0
    I0 = ydata[0]   #Infected population
    popt, pcov = optimize.curve_fit(fit_odeint, t, ydata, maxfev=10000)
    fitted = fit_odeint(t, *popt)
    return t, ydata, fitted

def SIR_model_fit(SIR, time, beta, gamma):
    '''
    Simple SIR model implementation.
    S: Suspected population
    I: Infected population
    R: Recovered population
    beta: rate of infection
    gamma: rate of recovery
    time: for integral as define in odeint function of scipy.integrate
    as per slides: ds+dI+dR = 0 and S+R+I=N (total population)
​
    Make a note tht in this model a recovered person can not get infected again.
    '''

    S,I,R = SIR
    dS_dt = -beta*S*I/N0
    dI_dt = beta*S*I/N0 - gamma*I
    dR_dt = gamma*I

    return dS_dt, dI_dt, dR_dt

def fit_odeint(x, beta, gamma):
    ''' To call integrate funtion of scipy'''
    return integrate.odeint(SIR_model_fit, (S0, I0, R0), t, args=(beta, gamma))[:,1]  #we are only fetching dI

if __name__ == '__main__':
    print('SIR Modelling Started.')
    warnings.filterwarnings('ignore')
    df_analyse = pd.read_csv('../data/processed/COVID_full_flat_table.csv', sep=';')
    df_analyse.sort_values('date', ascending=True)
    df_analyse = df_analyse.drop(['date'],axis=1)
    df_SIR_model = pd.DataFrame()
    start_count = 0
    total_rows = len(df_analyse.US)
    for each_country in df_analyse:
        #if each_country == 'Germany':
        #print(each_country)
        ydata = np.array(df_analyse[each_country][35:])
        t, ydata, fitted = Handle_SIR_Modelling(ydata)
        df_SIR_model[each_country] = fitted
    df_SIR_model.to_csv('../data/processed/COVID_SIR_Model_Data.csv',sep=';', index = False)
    print('Number of rows stored - : ' + str(df_SIR_model.shape[0]))
    print('SIR Modelling Ended.')

SIR Modelling Started.
Number of rows stored - : 200
SIR Modelling Ended.


# 4) Visual Board


In [None]:
# %load ../src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
import dash_daq as daq

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import os
print(os.getcwd())
df_input_large=pd.read_csv(r'../data/processed/COVID_final_set.csv',sep=';')
df_input_recov=pd.read_csv(r'../data/processed/COVID_final_recov_set.csv',sep=';')
df_analyse = pd.read_csv(r'../data/processed/COVID_full_flat_table.csv',sep=';')
df_SIR_data = pd.read_csv(r'../data/processed/COVID_SIR_Model_Data.csv',sep=';')







fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),


    dcc.Markdown('''
    ## Select Country for Country level Stats
    '''),


    dcc.Dropdown(
        id='country_drop_down_stats',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value='India', # which is pre-selected
        multi=False
    ),
    html.Div(children=[
                dcc.Markdown('''
                **Scale Modes**
                '''),
                dcc.RadioItems(
                    options=[
                        {'label': 'Uniform', 'value': 'linear'},
                        {'label': 'Logarithmic', 'value': 'log'},
                        
                    ],
                    value='linear',
                    id='scale_type',
                    labelStyle={'display': 'inline-block'}
                ),
            
            ]),
    dcc.Graph( id='multi_graph'),




    dcc.Markdown('''
    ## Multi-Select Country for Doubling Rate Visualisation
    '''),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany','Italy'], # which are pre-selected
        multi=True
    ),

    # dcc.Markdown('''
    #     ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
    #     '''),


    dcc.Dropdown(
    id='doubling_time',
    options=[
        
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed_DR',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope'),

    dcc.Markdown('''

            ##  Multi-Select country for visualization.

            '''),


            dcc.Dropdown(

                id = 'country_drop_down_sir',

                options = [{'label':name, 'value':name} for name in df_input_large['country'].unique()],

                value = ['Germany', 'US'], # default selected values

                multi = True # for allowing multi value selection

            ),

            dcc.Graph(figure=fig, id='sir_chart')

])


@app.callback(
    Output('multi_graph', 'figure'),
    [Input('country_drop_down_stats', 'value'),
    Input('scale_type', 'value')]
)
def update_cummulative_stacked_plot(country,scale_type):
    


    traces = []
 
    df_plot=df_input_large[df_input_large['country'] == country]

        
    df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date','deaths']].groupby(['country','date']).agg(np.sum).reset_index()
       #print(show_doubling)

    df_plot_recov = df_input_recov[df_input_recov['country'] == country]
    df_plot_recov=df_plot_recov[['state','country','recovered','date']].groupby(['country','date']).agg(np.sum).reset_index()


    fig=make_subplots(rows=4, cols=1,
                subplot_titles=("Total Confirmed", "Total Active ", "Total Recovered", 'Total Deaths'),
                shared_xaxes=False, 
                vertical_spacing=0.1,
    )
    fig.add_trace(go.Scatter(
                        x=df_plot.date,
                        y=df_plot['confirmed'],
                        mode='markers+lines',
                        opacity=0.9
                 ), row=1,col=1
    )
    fig.add_trace(go.Scatter(
                        x=df_plot.date,
                        y=df_plot['confirmed']-df_plot['deaths'],
                        mode='markers+lines',
                        opacity=0.9
                 ), row=2,col=1
    )
    fig.add_trace(go.Scatter(
                        x=df_plot_recov.date,
                        y=df_plot_recov['recovered'],
                        mode='markers+lines',
                        opacity=0.9
                        
                 ), row=3,col=1
    )
    fig.add_trace(go.Scatter(
                        x=df_plot.date,
                        y=df_plot['deaths'],
                        mode='markers+lines',
                        opacity=0.9
                 ), row=4,col=1
    )
    
    
    
    
    fig.update_xaxes(type="date",
                    tickangle=-45,
                    nticks=20,
                    tickfont=dict(size=14,color="#7f7f7f"), 
                    row=1, col=1)
    fig.update_xaxes(type="date",
                    tickangle=-45,
                    nticks=20,
                    tickfont=dict(size=14,color="#7f7f7f"), 
                    row=2, col=1)
    fig.update_xaxes(type="date",
                    tickangle=-45,
                    nticks=20,
                    tickfont=dict(size=14,color="#7f7f7f"), 
                    row=3, col=1)
    fig.update_xaxes(type="date",
                    tickangle=-45,
                    nticks=20,
                    tickfont=dict(size=14,color="#7f7f7f"), 
                    row=4, col=1)

    fig.update_yaxes(type=scale_type, row=1, col=1, title='Confirmed infected people')
    fig.update_yaxes(type=scale_type, row=2, col=1, title='Active infected people')
    fig.update_yaxes(type=scale_type, row=3, col=1, title='Recovered people')
    fig.update_yaxes(type=scale_type, row=3, col=1, title='Deaths')

    fig.update_layout(dict (

                width=1920,
                height=1080,
                template="plotly_dark"



                
        ))
    return fig



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')]
    )
def update_figure(country_list, show_doubling):
    
    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.sum).reset_index()
       #print(show_doubling)


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each,
                                
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }


@app.callback(
    Output('sir_chart', 'figure'),
    [Input('country_drop_down_sir', 'value')])
def update_figure(country_list):
    traces = []
    if(len(country_list) > 0):
        for each in country_list:
            country_data = df_analyse[each][35:]
            ydata = np.array(country_data)
            t = np.arange(len(ydata))
            fitted = np.array(df_SIR_data[each])
            #t, ydata, fitted = Handle_SIR_Modelling(ydata)            
            traces.append(dict(
                x = t,
                y = ydata,
                mode = 'markers+lines',
                name = each+str(' - Truth'),
                opacity = 0.9
            ))
            traces.append(
            dict(
                x = t,
                y = fitted,
                mode = 'markers+lines',
                name = each+str(' - Simulation'),
                opacity = 0.9
            ))
    return {
        'data': traces,
        'layout': dict(
            width = 1280,
            height = 720,
            title = 'Fit of SIR model for: '+', '.join(country_list),
            xaxis = {
                'title': 'Days', #'Fit of SIR model for '+str(each)+' cases',
                'tickangle': -45,
                'nticks' : 20,
                'tickfont' : dict(size = 14, color = '#7F7F7F')
            },
            yaxis = {
                'title': 'Population Infected',
                'type': 'log'
            }
        )
    }


if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)


C:\Users\SURAJ BHAT\Desktop\eds-covid19\notebooks
Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
