# Evaluation

In [13]:
# make path change
import os

# When the notebook is started from inside the `notebooks` folder, move one
# level up so that the relative paths used below resolve from the parent folder.
_, current_folder = os.path.split(os.getcwd())
if current_folder == 'notebooks':
    os.chdir("../")

# Last expression of the cell: displays the folder that is now the base path.
'Your base path is at: ' + os.path.split(os.getcwd())[-1]

'Your base path is at: eds'

# 1. Update the data

In [None]:
## Load the data (or)
# %load src/data/get_data.py

##### libraries ######

import pandas as pd
import numpy as np

import subprocess
import os

from datetime import datetime

import requests
import json


##### github data #####

def get_johns_hopkins():
    """
    Refresh the local Johns Hopkins data by running `git pull`.

    The command is executed inside 'data/raw/COVID-19/', so the repository
    has to be cloned to that folder beforehand.
    Whatever git writes to stderr and stdout is printed; nothing is returned.
    """

    pull_process = subprocess.Popen("git pull ",
                                    cwd='data/raw/COVID-19/',
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

    # communicate() waits for the command to finish and hands back (stdout, stderr)
    captured_out, captured_err = pull_process.communicate()

    # stderr is reported first, stdout second
    for label, payload in (("Error : ", captured_err), ("out : ", captured_out)):
        print(label + str(payload))


if __name__ == '__main__':
    get_johns_hopkins()

# 2. Process pipeline

In [15]:
## Load the data (or)
# %load src/data/process_github_data.py


##### libraries #####

import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    """
    Transform the wide COVID table into a relational (long) data set

    The confirmed-cases time series is read from the CSSEGISandData GitHub
    repository, reshaped to one row per date / state / country and written to
    'data/processed/COVID_relational_confirmed.csv' (';' separated).
    The number of stored rows and the latest date are printed.
    """

    # Load raw data
    source_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    wide_df = pd.read_csv(source_url)

    # harmonise the column names
    wide_df = wide_df.rename(columns={'Country/Region': 'country',
                                      'Province/State': 'state'})

    # countries without states get the placeholder 'no'
    wide_df['state'] = wide_df['state'].fillna('no')

    # the lat / lon position columns are not needed
    wide_df = wide_df.drop(['Lat', 'Long'], axis=1)

    # dates become rows: transpose, then stack both column levels
    transposed = wide_df.set_index(['state', 'country']).T
    stacked = transposed.stack(level=[0, 1])
    long_df = stacked.reset_index()
    long_df = long_df.rename(columns={'level_0': 'date', 0: 'confirmed'})

    # change format
    long_df['date'] = long_df.date.astype('datetime64[ns]')

    # save to drive
    long_df.to_csv('data/processed/COVID_relational_confirmed.csv', sep=';', index=False)
    print(' Number of rows stored: ' + str(long_df.shape[0]))
    print(' Latest date is: ' + str(max(long_df.date)))


if __name__ == '__main__':

    store_relational_JH_data()

 Number of rows stored: 63042
 Latest date is: 2020-09-14 00:00:00


# 3. Filtering / Smoothing and Doubling Rate Calculation

In [16]:
# %load src/features/build_features.py
##### libraries #####
import numpy as np
from sklearn import linear_model

import pandas as pd

from scipy import signal

# regression model (shared instance, re-fitted on every call below)
reg = linear_model.LinearRegression(fit_intercept=True)


def get_doubling_time_via_regression(input_array):
    """
        Approximate the doubling rate with a linear regression

            Parameters:
            ----------
            input_array : pandas.series
                must hold exactly three values

            Returns:
            ----------
            Doubling rate: intercept divided by slope of the fitted line
                (the slope is taken from reg.coef_, so the result is array-like)
    """

    # the x positions are fixed to -1, 0, 1 - hence exactly three samples
    assert len(input_array)==3

    targets = np.array(input_array)
    positions = np.arange(-1, 2).reshape(-1, 1)

    reg.fit(positions, targets)

    return reg.intercept_/reg.coef_


def savgol_filter(df_input, column='confirmed', window=3, degree=1):
    """
       Savgol Filter which can be used in groupby apply function (data structure kept)

        parameters:
        ----------
        df_input : pd.DataFrame
            must contain `column`; it is not modified
        column : str
            name of the column to be filtered
        window : int
            used for data points to calculate the filter result
        degree : int
            polynomial degree

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of df_input (same index) with the additional column
            '<column>_filtered'

    """

    # work on a copy: assigning the new column to df_input itself would
    # silently change the caller's data frame
    df_result = df_input.copy()

    filter_in = df_input[column].fillna(0)  # attention with the neutral element here

    result = signal.savgol_filter(np.array(filter_in),
                                  window,  # window size used for filtering
                                  degree   # degree of polynomial
                                  )

    df_result[column + '_filtered'] = result

    return df_result


def rolling_reg(df_input,col='confirmed', days_back=3):
    """
        Rolling regression used to approximate the doubling time

            Parameters:
            ----------
            df_input: pd.DataFrame
            col: str
                defines the used column
            days_back : int
                number of rows per rolling window (also the minimum number
                of rows required before a value is calculated)

            Returns:
            ----------
            result: outcome of the rolling apply on df_input[col]

    """

    # only complete windows are evaluated (min_periods equals the window size)
    windows = df_input[col].rolling(window=days_back, min_periods=days_back)

    return windows.apply(get_doubling_time_via_regression, raw=False)


def calc_filtered_data(df_input,filter_on='confirmed'):
    '''
    Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'state', 'country' and filter_on
        filter_on: str
            defines the used column

        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column
            ('<filter_on>_filtered') on a copy of the input data frame
    '''

    must_contain=set(['state','country',filter_on])

    assert must_contain.issubset(set(df_input.columns)), ' Error in -calc_filtered_data-func, not all required columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    # column=filter_on: forward the requested column, otherwise savgol_filter
    #   falls back to its default 'confirmed' and fails for any other column.
    # group_keys=False: keep the original row index on the result, because the
    #   merge below joins index on index.
    pd_filtered_result=(df_output[['state','country',filter_on]]
                        .groupby(['state','country'], group_keys=False)
                        .apply(savgol_filter, column=filter_on))

    df_output=pd.merge(df_output,pd_filtered_result[[filter_on+'_filtered']],left_index=True,right_index=True,how='left')

    return df_output


def calc_doubling_rate(df_input, filter_on='confirmed'):
    '''
    Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'state', 'country' and filter_on
        filter_on: str
            defines the used column

        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column ('<filter_on>_DR')
            on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Error in -calc_doubling_rate-func, not all required columns in data frame'

    # rolling regression per (state, country) group; reset_index() turns the
    # group keys and the original row index into regular columns
    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    # 'level_2' is the former row index (assumes the index of df_input is
    # unnamed, so that reset_index() gives it this default name)
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])

    return df_output


if __name__ == '__main__':

    # test structure: quick check of the regression helper on a tiny array
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    # load the relational data set (first column is parsed as date) and
    # order it by date before the rolling calculations
    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    # add 'confirmed_filtered', 'confirmed_DR' and 'confirmed_filtered_DR'
    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')

    # keep the filtered doubling rate only where more than 100 cases are
    # confirmed; np.nan is used because the np.NaN alias no longer exists
    # in NumPy 2.0
    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_larg[pd_result_larg['country']=='Germany'].tail())


the test slope is: [2.]
            date state  country  confirmed  confirmed_filtered  confirmed_DR  \
34360 2020-09-10    no  Germany   258149.0       258105.666667    160.722431   
34361 2020-09-11    no  Germany   259735.0       259567.000000    156.332930   
34362 2020-09-12    no  Germany   260817.0       260763.000000    194.577961   
34363 2020-09-13    no  Germany   261737.0       261925.333333    260.502498   
34364 2020-09-14    no  Germany   263222.0       263127.833333    217.817325   

       confirmed_filtered_DR  
34360             165.492743  
34361             168.999272  
34362             195.292440  
34363             221.132250  
34364             221.528273  


# 4. Visual Board

In [18]:
# %load src/visualization/visualize.py
##### libraries #####
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
# Show the working directory: the relative CSV path below is resolved against it.
print(os.getcwd())
# Final data set written by the feature-building step (';' separated).
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')


# Empty placeholder figure; the graph's 'figure' property is the output of the
# update_figure callback.
fig = go.Figure()

app = dash.Dash()
# Page layout: intro text, country multi-select, column selector and the graph.
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of this Project is to learn Industrial Aproach of Data Science which is called as Cross Industrial Standard Process(CRISP-DM).
    In this aproach it covers the full walkthrough: how to  gather, update,filter, transform and Automate the data.
    How to use Machine Learning approaches like Linear Regression to approximate the doubling rate ,(static) deployment of responsive
    dashboard for visualization.
    

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),


    # One option per distinct country in the data set; the id is used as a
    # callback input.
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany','India'], # which are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    # The option values are column names of df_input_large; the selected value
    # is used by the callback to pick the column that is plotted.
    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),

    # Target of the callback output ('main_window_slope', 'figure').
    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list, show_doubling):
    """
    Build the figure for the selected countries and the selected column

        Parameters:
        ----------
        country_list: list of str
            values of the 'country_drop_down' dropdown
        show_doubling: str
            value of the 'doubling_time' dropdown, i.e. the name of the
            column of df_input_large that is plotted

        Returns:
        ----------
        dict with the keys 'data' (one trace per country) and 'layout'
    """

    # The dropdown offers 'confirmed', 'confirmed_filtered', 'confirmed_DR'
    # and 'confirmed_filtered_DR'; the doubling-rate columns are the ones
    # carrying the '_DR' suffix.
    is_doubling_rate = show_doubling.endswith('_DR')

    if is_doubling_rate:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'No. of Confirmed infected people (source: johns hopkins csse, log-scale)'
              }

    # Case counts of the states of a country add up, doubling rates do not:
    # they are averaged over the states instead.
    aggregation = 'mean' if is_doubling_rate else 'sum'

    traces = []
    # a cleared multi-select may deliver None instead of an empty list
    for each in country_list or []:

        df_plot=df_input_large[df_input_large['country']==each]

        # aggregate only the plotted column; the text column 'state' cannot
        # be averaged
        df_plot=df_plot.groupby(['country','date'])[show_doubling].agg(aggregation).reset_index()

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,

            'layout': dict (
                width = 1280,
                height = 720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#171717"),
                      },

                yaxis=my_yaxis
        )
    }

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False, port = 8908)


C:\Users\prudh\Desktop\desktop\4th semster\DataScience\eds
Dash is running on http://127.0.0.1:8908/

Dash is running on http://127.0.0.1:8908/

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.



 * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
