In [49]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np



%matplotlib inline
mpl.rcParams['figure.figsize'] = (16, 10)
pd.set_option('display.max_rows', 500)

import plotly.graph_objects as go

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Load

In [50]:
# Parse the dates right while loading the file.
# This works out of the box when the dates are stored in ISO format (YYYY-MM-DD).
# parse_dates=[0] converts column number 0 from object dtype to datetime64[ns].
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])  
# Instead of .head() we use .tail() to look at the last five rows.
df_analyse.sort_values('date',ascending=True).tail()

Unnamed: 0,date,Italy,US,Spain,Germany,"Korea, South",India
176,2020-07-16,243736,3576157,258855,201450,13672,1003832
177,2020-07-17,243967,3647715,260255,202045,13711,1039084
178,2020-07-18,244216,3711413,260255,202426,13745,1077781
179,2020-07-19,244434,3773260,260255,202735,13771,1118206
180,2020-07-20,244624,3830010,264836,203325,13816,1155338


In [51]:
country_list=df_analyse.columns[1:]

# Helper Function

In [52]:
def quick_plot(x_in, df_input, y_scale='log',slider=False,mode_plot = 'markers+lines'):
    """ Quick basic plot for quick static evaluation of a time series

        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array
            array of date time object, or array of numbers
        df_input : pandas dataframe
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        y_scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider
        mode_plot: str
            plotly scatter mode used for every column whose name does not
            start with 'doubling', e.g. 'lines' or 'markers+lines'

        Returns:
        ----------
        None, the figure is displayed via fig.show()
    """
    fig = go.Figure()

    for each in df_input.columns:
        # str() keeps the prefix check working for non-string column labels
        # (e.g. integer labels), which cannot be sliced.
        if str(each).startswith('doubling'):
            # reference slopes: drawn faintly so they stay in the background;
            # a line style is passed although the mode is 'markers'
            fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[each],
                        mode = 'markers',
                        line=dict(color='Grey', width=4, dash='dot'),
                        name=each,
                        opacity=0.10))
        else:
            fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[each],
                        mode = mode_plot,
                        name=each,
                        opacity=0.8))

    fig.update_layout(autosize=True,
        width=1024,
        height=768,
        font=dict(
            family="PT Sans, monospace",
            size=18,
            color="#7f7f7f"
            )
        )
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45,
                 nticks=20,
                 tickfont=dict(size=14,color="#7f7f7f")
                )
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()
      

In [53]:
quick_plot(df_analyse.date,
           df_analyse.iloc[:,1:],
           y_scale='linear',
           slider=True,mode_plot='markers+lines')

In [54]:
threshold=100

In [55]:
# For every column after the first one, keep only the values that are greater
# than `threshold`; one array per column.
compare_list=[
    np.array(df_analyse[country][df_analyse[country]>threshold])
    for country in df_analyse.columns[1:]
]

In [56]:
pd_sync_timelines=pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T

In [57]:
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [58]:
pd_sync_timelines.head()

Unnamed: 0,Italy,US,Spain,Germany,"Korea, South",India,date
0,155.0,104.0,120.0,130.0,104.0,102.0,0
1,229.0,174.0,165.0,159.0,204.0,113.0,1
2,322.0,222.0,222.0,196.0,433.0,119.0,2
3,453.0,337.0,259.0,262.0,602.0,142.0,3
4,655.0,451.0,400.0,482.0,833.0,156.0,4


In [59]:
quick_plot(pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1],
           y_scale='log',
           slider=True)

# Doubling Rate

In [60]:
def doubling_rate(N_0,t,T_d):
    ''' Value of an exponentially growing quantity that doubles every T_d steps

        Parameters:
        ----------
        N_0 : number
            start value at t = 0
        t : number or numpy array
            elapsed time step(s)
        T_d : number
            time needed for one doubling, in the same unit as t

        Returns:
        ----------
        N_0 * 2**(t/T_d), scalar or array depending on t
    '''
    elapsed_doublings = t/T_d
    growth_factor = np.power(2,elapsed_doublings)
    return N_0*growth_factor

In [61]:
pd_sync_timelines.shape[0]

152

In [62]:
# Reference curves: start at 100 and double at fixed intervals. They cover
# 10 steps more than the number of rows of the synchronized timelines.
max_days=pd_sync_timelines.shape[0]
slope_days=np.arange(max_days+10)

norm_slopes={
    label: doubling_rate(100,slope_days,period)
    for label,period in (
        ('doubling every day',1),
        ('doubling every two days',2),
        ('doubling every 4 days',4),
        ('doubling every 10 days',10),
    )
}

In [63]:
pd_sync_timelines_w_slope=pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [64]:
# Columns 0:8 are the four reference slopes followed by the first four country
# columns; only the first 50 rows are plotted.
# NOTE(review): x_in is the full, unsliced date column while the y data is cut
# to 50 rows - confirm that the length mismatch is intended.
quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.iloc[:50,0:8],
           y_scale='log',
           slider=True,mode_plot='markers+lines')


# Linear Regression

In [116]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)

In [117]:
l_vec=len(df_analyse['Germany'])
# One integer time step per sample; the first 5 rows are left out.
X=np.arange(l_vec-5).reshape(-1,1)

# Fit in log space, where exponential growth becomes a straight line.
# NOTE(review): the first 5 rows are presumably skipped to avoid log(0) - confirm.
y=np.log(np.array(df_analyse['Germany'][5:]))


In [118]:
reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [119]:
X_hat=np.arange(l_vec).reshape(-1, 1)
Y_hat=reg.predict(X_hat)
#reg.predict([[]])

In [120]:
LR_inspect=df_analyse[['date','Germany']].copy()
LR_inspect['Prediction']=np.exp(Y_hat)
#LR_inspect

In [122]:
quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True,
          mode_plot='lines')

# Doubling Rate - Piecewise Linear Regression

In [23]:
from scipy import signal

In [24]:
# Smooth every country series with a Savitzky-Golay filter and store the result
# in a new '<country>_filter' column; df_analyse itself is extended by this loop.
# filter_cols collects the names of the new columns.
filter_cols=[]
for each in country_list:
    df_analyse[each+'_filter']=signal.savgol_filter(df_analyse[each],
                                                   19, #window size used for filtering
                                                   2)#order of fitted polynomial
    filter_cols.append(each+'_filter')

In [25]:
quick_plot(df_analyse.date,
           df_analyse[filter_cols],
           y_scale='log',
           slider=True,
           mode_plot='lines')

In [26]:
# New regressor, this time with an intercept term.
reg = linear_model.LinearRegression(fit_intercept=True)
l_vec=len(df_analyse['Germany'])
# Integer time steps as the feature; the first 5 rows are left out again.
X=np.arange(l_vec-5).reshape(-1,1)
# Target is the raw (non-log) Germany series.
y=(np.array(df_analyse['Germany'][5:]))

In [27]:
reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [28]:
reg.intercept_

-22255.6184514638

In [29]:
reg.coef_

array([1528.41473035])

In [30]:
# Ratio slope/intercept of the linear fit. Its reciprocal, intercept/slope, is
# what is used further below as a rough approximation of the doubling time.
reg.coef_/reg.intercept_

array([-0.06867546])

In [31]:
def get_doubling_time_via_regression(in_array):
    ''' Using linear regression to approximate the doubling rate

        A straight line is fitted by ordinary least squares through three
        consecutive samples placed at x = -1, 0, 1. The ratio intercept/slope
        (value at the centre divided by the change per step) is returned as
        the approximation.

        The fit is computed in closed form, so the result does not depend on
        any regressor object defined elsewhere in the notebook.

        Parameters:
        ----------
        in_array : array-like
            exactly three consecutive values of the time series

        Returns:
        ----------
        float
            intercept/slope; inf for a flat window (slope 0),
            nan if intercept and slope are both 0

        Raises:
        ----------
        ValueError
            if in_array does not contain exactly three values
    '''
    y = np.asarray(in_array, dtype=float)
    if y.shape != (3,):
        raise ValueError('in_array must contain exactly 3 values')

    # least squares for x = [-1, 0, 1]: mean(x) = 0 and sum(x**2) = 2, hence
    # intercept = mean(y) and slope = (y[2] - y[0]) / 2
    intercept = y.mean()
    slope = (y[2] - y[0]) / 2.0

    # a flat window has slope 0; report inf/nan without a runtime warning
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.divide(intercept, slope))

In [32]:
# Rolling 3-sample doubling-time estimate for every country, stored in a new
# '<country>DT' column. raw=True passes each window as a plain ndarray instead
# of building a Series per window.
for each in country_list:
    df_analyse[each+'DT']=df_analyse[each].rolling(window=3, min_periods=3).apply(get_doubling_time_via_regression, raw=True)


In [33]:
quick_plot(df_analyse.date,
           df_analyse[['ItalyDT','USDT','SpainDT','GermanyDT','IndiaDT']],
           y_scale='log',
           slider=True,
          mode_plot='lines')

In [34]:
# Same rolling estimate on the filtered series, stored as '<filtered column>_DT'.
# filter_cols_DT collects the new column names. raw=True passes each window as
# a plain ndarray instead of building a Series per window.
filter_cols_DT=[]
for each in filter_cols:
    df_analyse[each+'_DT']=df_analyse[each].rolling(window=3, min_periods=3).apply(get_doubling_time_via_regression, raw=True)
    filter_cols_DT.append(each+'_DT')

In [35]:
len(df_analyse.columns)


25

In [45]:
start_pos=40
# Select the '<country>_filter_DT' columns by name rather than by position, so
# the plot does not depend on how many columns were added to df_analyse before.
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols_DT].iloc[start_pos:],
           y_scale='log',
           slider=True,
          mode_plot='lines')

# Doubling Rate Checking

In [37]:
def doubling_time(in_array):
    ''' Use a classical doubling time formular,
     see https://en.wikipedia.org/wiki/Doubling_time

        T_d = (t_last - t_first) * ln(2) / ln(y_last / y_first)

        The samples are taken to be one time step apart, so the elapsed time
        between the first and the last sample is len(in_array) - 1 steps.

        Parameters:
        ----------
        in_array : array-like
            consecutive values of the time series (at least two)

        Returns:
        ----------
        float
            doubling time in time steps; inf if first and last value are equal,
            nan if there are fewer than two values or if the first or last
            value is not positive
    '''
    y = np.asarray(in_array, dtype=float)
    n_intervals = len(y) - 1
    # the logarithm of the growth ratio is only meaningful for positive values
    if n_intervals < 1 or y[0] <= 0 or y[-1] <= 0:
        return float('nan')
    # no growth means log(1) = 0 in the denominator; report inf without a warning
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(n_intervals*np.log(2)/np.log(y[-1]/y[0]))


In [38]:
df_analyse['Germany_DT_check']=df_analyse['Germany'].rolling(window=3, min_periods=3).apply(doubling_time)

In [48]:
# Compare the regression-based estimate with the formula-based check for Germany.
# The columns are selected by name rather than by position, so the plot does not
# depend on the column order of df_analyse.
quick_plot(df_analyse.date,
           df_analyse[['GermanyDT','Germany_DT_check']],
           y_scale='log',
           slider=True,
          mode_plot='lines')