Importing libraries

In [15]:

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

get_ipython().run_line_magic('matplotlib', 'inline')
mpl.rcParams['figure.figsize']=(20,16)
pd.set_option('display.max_rows',500)

import plotly.graph_objects as go

# Data load

In [16]:
df_analyse = pd.read_csv('../data/processed/COVID_small_flat_table.csv', sep = ';', parse_dates = [0])
# print(df_analyse.shape)
df_analyse.sort_values('date', ascending =True).tail()

Unnamed: 0,date,India,US,Italy,France,Vietnam,Germany
873,2022-06-13,43236695,85666440,17664043,29946647,10732429,26915085
874,2022-06-14,43245517,85758512,17703887,30123426,10733285,27007429
875,2022-06-15,43257730,85941290,17736696,30175534,10734151,27096571
876,2022-06-16,43270577,86057735,17773764,30228615,10734925,27124689
877,2022-06-17,43283793,86216418,17809934,30279240,10736408,27204953


In [17]:
country_list = df_analyse.columns[1:]
print(country_list)

Index(['India', 'US', 'Italy', 'France', 'Vietnam', 'Germany'], dtype='object')


# Helper Function

In [18]:
def quick_plot(x_in, df_input, y_scale = 'log', slider = False):
    """Quick basic plot for the quick and basic evaluation of the time series
    You can push the selective columns from your data using .iloc[0,6,7,8]
    
    Parameters
    ------------
    x_in : array
        array of date time object or the array of numbers
    df_input : pandas dataframe 
        The plotting matrix where each of the column is plotted 
        The name of the column will be used as a legend.
    scale : string
        Y-axis scale as a log or a linear
    slider : bool
        True or False for the x-axis slider
        
    Returns : 
    -----------
    """
    
    fig = go.Figure()
    
    for each in df_input.columns:
        fig.add_trace(go.Scatter(x = x_in, 
                             y = df_input[each], 
                            mode = 'markers+lines',
                            name = each))
    
    fig.update_layout(autosize = True,
                     width = 1024,
                     height = 768, 
                     font = dict(
                     family = "PT Sans, monospace", 
                     size =18, 
                     color = '#7f7f7f'))
    
    fig.update_yaxes(type = y_scale)
    fig.update_xaxes(tickangle = -45,
                    nticks =20,
                    tickfont = dict(size =14, color = "#7f7f7f")
                    )
    if slider == True:
        fig.update_layout(xaxis_rangeslider_visible = True)
    fig.show()
    
    

In [19]:
quick_plot(df_analyse.date, df_analyse.iloc[:,1:], y_scale = 'linear', slider =True)

In [20]:
threshold = 1000

In [21]:
from yaml import compose_all


compare_list = []
for pos, country in enumerate (df_analyse.columns[1:]):
    compare_list.append(np.array(df_analyse[country][df_analyse[country] > threshold])) # This is conditional isolating a data frame. Read the pandas documentation for more clarification

pd_sync_timeline = pd.DataFrame(compare_list, index = df_analyse.columns[1:]).T # Converting the list to the DataFrame. And adding the names of the countries as index. 
pd_sync_timeline.head() # Printing the dataframe head for better understanding.

Unnamed: 0,India,US,Italy,France,Vietnam,Germany
0,1024.0,1147.0,1128.0,1125.0,1007.0,1112.0
1,1251.0,1586.0,1694.0,1411.0,1009.0,1296.0
2,1397.0,2219.0,2036.0,1783.0,1014.0,1567.0
3,1998.0,2978.0,2502.0,2293.0,1016.0,2369.0
4,2543.0,3212.0,3089.0,2293.0,1022.0,3062.0


In [22]:
pd_sync_timeline['date'] = np.arange(pd_sync_timeline.shape[0])

In [23]:
pd_sync_timeline.head()

Unnamed: 0,India,US,Italy,France,Vietnam,Germany,date
0,1024.0,1147.0,1128.0,1125.0,1007.0,1112.0,0
1,1251.0,1586.0,1694.0,1411.0,1009.0,1296.0,1
2,1397.0,2219.0,2036.0,1783.0,1014.0,1567.0,2
3,1998.0,2978.0,2502.0,2293.0,1016.0,2369.0,3
4,2543.0,3212.0,3089.0,2293.0,1022.0,3062.0,4


In [24]:
quick_plot(pd_sync_timeline.date, pd_sync_timeline.iloc[:,:-1], y_scale = 'log', slider = True)

Exponential function for the doubling rate

$N(t) = N_0*2^{t/T}$

In [25]:
def doubling_rate(N_0,t,T_d):
    return N_0*np.power(2, t/T_d)

In [26]:
max_days = 34
# doubling_rate(1000,np.arange(max_days), 1) # Basic working principcle for the doubling rate. 

norm_slopes = {'doubling every 2 day' : doubling_rate(1000,np.arange(max_days), 2),
              'doubling every 4 day' : doubling_rate(1000,np.arange(max_days), 4),
              'doubling every 10 day' : doubling_rate(1000,np.arange(max_days), 10)}

In [27]:
pd_sync_timelines_w_slope = pd.concat([pd.DataFrame(norm_slopes), pd_sync_timeline], axis=1)
pd_sync_timelines_w_slope.head()

Unnamed: 0,doubling every 2 day,doubling every 4 day,doubling every 10 day,India,US,Italy,France,Vietnam,Germany,date
0,1000.0,1000.0,1000.0,1024.0,1147.0,1128.0,1125.0,1007.0,1112.0,0
1,1414.213562,1189.207115,1071.773463,1251.0,1586.0,1694.0,1411.0,1009.0,1296.0,1
2,2000.0,1414.213562,1148.698355,1397.0,2219.0,2036.0,1783.0,1014.0,1567.0,2
3,2828.427125,1681.792831,1231.144413,1998.0,2978.0,2502.0,2293.0,1016.0,2369.0,3
4,4000.0,2000.0,1319.507911,2543.0,3212.0,3089.0,2293.0,1022.0,3062.0,4


In [28]:
quick_plot(pd_sync_timelines_w_slope.date,
          pd_sync_timelines_w_slope.iloc[:,0:5],
          y_scale = 'log',
          slider = True)

In [29]:
pd_sync_timelines_w_slope.to_csv('../data/processed/COVID_small_sync_timeline_table.csv', sep = ';', index = False)

# Linear Regression

In [30]:
"""# Eliminating all the '0'from the data frame. Because any value divided by 0 or log 0 is not defined. 

country_list = ['India', 'US', 'Italy', 'France','Vietnam']

for each in country_list:
    df_analyse = df_analyse[df_analyse[each] != 0]
    
df_analyse.head()

# it may be possible that we might loose some of the values in the data. But that will not affect the overall results. """

"# Eliminating all the '0'from the data frame. Because any value divided by 0 or log 0 is not defined. \n\ncountry_list = ['India', 'US', 'Italy', 'France','Vietnam']\n\nfor each in country_list:\n    df_analyse = df_analyse[df_analyse[each] != 0]\n    \ndf_analyse.head()\n\n# it may be possible that we might loose some of the values in the data. But that will not affect the overall results. "

In [31]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept = False)
from sklearn.metrics import r2_score

In [32]:
# Generating the data for training model

# Since we cannot train the model based on the dated, we need to somehow arrange the numbers corresponding to each infection value. That is why the 'l_vec' is used. 

# defining the length and the training variable for the linear regression model. 
l_vec = len(df_analyse['Germany'])
X = np.arange(l_vec-5).reshape(-1,1)

# Defining the dependent variable as a log and linear to check which one gives better accuracy
y_log= np.log(np.array(df_analyse['Germany'][5:]))


In [33]:
# Training the linear model on logarithmic output. 

reg.fit(X,y_log)

LinearRegression(fit_intercept=False)

In [34]:
X_hat = np.arange(l_vec).reshape(-1,1)
y_hat = reg.predict(X_hat)

In [35]:
LR_inspect = df_analyse[['date', 'Germany']].copy()


In [36]:

LR_inspect['Prediction_log']= np.exp(y_hat)


In [37]:
quick_plot(LR_inspect.date, 
          LR_inspect.iloc[:, 1:],
          y_scale = 'log',
          slider = True)

# Doubling Rate - Piecewise Linear Regression

In [102]:
reg = linear_model.LinearRegression(fit_intercept = True)

l_vec = len(df_analyse['Germany'])
X = np.arange(l_vec-50).reshape(-1,1)
y = np.array(df_analyse['Germany'][50:])

In [143]:
df_analyse.head()

Unnamed: 0,date,India,US,Italy,France,Vietnam,Germany,India_filter,US_filter,Italy_filter,...,Vietnam_filter,Germany_filter,India_DR,US_DR,Italy_DR,France_DR,Vietnam_DR,Germany_DR,Germany_DR_math,Germany_DR_filter
0,2020-01-22,0,1,0,0,0,0,0.0,1.0,0.0,...,9.189755e-16,0.0,,,,,,,,0.0
1,2020-01-23,0,1,0,0,2,0,0.0,1.0,0.0,...,2.0,0.0,,,,,,,,0.0
2,2020-01-24,0,2,0,2,2,0,0.0,2.0,0.0,...,2.0,0.0,,2.666667,,0.666667,1.333333,,,0.0
3,2020-01-25,0,2,0,3,2,0,0.0,2.0,0.0,...,2.0,0.0,,3.333333,,1.111111,inf,,,0.0
4,2020-01-26,0,5,0,3,2,0,0.0,5.0,0.0,...,2.0,0.333333,,2.0,,5.333333,inf,,,0.333333


In [104]:
# Definging the linear function on 3 data points

def get_rate_via_regression(in_array):
    """Use the linear regression to approximate the slope"""
    y = np.array(in_array)
    x = np.arange(-1,2).reshape(-1,1) # Keeping the number of rows -1 means that we do not know the exact dimension and we would like the numpy to figure it out.
                                        # But, we do know the column should be 1. 

    assert len(in_array) == 3

    reg.fit(x,y)
    intercept = reg.intercept_
    slope = reg.coef_

    return intercept/slope



In [105]:
def doubling_time(in_array):
    '''using doubling rate formula'''
    y = np.array(in_array)
    return len(y)*np.log(2)/np.log(y[-1]/y[0])
    

In [106]:
# Calculate the slope of regression of last x days
# Using the limited number of points to better approximate the slope of the points. 
days_back = 3

for pos, country in enumerate(country_list):
    df_analyse[country+'_DR'] = df_analyse[country].rolling(
                                                            window=days_back,
                                                            min_periods=days_back).apply(get_rate_via_regression, raw = False)
    


In [107]:
df_analyse['Germany_DR'] = df_analyse['Germany'].rolling(window = 3, min_periods= 3).apply(get_rate_via_regression)



In [108]:
quick_plot(df_analyse.date, df_analyse.loc[40:,["India_DR", "US_DR", "Italy_DR", "Vietnam_DR","Germany_DR"]], y_scale='linear')

In [109]:
# Check the doubling rate by using the mathematical formula
df_analyse['Germany_DR_math']=df_analyse['Germany'].rolling(window = 3, min_periods=3).apply(doubling_time)



In [110]:
# Plot the mathematical and analytical doubling time on the same graph to check if the analytical one matches the mathematical one
quick_plot(df_analyse.date, df_analyse.loc[40:,["Germany_DR", "Germany_DR_math"]], y_scale='linear',slider=True)

In [111]:
from scipy import signal


In [139]:
# Trying out filter on a single country case
# The filter used in this case is a Savitzky-Golay Filter. For more information regarding the same, check the documentaion and internet
df_analyse['Germany' + '_filter'] = signal.savgol_filter(df_analyse['Germany'], 
                                                    window_length= 3,
                                                    polyorder=1)

In [142]:
quick_plot(df_analyse.date, df_analyse.loc[:, ['Germany','Germany_filter']], y_scale='linear',slider=True)

In [128]:
# Iterating through the list of countries and using the filter to plot the points precisely

country_list = ['India', 'US', 'Italy', 'Vietnam', 'Germany']
for each in country_list:
    df_analyse[each + '_filter'] = signal.savgol_filter(df_analyse[each], 
                                                        window_length= 3,
                                                        polyorder=2)

df_analyse.columns

Index(['date', 'India', 'US', 'Italy', 'France', 'Vietnam', 'Germany',
       'India_filter', 'US_filter', 'Italy_filter', 'France_filter',
       'Vietnam_filter', 'Germany_filter', 'India_DR', 'US_DR', 'Italy_DR',
       'France_DR', 'Vietnam_DR', 'Germany_DR', 'Germany_DR_math'],
      dtype='object')

In [133]:
# for each in country_list:
#     quick_plot(df_analyse.date, df_analyse.loc[:, [each+'_filter']], y_scale='linear')
#     plt.hold()

In [135]:
quick_plot(df_analyse.date, df_analyse.loc[:, ['India_filter', 'US_filter', 'Italy_filter', 'Vietnam_filter', 'Germany_filter']], y_scale='linear')