In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 

# Plotly for some fancy visualization, with reference to Kaggle notebooks and the Plotly library
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input mount.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read Data

In [None]:
# Per-country vaccination progress, then the per-manufacturer breakdown.
df_data, df_m_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid-world-vaccination-progress/country_vaccinations.csv',
        '../input/covid-world-vaccination-progress/country_vaccinations_by_manufacturer.csv',
    )
)

In [None]:
# First five rows of the per-country table.
display(df_data.head(5))

In [None]:
# First five rows of the per-manufacturer table.
display(df_m_data.head(5))

In [None]:
# (rows, columns) of each table: per-country first, per-manufacturer second.
for frame in (df_data, df_m_data):
    print(frame.shape)

In [None]:
# Distinct-value count and dtype of every column, then the countries covered.
for summary in (df_data.nunique(), df_data.dtypes):
    print(summary)
df_data['country'].unique()

In [None]:
# Distinct-value count and dtype of every column, then the locations covered.
for summary in (df_m_data.nunique(), df_m_data.dtypes):
    print(summary)
df_m_data['location'].unique()

# Overview

We could draw a treemap to see the vaccination progress:

**[Reference note of the treemap](https://plotly.com/python/treemaps/)**

In [None]:
# Draw the treemap to see the vaccine schemes (Package: plotly)
# Highest reported cumulative figures for each (country, vaccine scheme) pair.
# The columns are selected with a list: the tuple form ['a', 'b'] on a GroupBy
# was deprecated and then removed in pandas 2.0, where it raises an error.
country_total = (
    df_data
    .groupby(["country", "vaccines"])[['total_vaccinations', 'total_vaccinations_per_hundred']]
    .max()
    .reset_index()
)

# Outer tiles are vaccine schemes, inner tiles are countries sized by total doses.
fig = px.treemap(country_total, path = ['vaccines', 'country'], values = 'total_vaccinations',
                title="Total Vaccinations Tree Map")
fig.show()

[Reference of the worldmap](https://plotly.com/python/reference/scattergeo/#scattergeo-locationmode)

In [None]:
# Worldmap: countries coloured by their total number of vaccinations.
trace = go.Choropleth(
    locations=country_total['country'],
    locationmode='country names',
    z=country_total['total_vaccinations'],
    text=country_total['country'],
    autocolorscale=False,
    reversescale=True,
    colorscale='viridis',
    # thin black country borders
    marker_line_color='rgb(0,0,0)',
    marker_line_width=0.5,
    colorbar_title='Total vaccinations',
    colorbar_tickprefix='',
)
data = [trace]

layout = go.Layout(
    title='Total vaccinations per country',
    geo_showframe=True,
    geo_showlakes=False,
    geo_showcoastlines=True,
    geo_projection_type='natural earth',
)

fig = {'data': data, 'layout': layout}
iplot(fig)

In [None]:
# Worldmap: countries coloured by total vaccinations per hundred people.
trace = go.Choropleth(
    locations=country_total['country'],
    locationmode='country names',
    z=country_total['total_vaccinations_per_hundred'],
    text=country_total['country'],
    autocolorscale=False,
    reversescale=True,
    colorscale='viridis',
    # thin black country borders
    marker_line_color='rgb(0,0,0)',
    marker_line_width=0.5,
    colorbar_title='Total vaccinations per hundred people',
    colorbar_tickprefix='',
)
data = [trace]

layout = go.Layout(
    title='Total vaccinations per hundred people per country',
    geo_showframe=True,
    geo_showlakes=False,
    geo_showcoastlines=True,
    geo_projection_type='natural earth',
)

fig = {'data': data, 'layout': layout}
iplot(fig)

# Vaccination Progress by Country

In this section, we look at the COVID-19 and vaccination situation by country. Specifically, we choose the US, the UK and Hong Kong (HK) for analysis. India is not chosen because the COVID-19 situation there is out of control and therefore not comparable with the other countries we chose. Moreover, Hong Kong is chosen instead of China because China has almost no new local cases now. Below is the data we obtained, which will be used for the visualizations later.

* **Data Preprocess**

In [None]:
# Row subsets for the three countries compared below (US also by manufacturer).
US_data = df_data[df_data['country'].eq('United States')]
HK_data = df_data[df_data['country'].eq('Hong Kong')]
UK_data = df_data[df_data['country'].eq('United Kingdom')]
US_m_data = df_m_data[df_m_data['location'].eq('United States')]

In [None]:
# US rows: per-country table first, per-manufacturer table second.
display(US_data.head(), US_m_data.head())

In [None]:
# UK rows first, then Hong Kong rows.
display(UK_data.head(), HK_data.head())

* **Types of vaccines**

As can be seen from this dataset, vaccines in the US are produced by Johnson&Johnson, Moderna and Pfizer/BioNTech; vaccines in the UK are produced by Moderna, Oxford/AstraZeneca and Pfizer/BioNTech; vaccines in HK are produced by Pfizer/BioNTech and Sinovac. We will see later how vaccination affects the COVID-19 situation. For the US, we also have additional data that lets us analyse the vaccinations of the different vaccines over time.

In [None]:
# Distinct values per column, then the vaccine names present for the US.
print(US_m_data.nunique())
US_m_data['vaccine'].unique()

In [None]:
# Vaccine scheme strings for the US, the UK and Hong Kong, in that order.
for country_frame in (US_data, UK_data, HK_data):
    print(country_frame['vaccines'].unique())

# Vaccination Progress in US by different manufacturers


In [None]:
# Complete rows only, with display-friendly names for the plotted columns.
us_vaccine_time = (
    US_m_data[["location", "vaccine", "date", 'total_vaccinations']]
    .dropna()
    .rename(columns={"date": "Date", 'total_vaccinations': 'Total vaccinations'})
)
vaccines = ['Moderna', 'Pfizer/BioNTech', 'Johnson&Johnson']
def plot_vaccine_time(data_df, feature, title, vaccines):
    """Draw one line per vaccine showing `feature` against the date.

    data_df  -- frame with 'vaccine', 'Date' and `feature` columns
    feature  -- column plotted on the y axis; also used as the y-axis title
    title    -- figure title
    vaccines -- vaccine names to plot, one trace each
    """
    def _trace_for(vaccine_name):
        rows = data_df[data_df.vaccine == vaccine_name]
        return go.Scatter(
            x=rows['Date'], y=rows[feature],
            name=vaccine_name,
            mode="lines",
            marker=dict(size=8, symbol='circle', line=dict(width=1)),
            text=rows['vaccine'])

    # Zero-line and border styling shared by both axes.
    axis_frame = dict(zeroline=True, zerolinewidth=1, zerolinecolor='grey',
                      showline=True, linewidth=2, linecolor='black', mirror=True)
    layout = dict(
        title=title,
        xaxis=dict(title='Date', showticklabels=True,
                   tickfont=dict(size=10, color='black'), **axis_frame),
        yaxis=dict(title=feature, gridcolor='lightgrey', type="linear", **axis_frame),
        plot_bgcolor='rgba(0, 0, 0, 0)',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        hovermode='x',
        height=400,
    )
    traces = [_trace_for(vaccine_name) for vaccine_name in vaccines]
    iplot(dict(data=traces, layout=layout), filename='US_vaccine')
    
# Cumulative doses in the US, one line per manufacturer.
plot_vaccine_time(us_vaccine_time, feature='Total vaccinations',
                  title='Total vaccinations', vaccines=vaccines)

# Vaccination Progress by date

In [None]:
# Source column -> display name, in the column order used by the plots below.
column_labels = {
    "country": "Country",
    "vaccines": "Vaccines",
    "date": "Date",
    'total_vaccinations': 'Total vaccinations',
    'total_vaccinations_per_hundred': 'Percent',
    'people_vaccinated': 'People vaccinated',
    'people_vaccinated_per_hundred': 'People vaccinated percent',
    'daily_vaccinations': "Daily vaccinations",
    'daily_vaccinations_per_million': "Daily vaccinations per million",
    'people_fully_vaccinated': 'People fully vaccinated',
    'people_fully_vaccinated_per_hundred': 'People fully vaccinated percent',
}
# Only rows where every one of these columns is present are kept.
country_vaccine_time = (
    df_data[list(column_labels)]
    .dropna()
    .rename(columns=column_labels)
)
countries = ['Hong Kong', 'United Kingdom', 'United States']
def plot_time_variation_countries_group(data_df, feature, title, countries):
    """Draw one line per country showing `feature` against the date.

    data_df   -- frame with 'Country', 'Date' and `feature` columns
    feature   -- column plotted on the y axis; also used as the y-axis title
    title     -- figure title
    countries -- country names to plot, one trace each
    """
    def _trace_for(country_name):
        rows = data_df[data_df.Country == country_name]
        return go.Scatter(
            x=rows['Date'], y=rows[feature],
            name=country_name,
            mode="lines",
            marker=dict(size=8, symbol='circle', line=dict(width=1)),
            text=rows['Country'])

    # Zero-line and border styling shared by both axes.
    axis_frame = dict(zeroline=True, zerolinewidth=1, zerolinecolor='grey',
                      showline=True, linewidth=2, linecolor='black', mirror=True)
    layout = dict(
        title=title,
        xaxis=dict(title='Date', showticklabels=True,
                   tickfont=dict(size=10, color='black'), **axis_frame),
        yaxis=dict(title=feature, gridcolor='lightgrey', type="linear", **axis_frame),
        plot_bgcolor='rgba(0, 0, 0, 0)',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        hovermode='x',
        height=400,
    )
    traces = [_trace_for(country_name) for country_name in countries]
    iplot(dict(data=traces, layout=layout), filename='US_UK_HK')

In [None]:
# Total vaccinations per hundred people over time.
plot_time_variation_countries_group(country_vaccine_time, feature='Percent',
                                    title='Total vaccination percentage', countries=countries)

In [None]:
# People vaccinated per hundred over time.
plot_time_variation_countries_group(country_vaccine_time, feature='People vaccinated percent',
                                    title='People vaccinated percentage', countries=countries)

In [None]:
# People fully vaccinated per hundred over time.
plot_time_variation_countries_group(country_vaccine_time, feature='People fully vaccinated percent',
                                    title='People fully vaccinated percentage', countries=countries)

The total vaccination percentages in the UK and the US are almost the same, while in the UK a larger percentage of people are vaccinated. This indicates that in the US a larger percentage of people are fully vaccinated, as shown in the third graph.

# Linear Regression for People Vaccinated

In [None]:
from scipy import stats

# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was removed later; apply whichever this install provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Rows that have a people-vaccinated-per-hundred figure.
regression = df_data[["country", "date", 'people_vaccinated_per_hundred']].dropna()
countries = ['Hong Kong', 'United Kingdom', 'United States']

# Fit one straight line per country: percent vaccinated against the day index.
for country in countries:
    df = regression.loc[regression.country==country]
    if df.empty:
        # Nothing to fit or to label the x axis with; skip instead of crashing.
        print(country, ": no vaccination data to fit")
        continue
    y = df['people_vaccinated_per_hundred'].values
    x_date = df['date'].values
    # Day index 1..n as an array, so slope*x+intercept is plain element-wise math.
    x = np.arange(1, len(y)+1)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
    plt.plot(x,y,label="Original data")
    plt.plot(x,slope*x+intercept,label="Data using regression")
    plt.xlabel("Time starting from "+x_date[0])
    plt.ylabel("People vaccinated percent")
    plt.title("Regression Figure in "+country)
    plt.legend()
    plt.show()
    print(country,": slope =", slope, ", intercept =", intercept)
    
    
    


# SIR Model to Predict Future Tendency

**We now define the SIR model with a vaccination factor using PyTorch:**
* I: Infected; 
* R: Removed; 
* S: Susceptible; 
* Beta: Infection rate per infected person; 
* Gamma: Recovery rate of the infected; 
* k: vaccine effect factor;

**Model:**
* N = S + I + R
* S(t+1) = S(t) - S(t) * beta * I(t)- k * N
* I(t+1) = I(t) + S(t) * beta * I(t) - gamma * I(t)
* R(t+1) = R(t) + gamma * I(t) + k * N

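**Revision:**
* In the implementation below, Beta is the infection ability (number of new cases per infected person per step), so the infection term is `beta * I(t)` instead of `S(t) * beta * I(t)`.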

In [None]:
# Country names as they are spelled in the worldometer daily dataset.
country_covid = ['China Hong Kong Sar', 'USA', 'UK']
import torch
# Make float64 the default floating dtype. set_default_dtype replaces the
# deprecated torch.set_default_tensor_type(torch.DoubleTensor).
torch.set_default_dtype(torch.float64)

class Dataset(torch.utils.data.Dataset):
    """Pairs of consecutive rows (row[t], row[t+1]) taken from `country_data`.

    Parameters
    ----------
    country_data : array-like of rows; the training cells pass
        (Susceptible, Infected, Recovered) rows, one per day.
    is_full_dataset : when True every row is kept; otherwise only the first
        `max_rows` rows are kept and the kept window is printed.
    max_rows : size of the truncated window (default 20, the previously
        hard-coded value); None disables truncation.
    """
    def __init__(self, country_data, is_full_dataset=False, max_rows=20):
        # Observations are fixed inputs, not learnable values, so the tensor is
        # built once and without autograd tracking.
        self.data = torch.tensor(country_data)
        if not is_full_dataset and max_rows is not None:
            self.data = self.data[:max_rows]
            print(self.data)
    def __len__(self):
        # n rows give n-1 consecutive pairs; never report a negative length.
        return max(len(self.data)-1, 0)
    def __getitem__(self,index):
        # (current row, next row)
        return self.data[index],self.data[index+1]

In [None]:
# One step of the SIR model with a vaccination term
def SIR_V(St,It,Rt,beta,gamma,k):
    """Advance (S, I, R) by one step and return the new (S, I, R) tuple.

    St, It, Rt -- current susceptible, infected and removed counts
    beta       -- infection rate (new infections are St * beta * It)
    gamma      -- share of the infected that moves to removed
    k          -- share of the whole population moved from S to R by vaccination
    """
    population = St + It + Rt
    newly_infected = St * beta * It
    newly_removed = gamma * It
    newly_vaccinated = k * population
    return (
        St - newly_infected - newly_vaccinated,
        It + newly_infected - newly_removed,
        Rt + newly_removed + newly_vaccinated,
    )

In [None]:
class Feedforward(torch.nn.Module):
    """One-step S/I/R update with three learnable scalars: beta, gamma and k.

    New infections are computed as beta * I (no S factor), recoveries as
    gamma * I, and vaccinations as k * (S + I + R).
    """
    def __init__(self, is_debug = True):
        super().__init__()
        self.is_debug = is_debug
        # Initial values: these are important. nn.Parameter already tracks
        # gradients, so requires_grad does not need to be set by hand.
        self.beta = torch.nn.Parameter(torch.tensor(0.002))
        self.gamma = torch.nn.Parameter(torch.tensor(0.001))
        # Based on the slope of the curve in last section
        self.k = torch.nn.Parameter(torch.tensor(0.0001))

    def forward(self,data):
        """Map a 3-item (S, I, R) state to the next-step (S, I, R) tuple."""
        St, It, Rt = data
        population = St + It + Rt
        infected_now = self.beta * It
        removed_now = self.gamma * It
        vaccinated_now = self.k * population
        if self.is_debug:
            print(self.beta,self.gamma,self.k)
        return (
            St - infected_now - vaccinated_now,
            It + infected_now - removed_now,
            Rt + removed_now + vaccinated_now,
        )

# Model in quiet mode (no parameter printing on each forward pass).
model = Feedforward(is_debug=False)
# This is discarded later and replaced by handwritten loss function
criterion = torch.nn.MSELoss()
# Adam over the model's three scalar parameters with a small step size.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

* Train parameters using USA data.

In [None]:
# Get US data
covid = pd.read_csv('../input/covid19-global-dataset/worldometer_coronavirus_daily_data.csv')
# Explicit copy: the columns added below then never target a slice of `covid`
# (avoids SettingWithCopyWarning).
country_data_US = covid.loc[covid['country'] == 'USA'].copy()
print(country_data_US.nunique())

# 329227746 is the total population N used for the US (presumably the US
# population -- TODO confirm source).
country_data_US['Susceptible'] = 329227746-country_data_US['cumulative_total_cases']
country_data_US['Infected'] = country_data_US['active_cases'] 
# "Recovered" is everyone who was a case but is no longer active.
country_data_US['Recovered'] = country_data_US['cumulative_total_cases'] - country_data_US['active_cases'] 
country_data_US = country_data_US[['Susceptible','Infected','Recovered']]
# Vaccination data for the USA starts on Jan13 2021 (that would be .tail(119));
# the most recent 80 rows are used instead, when vaccination is more stable.
country_data_US = country_data_US.tail(80)
country_data_US = country_data_US.values

# Initialize a dataset
US_Covid_dataset = Dataset(country_data_US)
US_dataloader = torch.utils.data.DataLoader(US_Covid_dataset, batch_size=1, shuffle=True, num_workers=0)

# Fit from the documented initial parameters with a clean Adam state, so that
# re-running this cell (or running cells out of order) does not continue
# training a model left over from an earlier run.
model = Feedforward(is_debug=False)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.00001)

loss_log = []
for epoch in range(20):
    train_loss = 0
    for x_batch, y_batch in US_dataloader:
        optimizer.zero_grad()
        # batch_size=1: drop the batch axis to get one (S, I, R) row each.
        x_train = x_batch[0]
        y_train = y_batch[0]
        # Forward pass (names chosen so no loop variable is overwritten)
        s_pred, i_pred, r_pred = model(x_train)
        # Compute Loss: summed squared error over the three compartments
        loss = (y_train[0]-s_pred).pow(2) + (y_train[1] - i_pred).pow(2) + (y_train[2] - r_pred).pow(2)
        train_loss += loss.item()
        # Backward pass
        loss.backward()
        optimizer.step()
    loss_log.append(train_loss)

torch.save(model.state_dict(),'./US_SIR_parameter.pth')

# Plot loss
def plot_loss(epoch,log, country):
    """Plot the per-epoch training loss for one country.

    epoch   -- x values (epoch indices), same length as `log`
    log     -- accumulated training loss of each epoch
    country -- name used in the x-axis label and the title
    """
    # The line carries a label so that plt.legend() has an entry to show;
    # without one the legend is empty and matplotlib warns.
    plt.plot(epoch, log, label="Training loss")
    plt.xlabel("Epoch for "+ country)
    plt.ylabel("MSELoss")
    plt.title("Training Loss in "+country)
    plt.legend()
    plt.show()

plot_loss(range(20),loss_log, "USA")

* Train parameter using HK data.

In [None]:
# Get HK data
covid = pd.read_csv('../input/covid19-global-dataset/worldometer_coronavirus_daily_data.csv')
# Explicit copy: the columns added below then never target a slice of `covid`
# (avoids SettingWithCopyWarning).
country_data_HK = covid.loc[covid['country'] == 'China Hong Kong Sar'].copy()
print(country_data_HK.nunique())

# 7507000 is the total population N used for HK (presumably Hong Kong's
# population -- TODO confirm source).
country_data_HK['Susceptible'] = 7507000-country_data_HK['cumulative_total_cases']
country_data_HK['Infected'] = country_data_HK['active_cases']
# "Recovered" is everyone who was a case but is no longer active.
country_data_HK['Recovered'] = country_data_HK['cumulative_total_cases'] - country_data_HK['active_cases']
country_data_HK = country_data_HK[['Susceptible', 'Infected', 'Recovered']]
# Starting from Mar 2021 (when we have vaccination data for HK)
country_data_HK = country_data_HK.tail(70)
country_data_HK = country_data_HK.values

# Initialize a dataset
HK_Covid_dataset = Dataset(country_data_HK)
HK_dataloader = torch.utils.data.DataLoader(HK_Covid_dataset, batch_size=1, shuffle=True, num_workers=0)

# Fit HK from the documented initial parameters with a clean Adam state.
# Reusing the model already trained on another country would make the saved
# "HK" parameters depend on which cells ran before this one.
model = Feedforward(is_debug=False)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.00001)

loss_log = []
for epoch in range(500):
    train_loss = 0
    for x_batch, y_batch in HK_dataloader:
        optimizer.zero_grad()
        # batch_size=1: drop the batch axis to get one (S, I, R) row each.
        x_train = x_batch[0]
        y_train = y_batch[0]
        # Forward pass (names chosen so no loop variable is overwritten)
        s_pred, i_pred, r_pred = model(x_train)
        # Compute Loss: summed squared error over the three compartments
        loss = (y_train[0]-s_pred).pow(2) + (y_train[1] - i_pred).pow(2) + (y_train[2] - r_pred).pow(2)
        train_loss += loss.item()
        # Backward pass
        loss.backward()
        optimizer.step()
    loss_log.append(train_loss)

torch.save(model.state_dict(),'./HK_SIR_parameter.pth')

plot_loss(range(500),loss_log, "HK")

* Train parameter using UK data

In [None]:
# Get UK data
covid = pd.read_csv('../input/covid19-global-dataset/worldometer_coronavirus_daily_data.csv')
# Explicit copy: the columns added below then never target a slice of `covid`
# (avoids SettingWithCopyWarning).
country_data_UK = covid.loc[covid['country'] == 'UK'].copy()
print(country_data_UK.nunique())

# 66650000 is the total population N used for the UK (presumably the UK
# population -- TODO confirm source).
country_data_UK['Susceptible'] = 66650000-country_data_UK['cumulative_total_cases']
country_data_UK['Infected'] = country_data_UK['active_cases']
# "Recovered" is everyone who was a case but is no longer active.
country_data_UK['Recovered'] = country_data_UK['cumulative_total_cases'] - country_data_UK['active_cases']
country_data_UK = country_data_UK[['Susceptible','Infected','Recovered']]
# Vaccination data for the UK starts in Jan 2021 (that would be .tail(122));
# the most recent 80 rows are used instead.
country_data_UK = country_data_UK.tail(80)
country_data_UK = country_data_UK.values

# Initialize a dataset
UK_Covid_dataset = Dataset(country_data_UK)
UK_dataloader = torch.utils.data.DataLoader(UK_Covid_dataset, batch_size=1, shuffle=True, num_workers=0)

# Fit the UK from the documented initial parameters with a clean Adam state.
# Reusing the model already trained on another country would make the saved
# "UK" parameters depend on which cells ran before this one.
model = Feedforward(is_debug=False)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.00001)

loss_log = []
for epoch in range(100):
    train_loss = 0
    for x_batch, y_batch in UK_dataloader:
        optimizer.zero_grad()
        # batch_size=1: drop the batch axis to get one (S, I, R) row each.
        x_train = x_batch[0]
        y_train = y_batch[0]
        # Forward pass (names chosen so no loop variable is overwritten)
        s_pred, i_pred, r_pred = model(x_train)
        # Compute Loss: summed squared error over the three compartments
        loss = (y_train[0]-s_pred).pow(2) + (y_train[1] - i_pred).pow(2) + (y_train[2] - r_pred).pow(2)
        train_loss += loss.item()
        # Backward pass
        loss.backward()
        optimizer.step()
    loss_log.append(train_loss)

torch.save(model.state_dict(),'./UK_SIR_parameter.pth')
plot_loss(range(100),loss_log, "UK")

# Prediction using SIR parameters

* Define Plot Function

In [None]:
covid = pd.read_csv('../input/covid19-global-dataset/worldometer_coronavirus_daily_data.csv')
def plot_prediction(country_data, country, t, type_of_people, location):
    """Roll the fitted model forward from the first row and plot it against the data.

    Parameters
    ----------
    country_data : 2-D array of rows with columns (Susceptible, Infected, Recovered).
    country : short code; selects './<country>_SIR_parameter.pth' and is used in labels.
    t : number of trailing rows of `covid` for `location`; the date of the
        first of those rows is shown in the x-axis label.
    type_of_people : 'S', 'I' or 'R' -- which compartment to plot.
    location : value matched against covid['country'].

    Raises
    ------
    ValueError : if `type_of_people` is not one of 'S', 'I', 'R' (previously an
        empty figure was drawn).
    """
    # compartment code -> (column index, word used in the legend)
    compartments = {'S': (0, 'susceptible'), 'I': (1, 'infected'), 'R': (2, 'recovered')}
    if type_of_people not in compartments:
        raise ValueError("type_of_people must be 'S', 'I' or 'R', got %r" % (type_of_people,))
    column, people = compartments[type_of_people]

    df = covid.loc[covid['country'] == location]
    day = df['date'].tail(t).values[0]
    x = range(1, len(country_data)+1)

    model = Feedforward(is_debug=False)
    model.load_state_dict(torch.load('./'+country+'_SIR_parameter.pth'))
    model.eval()

    # Start from the first observed row and feed each prediction back in.
    state = torch.tensor(country_data[0])
    prediction = [state.tolist()]
    with torch.no_grad():  # inference only: no autograd graph across the rollout
        for _ in range(len(country_data)-1):
            state = model(state)
            prediction.append([value.item() for value in state])
    prediction = np.array(prediction)

    # The predicted curve is labelled with the compartment actually plotted
    # (it used to read "susceptible" for the infected and recovered plots too).
    plt.plot(x, country_data[:,column], label='original '+people+' data for '+country)
    plt.plot(x, prediction[:,column], label='predicted '+people+' data for '+country)

    plt.xlabel("Time starting from date "+day)
    plt.ylabel("Number of cases")
    plt.title("Prediction Comparison for "+country)
    plt.legend()
    plt.show()

* Prediction -- US

In [None]:
# Susceptible, infected and recovered curves for the US, in that order.
for compartment in ('S', 'I', 'R'):
    plot_prediction(country_data_US, 'US', 80, compartment, 'USA')

* Prediction -- UK

In [None]:
# Susceptible, infected and recovered curves for the UK, in that order.
for compartment in ('S', 'I', 'R'):
    plot_prediction(country_data_UK, 'UK', 80, compartment, 'UK')

* Prediction -- HK

In [None]:
# Susceptible, infected and recovered curves for Hong Kong, in that order.
for compartment in ('S', 'I', 'R'):
    plot_prediction(country_data_HK, 'HK', 70, compartment, 'China Hong Kong Sar')

As the SIR model with vaccination ignores many other factors, the model is not completely precise, but it gives a general trend.