NOTE: This notebook is only intended to run with the locally built Kedro kernel. To run it in external environments such as Google Colab or Databricks, please refer to the other notebooks in this folder whose names are prefixed with "External_".

# Load the final set from the pipeline run with all of the data 

In [None]:
# Kedro IPython line magic; only available when running under the Kedro kernel.
# Presumably it (re)creates the `catalog` variable used on the next line — confirm against the Kedro docs.
%reload_kedro
# Print whatever catalog.list() returns so the dataset to load can be looked up by name.
print(catalog.list())

In [None]:
import pandas as pd

# Fetch the "merge_who_ears_owid_data" entry from the Kedro catalog; this frame
# is the starting point for the feature analysis in the rest of the notebook.
combined_data = catalog.load("merge_who_ears_owid_data")

# Normalise the 'date' column to pandas datetimes, parsing it with an explicit
# year-month-day layout rather than letting pandas guess the format.
iso_day_format = '%Y-%m-%d'
combined_data['date'] = pd.to_datetime(combined_data['date'], format=iso_day_format)

In [None]:
# install pycaret as a way to do some quick analysis of all regression model types to know which performs the best 
!pip install --pre pycaret
!pip install xgboost
!pip install --force-reinstall scikit-learn=='0.23.2' 
!pip install folium
!pip install dtale
# For mac you will have to do a conda install for lightgbm
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge lightgbm
!pip install -U numpy


In [None]:
# NOTE(review): wildcard import. `setup` and `compare_models`, called further
# down, are not imported anywhere else, so they presumably come from here.
# The explicit imports below run afterwards and therefore rebind any name that
# clashes with something the wildcard brought in — keep this line first.
from pycaret.regression import *

# Core data libraries
import pandas as pd
import numpy as np
# Visualisation libraries
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium 
from folium import plugins

# Default matplotlib figure size for every plot in this notebook: (width, height) = (10, 12).
plt.rcParams['figure.figsize'] = 10, 12

# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the modelling libraries.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Preview the first rows of the loaded dataset as a quick sanity check.
combined_data.head()

In [None]:
# Show the dtype of every column (e.g. to confirm that 'date' is now a datetime column).
combined_data.dtypes

# Select only the features we want in order to run the model analysis

In [None]:
# Experiment 1 feature set: the date, the overall / male / female columns for
# the "mis_and_disinformation" and "myths" topics, and the regression target
# (new_vaccinations_smoothed).
experiment_1_columns = [
    'date',
    'mis_and_disinformation', 'mis_and_disinformation_male', 'mis_and_disinformation_female',
    'myths', 'myths_female', 'myths_male',
    'new_vaccinations_smoothed',
]

# Each eval() returns a new frame with one extra column that sums the "myths"
# and "mis_and_disinformation" values (overall, male, female). Rows that still
# contain a missing value are dropped at the end.
# weekly_hosp_admissions and new_cases_smoothed were also noted as good
# indicators, but they are not part of this feature set.
features = (
    combined_data[experiment_1_columns]
    .eval("myths_and_misinfo = myths + mis_and_disinformation")
    .eval("myths_and_misinfo_male = myths_male + mis_and_disinformation_male")
    .eval("myths_and_misinfo_female = myths_female + mis_and_disinformation_female")
    .dropna()
)


In [None]:
# Preview the first rows of the experiment-1 feature set.
# `head` has to be called: the bare attribute `features.head` evaluates to the
# bound method itself, so the cell would not show the intended preview.
features.head()

In [None]:
# Open the experiment-1 feature set in D-Tale for interactive inspection.
# NOTE(review): dtale is imported here rather than in the notebook's imports cell.
import dtale
# Left as the cell's last expression so the notebook displays what dtale.show() returns.
dtale.show(features)

In [None]:
# Experiment 1: configure the PyCaret regression session on the feature set.
# The regression target is `new_vaccinations_smoothed`.
# NOTE(review): an earlier comment in this cell said the target was "total
# cases"; that does not match the code and has been dropped.
experiment_1_options = dict(
    target='new_vaccinations_smoothed',
    session_id=12,            # fixed id so the run can be repeated
    log_experiment=False,     # no experiment logging for this run
    experiment_name='new_vaccinations_smoothed_1',
)
session_1 = setup(features, **experiment_1_options)


In [None]:
# Compare all the available regression models for the session configured above
# and keep what compare_models() returns in `best_model`.
# NOTE(review): the experiment-2 comparison later in the notebook assigns to the
# same name, so this value is overwritten once that cell runs.
best_model = compare_models()

# We need to take a closer look at what's happening in the vaccination data, as it's something we might want to consider


# Experiment 2 - Vaccinations as target and overall analysis

In [None]:
# Identifying columns (iso_code, continent, location, date) plus every
# vaccination-related column, selected so they can be inspected side by side.
columns = ['iso_code', 'continent', 'location', 'date', 'total_vaccinations',
           'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
           'new_vaccinations', 'new_vaccinations_smoothed',
           'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
           'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
           'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed',
           'new_people_vaccinated_smoothed_per_hundred']

vaccinations_owid = combined_data[columns]

# Preview the first rows. `head` has to be called: the bare attribute
# `vaccinations_owid.head` evaluates to the bound method, not a preview.
vaccinations_owid.head()


In [None]:
# Open the vaccination columns in D-Tale to inspect them interactively.
import dtale
# NOTE(review): the dropna() below was left disabled — presumably so that missing
# values stay visible when judging how complete each column is; confirm before re-enabling.
# vaccinations_owid = vaccinations_owid.dropna()
dtale.show(vaccinations_owid)


# We observe that new_vaccinations_smoothed is the most complete

In [None]:
# Experiment 2 feature set: the date, every "mis_and_disinformation" and
# "myths" column, and the regression target (new_vaccinations_smoothed).
experiment_2_columns = [
    'date',
    'mis_and_disinformation', 'mis_and_disinformation_complaints',
    'mis_and_disinformation_delta', 'mis_and_disinformation_male',
    'mis_and_disinformation_percent', 'mis_and_disinformation_questions',
    'mis_and_disinformation_female',
    'myths', 'myths_complaints', 'myths_delta', 'myths_female',
    'myths_male', 'myths_percent', 'myths_questions',
    'new_vaccinations_smoothed',
]

# Rebinds `features` (replacing the experiment-1 frame) and drops every row
# that has a missing value in any selected column.
features = combined_data[experiment_2_columns].dropna()

In [None]:
# Experiment 2: configure a second PyCaret regression session on the wider
# feature set, again with `new_vaccinations_smoothed` as the target.
# The experiment name now ends in "_2": the original cell reused experiment 1's
# name ('new_vaccinations_smoothed_1'), a copy-paste slip that would make the
# two experiments indistinguishable if experiment logging were switched on.
# NOTE(review): an earlier comment in this cell said the target was "total
# cases"; that does not match the code and has been dropped.
session_2 = setup(features, target = 'new_vaccinations_smoothed',
                  session_id=13,
                  log_experiment=False,
                  experiment_name='new_vaccinations_smoothed_2')

In [None]:
# Compare all the available regression models for the experiment-2 session.
# NOTE(review): this reassigns `best_model`, replacing the value stored by the
# experiment-1 comparison earlier in the notebook.
best_model = compare_models()