# Load the final set from the pipeline run with all of the data 

In [None]:
# Reload the Kedro project into this IPython session (Kedro line magic).
# NOTE(review): `catalog` is not defined anywhere in the visible notebook cells;
# presumably it is injected by the Kedro IPython extension — confirm.
%reload_kedro
# Print the dataset names registered in the catalog so the load below can be checked.
print(catalog.list())

In [56]:
# pandas is used in this cell for the datetime conversion, but the notebook's
# main import cell comes later; import it here so a fresh-kernel
# "Restart & Run All" does not fail with a NameError on `pd`.
import pandas as pd

# Load the merged dataframe used for feature analysis from the Kedro catalog.
combined_data = catalog.load("merge_who_ears_owid_data")
# Convert the date column to a standard pandas datetime (values are parsed as 'YYYY-MM-DD').
combined_data['date'] = pd.to_datetime(combined_data['date'], format='%Y-%m-%d')

2023-01-11 06:22:07,254 - INFO     - Loading data from 'merge_who_ears_owid_data' (ParquetDataSet)...


In [None]:
# install pycaret as a way to do some quick analysis of all regression model types to know which performs the best 
!pip install --pre pycaret
!pip install xgboost
# !pip install --force-reinstall scikit-learn=='0.23.2' 
!pip install folium
!pip install dtale
# For mac you will have to do a conda install for lightgbm
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge lightgbm

In [None]:
# NOTE(review): wildcard import — every public name of pycaret.regression lands
# in the notebook namespace. The `setup` and `compare_models` calls further down
# presumably come from here; consider importing them explicitly.
from pycaret.regression import *

# Core data libraries
import pandas as pd
import numpy as np
# Visualisation libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium 
from folium import plugins

# Default matplotlib figure size, given as (width, height)
plt.rcParams['figure.figsize'] = 10, 12

# Suppress all Python warnings from here on
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Preview the first rows of the merged dataframe.
combined_data.head()

In [None]:
# Show the dtype of every column in the merged dataframe.
combined_data.dtypes

# Select only the features we want in order to run the model analysis

In [57]:
# Build the feature set for the first experiment: the date, the
# mis/disinformation and myths signals, and the `total_cases` target.
# Rows with any missing value are dropped.
# (weekly_hosp_admissions and new_cases_smoothed are also good indicators.)
features = (
    combined_data[[
        'date',
        'mis_and_disinformation',
        'mis_and_disinformation_complaints',
        'mis_and_disinformation_delta',
        'mis_and_disinformation_male',
        'mis_and_disinformation_percent',
        'mis_and_disinformation_questions',
        'mis_and_disinformation_female',
        'myths',
        'myths_complaints',
        'myths_delta',
        'myths_female',
        'myths_male',
        'myths_percent',
        'myths_questions',
        'total_cases',
    ]]
    .dropna()
)


In [None]:
# Call head() so the first rows are rendered; the bare attribute `features.head`
# only displays the bound method instead of a preview.
features.head()

In [None]:
# Let's examine the feature set interactively with D-Tale.
import dtale
# Pass the cleaned feature frame to D-Tale for exploration.
dtale.show(features)

In [58]:
# Set up the first PyCaret session for evaluation.
# `total_cases` is the target because it is a complete series and correlates
# strongly (Pearson) with the other features.
session_1 = setup(
    features,
    target='total_cases',
    experiment_name='total_cases_1',
    log_experiment=False,
    session_id=12,
)


Unnamed: 0,Description,Value
0,Session id,12
1,Target,new_cases_smoothed
2,Target type,Regression
3,Data shape,"(22545, 16)"
4,Train data shape,"(15781, 16)"
5,Test data shape,"(6764, 16)"
6,Numeric features,14
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


In [59]:
# Compare all available regression models for the session set up above.
# `best_model` holds whatever compare_models() returns — presumably the
# top-ranked model under PyCaret's default sort; TODO confirm.
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,6442.2119,289787743.3724,16859.9494,0.8201,2.5361,60.645,0.583
xgboost,Extreme Gradient Boosting,6702.8548,321091193.6,17744.2508,0.7997,2.6481,59.5146,0.327
lightgbm,Light Gradient Boosting Machine,6814.2611,331730968.4836,18033.9029,0.79,2.7153,63.0598,0.058
rf,Random Forest Regressor,6661.0252,337743267.2244,18196.6859,0.7881,2.5546,76.6809,1.099
gbr,Gradient Boosting Regressor,7862.7086,425869499.9656,20481.8796,0.7301,2.8477,80.7141,0.331
dt,Decision Tree Regressor,8398.3597,732576697.8005,26765.3166,0.5297,2.7095,53.4979,0.038
llar,Lasso Least Angle Regression,11350.6766,1033089299.5917,31833.3944,0.3663,3.1554,148.2037,0.022
knn,K Neighbors Regressor,11176.8003,1027420544.0,31857.0607,0.3595,2.8687,102.1863,0.029
br,Bayesian Ridge,11441.495,1090207973.7871,32655.3931,0.3187,3.1188,139.4636,0.023
en,Elastic Net,11456.0308,1090539260.7324,32659.7951,0.3184,3.1301,142.6598,0.086


# We need to take a closer look at what's happening in the vaccination data, as it's something we might want to consider


# Experiment 2 - Vaccinations as target and overall analysis

In [46]:
# Identifying columns plus the vaccination-related columns to inspect.
columns = ['iso_code', 'continent', 'location', 'date', 'total_vaccinations', 
           'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
           'new_vaccinations', 'new_vaccinations_smoothed',
           'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
           'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
           'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed',
           'new_people_vaccinated_smoothed_per_hundred']

# Restrict the merged dataframe to those columns.
vaccinations_owid = combined_data[columns]

# Call head() so the first rows are rendered; the bare attribute `head`
# only displays the bound method instead of a preview.
vaccinations_owid.head()


In [47]:
# Let's examine the vaccination columns interactively with D-Tale.
import dtale
# Rows with missing values are deliberately kept here (dropna left disabled):
# vaccinations_owid = vaccinations_owid.dropna()
dtale.show(vaccinations_owid)


# We observe that new_vaccinations_smoothed is the most complete

In [63]:
# Build the feature set for the second experiment: the date, the
# mis/disinformation and myths signals, and the `new_vaccinations` target.
# Rows with any missing value are dropped.
# NOTE: this rebinds `features`, replacing the frame used in the first experiment.
features = (
    combined_data[[
        'date',
        'mis_and_disinformation',
        'mis_and_disinformation_complaints',
        'mis_and_disinformation_delta',
        'mis_and_disinformation_male',
        'mis_and_disinformation_percent',
        'mis_and_disinformation_questions',
        'mis_and_disinformation_female',
        'myths',
        'myths_complaints',
        'myths_delta',
        'myths_female',
        'myths_male',
        'myths_percent',
        'myths_questions',
        'new_vaccinations',
    ]]
    .dropna()
)

In [64]:
# Set up the second PyCaret session, this time with `new_vaccinations` as the target.
# NOTE(review): the markdown note before this experiment's feature cell calls
# `new_vaccinations_smoothed` the most complete column, yet the target here is
# `new_vaccinations` — confirm this is intended.
session_2 = setup(
    features,
    target='new_vaccinations',
    experiment_name='new_vaccinations_1',
    log_experiment=False,
    session_id=12,
)

Unnamed: 0,Description,Value
0,Session id,12
1,Target,new_vaccinations
2,Target type,Regression
3,Data shape,"(10864, 16)"
4,Train data shape,"(7604, 16)"
5,Test data shape,"(3260, 16)"
6,Numeric features,14
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


In [65]:
# Compare all available regression models for the vaccination session.
# NOTE: this rebinds `best_model`, so the result of the earlier
# compare_models() call for the total_cases experiment is no longer reachable
# under that name.
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,312778.9502,634686208839.0763,787448.5541,0.5055,2.3689,171.3446,0.264
lightgbm,Light Gradient Boosting Machine,314626.1028,673177453336.021,810815.8705,0.4766,2.4243,215.9423,0.044
rf,Random Forest Regressor,315939.999,699265132108.9382,826574.3363,0.4556,2.3248,233.5203,0.527
gbr,Gradient Boosting Regressor,350312.1107,725246067156.1937,842998.0186,0.4318,2.5874,304.7876,0.18
xgboost,Extreme Gradient Boosting,336903.6344,733380362240.0,846175.65,0.4285,2.4309,170.773,0.174
llar,Lasso Least Angle Regression,476056.3925,1198798258823.941,1086888.4365,0.0543,3.241,1213.3451,0.017
lr,Linear Regression,476133.4986,1199128533454.1484,1087041.3653,0.054,3.2411,1213.0752,0.367
lasso,Lasso Regression,476133.6072,1199129159837.4006,1087041.6379,0.054,3.2411,1213.0755,0.149
ridge,Ridge Regression,476133.4439,1199128604467.48,1087041.4072,0.054,3.2411,1213.0661,0.127
en,Elastic Net,476049.0205,1199620900722.998,1087283.7322,0.0536,3.242,1187.4427,0.029


Executing shutdown due to inactivity...


2023-01-11 07:01:20,069 - INFO     - Executing shutdown due to inactivity...


Executing shutdown...


2023-01-11 07:01:20,151 - INFO     - Executing shutdown...


Exception on /shutdown [GET]
Traceback (most recent call last):
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 2073, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 1519, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 1517, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 1503, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/dtale/app.py", line 435, in shutdown
    shutdown_server()
  File "/usr/local/anaconda3/envs/who-ears-social

2023-01-11 07:01:20,153 - ERROR    - Exception on /shutdown [GET]
Traceback (most recent call last):
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 2073, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 1519, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 1517, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/flask/app.py", line 1503, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "/usr/local/anaconda3/envs/who-ears-social-listening/lib/python3.10/site-packages/dtale/app.py", line 435, in shutdown
    shutdown_server()
  File "/usr