Since our vaccination data is somewhat incomplete, let's see if we can boost the model with some synthetic data.

In [None]:
from pycaret.regression import *

# importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import folium 
from folium import plugins

# Silence every Python warning for the rest of the session so warning
# messages do not clutter the cell output. Note that this also hides
# warnings that could point at a real problem.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset registered as "merge_who_ears_owid_data" from the catalog
# and parse its `date` column (expected as YYYY-MM-DD) into real datetimes.
combined_data = catalog.load("merge_who_ears_owid_data")
combined_data["date"] = pd.to_datetime(combined_data["date"], format="%Y-%m-%d")


In [None]:
# Keep the date, the misinformation / myth signals and the regression target,
# then discard every row with a missing value in any of those columns.
features = combined_data[
    [
        "date",
        "mis_and_disinformation",
        "mis_and_disinformation_complaints",
        "mis_and_disinformation_delta",
        "mis_and_disinformation_male",
        "mis_and_disinformation_percent",
        "mis_and_disinformation_questions",
        "mis_and_disinformation_female",
        "myths",
        "myths_complaints",
        "myths_delta",
        "myths_female",
        "myths_male",
        "myths_percent",
        "myths_questions",
        "new_vaccinations",
    ]
].dropna()

In [None]:
# Baseline PyCaret regression experiment on the real data only: predict
# `new_vaccinations` from the remaining columns of `features`.
# session_id fixes the random seed; experiment logging is switched off.
session_2 = setup(
    features,
    target="new_vaccinations",
    session_id=12,
    log_experiment=False,
    experiment_name="new_vaccinations_1",
)

In [None]:
# Train and compare the regression models available in PyCaret for the
# current setup() session and keep the one it returns as best.
best_model = compare_models()

And now for some synthetic data creation with DGAN (DoppelGANger), following the gretel-synthetics example notebook: https://github.com/gretelai/gretel-synthetics/blob/master/examples/timeseries_dgan.ipynb

In [None]:
#Uncomment to install ydata-synthetic lib
!pip install gretel-synthetics --upgrade
!pip install torch numpy pandas matplotlib scikit-learn

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as md

import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType


In [None]:
# Overlay the two headline signals and the target on a shared date axis.
for column in ("mis_and_disinformation", "myths", "new_vaccinations"):
    plt.plot(features["date"], features[column], label=column)
plt.xlabel("Date")
plt.ylabel("Feature Values")
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
print(features.shape)
# DGAN trains on a 3-D array laid out as (# examples, # time points, # features).
# The date column is dropped and every remaining row becomes its own example
# holding a single time point.
features = features.drop(columns="date").to_numpy()
features = features.reshape(-1, 1, features.shape[1])
print(features.shape)

In [None]:
# Display the current shape of `features`.
features.shape

In [None]:

# Configure DGAN. The sequence length is read from the array itself, the
# batch size is capped at 1000, and the generator and discriminator use the
# same 1e-4 learning rate.
dgan_config = DGANConfig(
    max_sequence_len=features.shape[1],
    sample_len=1,
    batch_size=min(1000, features.shape[0]),
    apply_feature_scaling=True,
    apply_example_scaling=False,
    use_attribute_discriminator=False,
    generator_learning_rate=1e-4,
    discriminator_learning_rate=1e-4,
    epochs=10000,
)
model = DGAN(dgan_config)

# Fit on the real data, declaring every feature column as continuous.
continuous_types = [OutputType.CONTINUOUS] * features.shape[2]
model.train_numpy(features, feature_types=continuous_types)

# Draw 20000 synthetic examples; only the second element of the returned
# pair is kept.
_, synthetic_features = model.generate_numpy(20000)


In [None]:
# Draw a fresh batch of 20000 synthetic examples from the trained model,
# replacing any previous `synthetic_features`; the first returned value is
# not used.
_, synthetic_features = model.generate_numpy(20000)



In [None]:
# Overlay normalised histograms of feature index 3 for the real and the
# synthetic data to see how closely the two distributions agree.
real_values = features[:, :, 3].flatten()
synthetic_values = synthetic_features[:, :, 3].flatten()

plt.hist(
    [real_values, synthetic_values],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Compare the distribution of 1-step changes along the time axis (axis 1)
# for feature index 3, real vs. synthetic.
real_diffs = np.diff(features, axis=1)
synthetic_diffs = np.diff(synthetic_features, axis=1)

if real_diffs.shape[1] == 0 or synthetic_diffs.shape[1] == 0:
    # np.diff along an axis of length 1 returns an empty array, so when every
    # example holds a single time point there is nothing to plot.
    print("Skipping diff histogram: examples contain a single time point.")
else:
    plt.hist([real_diffs[:,:,3].flatten(), synthetic_diffs[:,:,3].flatten()],
             label=["real", "synthetic"],
             bins=25,
             density=True)
    plt.legend()
    plt.xlabel("value change")
    plt.ylabel("Density")
    plt.show()


In [None]:
# Save the trained DGAN model under the path "synthetic_model".
model.save("synthetic_model")


In [None]:
# Flatten the (examples, time points, features) array into a 2-D table with
# one column per feature. Reshaping on the last axis, instead of a bare
# np.squeeze, keeps the result 2-D even when another axis happens to have
# length 1 (for example a single generated example).
df = pd.DataFrame(synthetic_features.reshape(-1, synthetic_features.shape[-1]))



In [None]:
# Write the synthetic table to sample.csv (the row index is written too).
df.to_csv('sample.csv')

In [None]:
# `features` is a 3-D NumPy array at this point (the DataFrame was converted
# for DGAN training), so it has no `.to_csv`. Flatten it to one row per
# observation and wrap it in a DataFrame before writing.
pd.DataFrame(features.reshape(-1, features.shape[-1])).to_csv('original_set.csv')

In [None]:
import datetime

# Number the synthetic rows 0..N-1. The length is taken from the frame rather
# than hard-coded, so this keeps working if the number of samples changes.
n_rows = len(df)
df['numbers'] = range(n_rows)

# Give every synthetic row its own calendar day: a daily range that starts
# today and contains exactly one date per row.
today = datetime.datetime.today()
date_list = pd.date_range(today, periods=n_rows)

# Add date column to the dataframe
df = df.assign(date=date_list)

In [None]:
# Open the synthetic frame in D-Tale for interactive inspection.
# NOTE(review): `dtale` is not imported anywhere in the visible notebook —
# confirm `import dtale` has run before this cell is executed.
dtale.show(df)

In [None]:
# Show the current column labels of the synthetic frame.
df.columns

In [None]:
# Names for the synthetic feature columns, in the same order as the real
# feature columns that were fed to DGAN (i.e. without `date`).
col_names = [
    "mis_and_disinformation",
    "mis_and_disinformation_complaints",
    "mis_and_disinformation_delta",
    "mis_and_disinformation_male",
    "mis_and_disinformation_percent",
    "mis_and_disinformation_questions",
    "mis_and_disinformation_female",
    "myths",
    "myths_complaints",
    "myths_delta",
    "myths_female",
    "myths_male",
    "myths_percent",
    "myths_questions",
    "new_vaccinations",
]

# Pair the existing column labels with the names above, position by position,
# and relabel. zip() stops at the shorter input, so any columns beyond the
# named ones keep their current labels.
renames = {old: new for old, new in zip(df.columns, col_names)}
df = df.rename(columns=renames)

In [None]:
# Remove the helper row counter. DataFrame.drop returns a new frame, so the
# result must be assigned back; errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=['numbers'], errors='ignore')
df

In [None]:
# Stack the real and the synthetic observations into one training table.
# `features` was turned into a NumPy array for DGAN training, so the real
# rows are rebuilt from `combined_data` with the same column selection and
# dropna() used earlier. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
real_df = combined_data[['date', *col_names]].dropna()
synthetic_df = df.drop(columns=['numbers'], errors='ignore')
result_df = pd.concat([real_df, synthetic_df], ignore_index=True)

In [None]:
# Second PyCaret regression experiment: same target (`new_vaccinations`) and
# settings as before, now on `result_df` (real rows plus synthetic rows).
session_2 = setup(
    result_df,
    target="new_vaccinations",
    session_id=12,
    log_experiment=False,
    experiment_name="new_vaccinations_1",
)

In [None]:
# Compare the regression models again for the current setup() session and
# keep the one returned as best.
best_model = compare_models()