In [1]:
# The interactive `%matplotlib notebook` magic that used to be here was removed:
# the `%matplotlib inline` magic further down in this cell overrides it, so only
# the inline backend was ever in effect for the figures of this notebook.
import matplotlib
import seaborn as sb
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Jupyter Specifics
%matplotlib inline
from IPython.display import display, HTML
from ipywidgets.widgets import interact, interactive, IntSlider, FloatSlider, Layout, ToggleButton, ToggleButtons, fixed
# Shared ipywidgets settings (description width and widget layout width).
slider_layout = Layout(width='99%')
style = dict(description_width='100px')
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

from time import time
import pickle as pk

# Cluster data setup

In [2]:
from Cluster import *

In [3]:
# Load the base data object from 'data_all_base'; BaseData comes from the
# star import of the Cluster module. The call logs its progress and elapsed time.
bd = BaseData('data_all_base')

reading in data from data_all_base ...
elapsed:  7.325807094573975


In [4]:
# Build the cluster data object on top of the base data `bd`.
# Per the constructor's log output this constructs the common synchronized
# deaths, case and testing data; the thresholds below select which countries
# are kept (presumably minimum deaths / deaths per million / days — confirm in Cluster.py).
ClData = ClusterData(
    bd,
    clusdtype='JRP1',
    cluster_data=False,
    report_correct=True,
    database='JHU',
    daysync=22,
    thresh=10,
    mindays=200,
    mindeaths=200,
    mindeathspm=0.1,
    syncat='first major peak',
    K=2,
)

Constructing common synchronized deaths, case and testing data...
database JHU report_correct True
mindeaths 200 mindeathspm 0.1
database JHU report correction True
daysync 22 thresh for deaths 10 mindays 200
No of big common countries is 116
---------------------------------
number of countries in total_deaths) 197
number of countries in big 115
synchronizing and trimming time series to common length...
minfirstpeak 21 max possible length 298
making cases with nonlinear testing adjustment...
done.
----------------------------------------
Finished loading Cluster module
----------------------------------------


In [5]:
# Execute ClusterFit.py inside this notebook's namespace so that whatever it
# defines is available to the following cells.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left open until garbage collection.
with open('ClusterFit.py','r') as _clusterfit_src:
    exec(_clusterfit_src.read())

In [6]:
# Sizes of the four country-name lists held by the base data object.
print(
    len(bd.countries_jhu_4_owid),
    len(bd.countries_jhu_2_owid),
    len(bd.countries_owid),
    len(bd.countries_jhu),
)

# Each section prints a label, the data, and a blank separator line.
print('countries in common: owid format', bd.countries_jhu_2_owid, '', sep='\n')
print('owid countries not in common set',
      set(bd.countries_owid)-set(bd.countries_jhu_2_owid),
      '',
      sep='\n')
print('countries in common: jhu format', bd.countries_owid_to_jhu, '', sep='\n')

# Number and names of the countries in ClData.bcountries.
print(len(ClData.bcountries),'bcountries',ClData.bcountries)

190 190 197 192
countries in common: owid format
['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Myanmar', 'Burundi', 'Cape Verde', 'Cambodia', 'Cameroon', 'Central African Republic', 'Chad', 'Chile', 'Colombia', 'Comoros', 'Congo', 'Democratic Republic of Congo', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Vatican', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia',

In [7]:
# Pull the per-dataset collections and the country list out of the cluster data object.
clusdata_all, bcountries = ClData.clusdata_all, ClData.bcountries
# Names of the datasets, i.e. what iterating over clusdata_all yields.
datasets = list(clusdata_all)
datasets

['deaths',
 'cases',
 'cases_lin2020',
 'cases_pwlfit',
 'cases_nonlin',
 'cases_nonlinr']

In [8]:
# One country list per dataset, taken from iterating over each dataset's collection.
d_countries, c_countries, lc_countries, pc_countries, nc_countries = (
    list(clusdata_all[dataset_name])
    for dataset_name in ('deaths', 'cases', 'cases_lin2020', 'cases_pwlfit', 'cases_nonlin')
)

# The deaths dataset provides the reference country list.
countries = d_countries

In [9]:
# Number of countries in the deaths dataset, then their names in sorted order.
print(len(d_countries), np.sort(d_countries), sep='\n')

85
['Afghanistan' 'Africa' 'Albania' 'Algeria' 'Angola' 'Armenia' 'Asia'
 'Australia' 'Austria' 'Bahrain' 'Bangladesh' 'Belarus' 'Belgium'
 'Bosnia and Herzegovina' 'Cameroon' 'Canada' 'Chile' 'Colombia' 'Czechia'
 'Denmark' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Europe'
 'Finland' 'France' 'Germany' 'Ghana' 'Guatemala' 'Haiti' 'Honduras'
 'Hungary' 'Indonesia' 'Iran' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan'
 'Kenya' 'Kosovo' 'Kuwait' 'Latvia' 'Lebanon' 'Lithuania' 'Luxembourg'
 'Madagascar' 'Malaysia' 'Middle_East' 'Moldova' 'Morocco' 'Netherlands'
 'North America' 'North Macedonia' 'Norway' 'Oceania' 'Oman' 'Pakistan'
 'Panama' 'Peru' 'Philippines' 'Portugal' 'Qatar' 'Romania' 'Russia'
 'Saudi Arabia' 'Senegal' 'Serbia' 'Slovenia' 'South Korea' 'Spain'
 'Sudan' 'Sweden' 'Switzerland' 'Syria' 'Turkey' 'Ukraine'
 'United Arab Emirates' 'United Kingdom' 'United States' 'Uzbekistan'
 'Venezuela' 'World' 'Yemen']


In [10]:
# Sanity check: every country list in use should be identical, and the time series
# of all datasets should have the same length (checked on 'United States').
# NOTE(review): an earlier note here recorded "79 countries with Oct 27 finish and with
# mindeaths=100 and mindays=150 and mindeathspm = 0.5"; the ClusterData call in this
# notebook passes different thresholds, so that count is presumably stale — confirm.
countrysets = [d_countries, c_countries, lc_countries, pc_countries, nc_countries]
print(list(map(len, countrysets)))
for ccs1 in countrysets:
    # One row of the pairwise equality matrix between the country lists.
    print([ccs1 == other for other in countrysets])
print([len(clusdata_all[name]['United States']) for name in datasets])
# Legacy check of series lengths and starting dates, left disabled:
# print(len(total_deaths_x['dates']),len(total_cases_x['dates']),len(testing_x['dates']),total_deaths_x['dates'][0],total_cases_x['dates'][0],testing_x['dates'][0])

[85, 85, 85, 85, 85]
[True, True, True, True, True]
[True, True, True, True, True]
[True, True, True, True, True]
[True, True, True, True, True]
[True, True, True, True, True]
[200, 200, 200, 200, 200, 200]


In [11]:
# Inspect the field names of the first entry of bd.covid_owid.
bd.covid_owid[0].keys()

odict_keys(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'stringency_index', 'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'handwashing_fac

In [12]:
# List the keys available in bd.covid_ts.
bd.covid_ts.keys()

dict_keys(['confirmed', 'deaths', 'recovered', 'new_deaths', 'new_deaths_smoothed', 'deaths_smoothed', 'new_deaths_corrected', 'new_deaths_corrected_smoothed', 'deaths_corrected_smoothed', 'new_confirmed', 'new_confirmed_smoothed', 'confirmed_smoothed', 'new_confirmed_corrected', 'new_confirmed_corrected_smoothed', 'confirmed_corrected_smoothed', 'new_confirmed_linr_corrected_smoothed', 'confirmed_linr_corrected_smoothed', 'new_confirmed_nonlin_corrected_smoothed', 'confirmed_nonlin_corrected_smoothed', 'new_confirmed_nonlinr_corrected_smoothed', 'confirmed_nonlinr_corrected_smoothed'])

In [13]:
# List the keys available in bd.covid_owid_ts.
bd.covid_owid_ts.keys()

dict_keys(['confirmed', 'deaths', 'recovered', 'tests', 'stringency', 'population', 'population_density', 'gdp_per_capita', 'new_deaths', 'new_deaths_smoothed', 'deaths_smoothed', 'new_deaths_corrected', 'new_deaths_corrected_smoothed', 'deaths_corrected_smoothed', 'new_confirmed', 'new_confirmed_smoothed', 'confirmed_smoothed', 'new_confirmed_corrected', 'new_confirmed_corrected_smoothed', 'confirmed_corrected_smoothed', 'new_confirmed_linr_corrected_smoothed', 'confirmed_linr_corrected_smoothed', 'new_confirmed_nonlin_corrected_smoothed', 'confirmed_nonlin_corrected_smoothed', 'new_confirmed_nonlinr_corrected_smoothed', 'confirmed_nonlinr_corrected_smoothed'])

# Data save

Execute this section once to produce the file `./pks/data_cluster_<clusdtype>.pk`, where `<clusdtype>` is the value of `ClData.clusdtype`.

In [14]:
# Names of the notebook-level objects to bundle into the saved dictionary.
allnmsc = ['ClData']
# Map each name to the object bound to it in the notebook namespace.
# A direct globals() lookup replaces eval(), which would execute any
# expression that ended up in the list rather than just resolving a name.
data_allc = {nm: globals()[nm] for nm in allnmsc}

In [15]:
# Pickle the bundled objects to ./pks/data_cluster_<clusdtype>.pk and report the time taken.
start = time()
# Use a context manager so the file is flushed and closed before the elapsed
# time is printed; the handle was previously never closed explicitly.
with open('./pks/data_cluster_'+ClData.clusdtype+'.pk','wb') as fp:
    pk.dump(data_allc,fp)
print('elapsed: ',time()-start)

elapsed:  0.34596705436706543


In [16]:
# Attribute names of ClData, leaving out any name that contains a double underscore.
[attr for attr in dir(ClData) if '__' not in attr]

['CaCo',
 'K',
 'bcountries',
 'bcountries_1',
 'big',
 'big_cases',
 'big_testing_c',
 'cases_adj_lin2020',
 'cases_adj_nonlin',
 'cases_adj_nonlinr',
 'cases_adj_pwlfit',
 'cases_raw',
 'clusdata_all',
 'clusdata_len',
 'clusdtype',
 'cluster_data',
 'cluster_data_loaded',
 'countries_common',
 'daily_deaths',
 'database',
 'datasets',
 'daysync',
 'deaths_raw',
 'first_peak',
 'first_thresh',
 'lccountries',
 'lcountries',
 'longshort',
 'longshort_c',
 'longshort_cases',
 'longshort_cases_c',
 'longshort_cases_est',
 'longshort_reg_testing_c',
 'longshort_testing_c',
 'longshortest',
 'make_cases_adj_nonlin',
 'mindays',
 'mindeaths',
 'mindeathspm',
 'minfirstpeak',
 'new_cases_spm',
 'new_deaths_spm',
 'reg_testing_lc',
 'regtests',
 'report_correct',
 'short_cases',
 'short_cases_c',
 'short_cases_est',
 'short_deaths',
 'short_deaths_c',
 'short_deaths_est',
 'short_reg_testing',
 'short_reg_testing_c',
 'short_reg_testing_est',
 'short_testing',
 'short_testing_c',
 'short_tes

# Data Load

Use this code to read in the saved cluster data, e.g. at the top of another notebook, as an alternative to loading `data.py` or `Cluster.py`.

In [17]:
# Read in the cluster data. This cell can be reused in other notebooks, but then
# the rhs of the following line needs to be replaced by the suffix of the cluster
# data name (created when the cluster data was constructed).
clusdtype = ClData.clusdtype
# clusdtype = 'JRP1'  # e.g. for first version

start=time()
print('reading in data...')
# The path is built from `clusdtype` (not ClData.clusdtype) so that overriding the
# assignment above is all that is needed where ClData does not exist yet.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./pks/data_cluster_'+clusdtype+'.pk','rb') as fp:
    foo = pk.load(fp)
print('elapsed: ',time()-start)

# make each element of the dictionary a global variable named with key:
# (direct namespace assignment instead of exec on a string-built statement)
for x in foo:
    globals()[x] = foo[x]

reading in data...
elapsed:  0.04384875297546387
