In [1]:
%matplotlib notebook
import matplotlib
import seaborn as sb
from matplotlib import pyplot as plt
import holoviews as hv

import numpy as np
import pandas as pd
import pickle as pk
from time import time

# Jupyter Specifics
%matplotlib inline
from IPython.display import display, HTML
from ipywidgets.widgets import interact, interactive, IntSlider, FloatSlider, Layout, ToggleButton, ToggleButtons, fixed
display(HTML("<style>.container { width:100% !important; }</style>"))
style = {'description_width': '100px'}
slider_layout = Layout(width='99%')


In [2]:
# Read the pickled data bundle and expose each of its entries as a notebook global.
# SECURITY: pickle.load can execute arbitrary code while unpickling -- only
# load a data_all.pk that you produced yourself or otherwise trust.
start = time()
print('reading in data...')
with open('data_all.pk', 'rb') as fp:
    foo = pk.load(fp)
print('elapsed: ', time() - start)

# Bind every key of the bundle to a global of the same name (later cells use
# names such as clusdata_all directly). Updating globals() does this without
# building and exec-ing assignment source text for each key.
# NOTE: `foo` is rebound to a Consensus object further down the notebook.
globals().update({name: foo[name] for name in foo})

reading in data...
elapsed:  5.15117883682251


In [None]:
# Sanity check: plot the 'deaths' series for Germany.
# clusdata_all is presumably one of the globals unpacked from data_all.pk -- TODO confirm.
plt.plot(clusdata_all['deaths']['Germany']);

In [None]:
# Sanity check: plot the Germany entry of new_cases_c_nonlinr_owid.
# Presumably one of the globals unpacked from data_all.pk -- TODO confirm.
plt.plot(new_cases_c_nonlinr_owid['Germany']);

# Consensus basics

In [None]:
# from Consensus import *
# NB:  scoping problem seems to prevent import from working properly.
# instead use exec below.

In [None]:
# Run Consensus.py in the notebook's global namespace so the names it defines
# (presumably including the Consensus class used below) become notebook globals.
# The file is opened in a `with` block so the handle is closed, and the source
# is compiled with its filename so tracebacks point into Consensus.py.
with open('Consensus.py', 'r') as _src:
    exec(compile(_src.read(), 'Consensus.py', 'exec'))

In [None]:
# Build a Consensus object with its default arguments. The commented-out line
# is the variant restricted to three case types.
# NOTE: this rebinds `foo`, which until now held the object loaded from data_all.pk.
#foo = Consensus(cases=['deaths','cases','cases_nonlinr'])
foo = Consensus()

In [None]:
# Display the global `cases`. It is not assigned anywhere in this notebook's
# visible cells; presumably it comes from data_all.pk or Consensus.py -- TODO confirm.
cases

In [None]:
# import warnings
# warnings.simplefilter('error', RuntimeWarning)   # to replace warnings by errors to allow traceback
# warnings.simplefilter('error',DeprecationWarning)

In [None]:
# Loop over 24 scans: 4 * cases index + scoring method (max-validity or min-score options), numbered 0 to 3:
# 0 validity = max(validity,0.001)
# 1 validitysc = rescale(validity,ncomp)  # rescaled to account for biases in the number of components of the projection
# 2 score1 = 1.0/validitysc + float(nunclustered)/5. + np.abs(float(nclus)-4.)/2.  # penalizes additively
# 3 score2 = float(nunclustered)*(4.+np.abs(nclus-4.))/(validitysc*20.)            # penalizes multiplicatively: all criteria required
foo.scan() # default diag=False: no output except the progress bar

In [None]:
# Plot the probability of each country being in a cluster (blue) and 1 - outlier score (red).
# This plot was extracted as a paper figure in Assembled_Figs.pptx.
foo.plot_outliers()

**The next cell (`foo.make_clusters()`) must be run before any of the clusterings can be graphed.**

In [None]:
# Prerequisite for the clustering plots in the following cells.
foo.make_clusters()

In [None]:
# Plot a single stage, selected by the argument 3.
# NOTE(review): what the stage index refers to is defined in Consensus.py, not visible here.
foo.plot_stage(3)

In [None]:
# The end result here has the countries ordered by the refclustering column (the one with the best score2).
# See swizzle() below to reorder the countries by the consensus clusters instead.
foo.plot_all_stages()

In [None]:
# Run the swizzle with satthresh=0.6, then report how many classes resulted and what they are.
foo.swizzle(satthresh=0.6) # the closer satthresh is to one, the more countries end up unclustered
print(len(foo.classes))
print(foo.classes)

In [None]:
# Keep the classes from the swizzle above for later comparison with classes2.
# NOTE(review): this stores a reference, not a copy -- confirm a later swizzle() does not mutate it in place.
classes1 = foo.classes

In [None]:
# Plot the result of the swizzle run above.
foo.plot_swiz()


In [None]:
# Build the map for the current state of foo; it is displayed via foo.map in the next cell.
foo.make_map()

In [None]:
# Display the map object (presumably populated by foo.make_map() -- TODO confirm).
foo.map

## Consensus restricted to selected columns

In [None]:
# Indices of the report entries that contain one of the three case-type tags.
mycols = [
    i
    for i, _ in enumerate(foo.report)
    if any(tag in foo.report[i] for tag in ('deaths,', 'cases,', 'cases_nonlinr,'))
]

In [None]:
# Display the selected column indices.
mycols

In [None]:
# Shape of the full coldata array, for comparison with the restricted selection built next.
foo.coldata.shape

In [None]:
# Gather the coldata slice for every selected column index (second axis) into
# one array, with the selection as the new leading axis, and show its shape.
fofo = np.array([foo.coldata[:, col, :] for col in mycols])
fofo.shape

In [None]:
# Redo the swizzle using only the selected columns.
foo.swizzle(cols=mycols)

In [None]:
# Keep the classes from the restricted-column swizzle (counterpart of classes1).
classes2 = foo.classes

In [None]:
# Plot the result of the restricted-column swizzle.
foo.plot_swiz()

In [None]:
# Rebuild the map after the restricted-column swizzle; displayed via foo.map in the next cell.
foo.make_map()

In [None]:
# Display the map object (presumably populated by foo.make_map() -- TODO confirm).
foo.map

# Sankey diagram for comparing clusterings

In [None]:
# Start from a fresh default Consensus object and rerun the scan for the Sankey comparison.
foo = Consensus()
foo.scan()

In [None]:
# Left-hand side of the comparison: swdic and classes of the default consensus.
# NOTE(review): no swizzle() call is visible between scan() and this read in
# this section -- confirm that scan() already populates swdic/classes.
dic1 = foo.swdic
classes1 = foo.classes

In [None]:
# Right-hand side of the comparison: pick the report entries that contain one
# of the three case-type tags, swizzle on just those columns, and keep the
# resulting swdic/classes.
mycols = [
    i
    for i, _ in enumerate(foo.report)
    if any(tag in foo.report[i] for tag in ('deaths,', 'cases,', 'cases_nonlinr,'))
]
print(mycols)
foo.swizzle(cols=mycols)
dic2, classes2 = foo.swdic, foo.classes

In [None]:
# Load the bokeh plotting extension for HoloViews; the Sankey cells below render through it.
hv.extension('bokeh')


In [None]:
# Cross-tabulate the two clusterings for the Sankey diagram: one row per
# (dic1 cluster, dic2 cluster) pair that shares at least one member.
# The 'a'/'b' prefixes keep the node labels of the two sides distinct even when
# the cluster keys coincide; 'val' is the number of shared members.
# (To show fractions of each dic1 cluster instead of counts, divide olap by len(s1).)
# The rows are collected first and the frame is built once. The earlier write
# to `adic` is gone: that name is not defined anywhere in this notebook and its
# contents were never read.
overlaps = []
for k in dic1:
    s1 = set(dic1[k])
    for kk in dic2:
        olap = len(s1.intersection(dic2[kk]))
        if olap > 0:
            overlaps.append(('a' + str(k), 'b' + str(kk), olap))
df = pd.DataFrame(overlaps, columns=['c1', 'c2', 'val'])


In [None]:
# Display the overlap table (columns c1, c2, val) that feeds the Sankey diagram.
df

In [None]:
%%opts Sankey (cmap='Category10' edge_color='c1' edge_line_width=0 node_alpha=1.0)
# Sankey diagram of the overlap table: nodes from columns c1/c2, edge weight from val.
# NOTE(review): edge_color is given twice ('c1' in the %%opts line, 'c2' as a
# constructor keyword) -- confirm which one is intended and takes effect.
hv.Sankey(df,kdims=['c1','c2'],vdims=['val'],edge_color='c2')
                    

## compare with consensus built from fewer cases

In [None]:
# Reference clustering for the comparisons below: default Consensus, scanned.
foo = Consensus()
foo.scan()

# NOTE(review): swdic/classes are read right after scan() with no swizzle()
# call in this cell -- confirm that scan() already populates them.
dic1 = foo.swdic
classes1 = foo.classes

In [None]:
# Re-run Consensus.py in the notebook's global namespace (e.g. to pick up edits
# to that file). The file is opened in a `with` block so the handle is closed,
# and the source is compiled with its filename so tracebacks point into Consensus.py.
with open('Consensus.py', 'r') as _src:
    exec(compile(_src.read(), 'Consensus.py', 'exec'))

### compare with `cases=['deaths','cases','cases_nonlinr'],satthresh = 0.5`

In [None]:
# Comparison clustering: three case types, satthresh = 0.5.
fooo = Consensus(cases=['deaths','cases','cases_nonlinr'],satthresh = 0.5)
fooo.scan()

# Right-hand side of the Sankey comparison.
dic2= fooo.swdic

In [None]:
# Cross-tabulate the two clusterings for the Sankey diagram: one row per
# (dic1 cluster, dic2 cluster) pair that shares at least one member.
# The 'a'/'b' prefixes keep the node labels of the two sides distinct even when
# the cluster keys coincide; 'val' is the number of shared members.
# (To show fractions of each dic1 cluster instead of counts, divide olap by len(s1).)
# The rows are collected first and the frame is built once. The earlier write
# to `adic` is gone: that name is not defined anywhere in this notebook and its
# contents were never read.
overlaps = []
for k in dic1:
    s1 = set(dic1[k])
    for kk in dic2:
        olap = len(s1.intersection(dic2[kk]))
        if olap > 0:
            overlaps.append(('a' + str(k), 'b' + str(kk), olap))
df = pd.DataFrame(overlaps, columns=['c1', 'c2', 'val'])


In [None]:
%%opts Sankey (cmap='Category10' edge_color='c1' edge_line_width=0 node_alpha=1.0)
# Sankey diagram of the overlap table: nodes from columns c1/c2, edge weight from val.
# NOTE(review): edge_color is given twice ('c1' in the %%opts line, 'c2' as a
# constructor keyword) -- confirm which one is intended and takes effect.
hv.Sankey(df,kdims=['c1','c2'],vdims=['val'],edge_color='c2')

### compare with `cases=['deaths','cases_nonlinr']`

In [None]:
# Comparison clustering: two case types ('deaths', 'cases_nonlinr'), other arguments left at their defaults.
fooo = Consensus(cases=['deaths','cases_nonlinr'])
fooo.scan()

# Right-hand side of the Sankey comparison.
dic2= fooo.swdic

In [None]:
# Cross-tabulate the two clusterings for the Sankey diagram: one row per
# (dic1 cluster, dic2 cluster) pair that shares at least one member.
# The 'a'/'b' prefixes keep the node labels of the two sides distinct even when
# the cluster keys coincide; 'val' is the number of shared members.
# (To show fractions of each dic1 cluster instead of counts, divide olap by len(s1).)
# The rows are collected first and the frame is built once. The earlier write
# to `adic` is gone: that name is not defined anywhere in this notebook and its
# contents were never read.
overlaps = []
for k in dic1:
    s1 = set(dic1[k])
    for kk in dic2:
        olap = len(s1.intersection(dic2[kk]))
        if olap > 0:
            overlaps.append(('a' + str(k), 'b' + str(kk), olap))
df = pd.DataFrame(overlaps, columns=['c1', 'c2', 'val'])


In [None]:
%%opts Sankey (cmap='Category10' edge_color='c1' edge_line_width=0 node_alpha=1.0)
# Sankey diagram of the overlap table: nodes from columns c1/c2, edge weight from val.
# NOTE(review): edge_color is given twice ('c1' in the %%opts line, 'c2' as a
# constructor keyword) -- confirm which one is intended and takes effect.
hv.Sankey(df,kdims=['c1','c2'],vdims=['val'],edge_color='c2')

### compare with `cases=['deaths','cases_nonlinr'],satthresh = 0.5`

In [None]:
# Comparison clustering: two case types ('deaths', 'cases_nonlinr') with satthresh = 0.5.
fooo = Consensus(cases=['deaths','cases_nonlinr'],satthresh = 0.5)
fooo.scan()

# Right-hand side of the Sankey comparison.
dic2= fooo.swdic

In [None]:
# Cross-tabulate the two clusterings for the Sankey diagram: one row per
# (dic1 cluster, dic2 cluster) pair that shares at least one member.
# The 'a'/'b' prefixes keep the node labels of the two sides distinct even when
# the cluster keys coincide; 'val' is the number of shared members.
# (To show fractions of each dic1 cluster instead of counts, divide olap by len(s1).)
# The rows are collected first and the frame is built once. The earlier write
# to `adic` is gone: that name is not defined anywhere in this notebook and its
# contents were never read.
overlaps = []
for k in dic1:
    s1 = set(dic1[k])
    for kk in dic2:
        olap = len(s1.intersection(dic2[kk]))
        if olap > 0:
            overlaps.append(('a' + str(k), 'b' + str(kk), olap))
df = pd.DataFrame(overlaps, columns=['c1', 'c2', 'val'])


In [None]:
%%opts Sankey (cmap='Category10' edge_color='c1' edge_line_width=0 node_alpha=1.0)
# Sankey diagram of the overlap table: nodes from columns c1/c2, edge weight from val.
# NOTE(review): edge_color is given twice ('c1' in the %%opts line, 'c2' as a
# constructor keyword) -- confirm which one is intended and takes effect.
hv.Sankey(df,kdims=['c1','c2'],vdims=['val'],edge_color='c2')

In [None]:
# Build the map for the last comparison clustering (fooo); displayed via fooo.map in the next cell.
fooo.make_map()

In [None]:
# Display the map object (presumably populated by fooo.make_map() -- TODO confirm).
fooo.map