# Tables

This notebook creates all tables included in the publication

In [2]:
import pickle
import sys
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import pandas as pd
import datetime
import matplotlib.ticker as ticker
sys.path.append("../../")
sys.path.append("../../covid19_inference")
sys.path.append("../")

import covid19_soccer
from covid19_soccer.plot.utils import get_from_trace
import covid19_inference as cov19
from header_plotting import *

In [3]:
%load_ext autoreload
%autoreload 2

In [17]:
def load(fstr):
    with open(fstr, "rb") as f:
         return pickle.load(f)

countries = ["England","Czechia","Italy","Scotland","Spain","Germany","France","Slovakia","Austria","Belgium","Portugal","Netherlands"]
traces, models, dls = [], [], []
for country in tqdm(countries):
    #'UEFA-beta=False-country=England-offset_games=0-draw_delay=True-weighted_alpha_prior=0-prior_delay=-1-width_delay_prior=0.1-sigma_incubation=-1.0-median_width_delay=1.0-tune=200-draws=300-max_treedepth=10.pickled'
    model = None
    fstr=lambda tune, draws, max_treedepth: (f"/data.nst/smohr/covid19_soccer_data/main_traces/"+
        f"-beta=False"+
        f"-country={country}"+
        f"-offset_data=0"+
        f"-prior_delay=-1"+
        f"-width_delay_prior=0.1"+
        f"-sigma_incubation=-1.0"+
        f"-median_width_delay=1.0"+
        f"-interval_cps=10.0"+
        f"-f_fem=0.2"+
        f"-uc=True"
        f"-len=normal"+                    
        f"-t={tune}"+
        f"-d={draws}"+
        f"-max_treedepth={max_treedepth}.pkl")
    
    dl = covid19_soccer.dataloader.Dataloader_gender(countries=[country])
    dls.append(dl)
    #print(fstr(4000, 8000, 12))
    if os.path.exists(fstr(4000, 8000, 12)):
        try:
            model, trace = load(fstr(4000, 8000, 12))
            print(f"Use 8000 sample runs for {country}")
        except:
            pass
    if model is None and os.path.exists(fstr(2000, 4000, 12)):
        try:
            model, trace = load(fstr(2000, 4000, 12))
            print(f"Use 4000 sample runs for {country}")
        except:
            pass
    if model is None and os.path.exists(fstr(1000, 1500, 12)):
        try: 
            model, trace = load(fstr(1000, 1500, 12))
            print(f"Use 1500 sample runs for {country}")
        except:
            pass
    if model is None:
        print(" not found")
        continue
    
    # Remove chains with likelihood larger than -200, should only be the case for 2 chains in France
    mask = (np.mean(trace.sample_stats.lp, axis=1)>-200)
    trace.posterior = trace.posterior.sel(chain=~mask)
    
    models.append(model)
    traces.append(trace)
    

  0%|          | 0/12 [00:00<?, ?it/s]

 not found
 not found
 not found
 not found
 not found
 not found
 not found
 not found
 not found
 not found
 not found
 not found


In [29]:
infections_no = []
infections_primary = []
infections_secondary = []
for i, country in tqdm(enumerate(countries)):
    trace_without_soccer, trace_primary_soccer = load(f"/data.nst/smohr/covid19_soccer_data/primary_and_subsequent/{country}.pkl")
    shape = trace_without_soccer.predictions["new_cases"].to_numpy().shape
    x = pd.date_range(models[i].sim_begin, models[i].sim_end)
    y0 = trace_without_soccer.predictions["new_cases"].to_numpy().reshape(shape[0]*shape[1]*shape[2],shape[3],shape[4])
    y1 = trace_primary_soccer.predictions["new_cases"].to_numpy().reshape(shape[0]*shape[1]*shape[2],shape[3],shape[4]) - y0
    y2 = traces[i].posterior["new_cases"].to_numpy().reshape(shape[0]*shape[1]*shape[2],shape[3],shape[4]) - y1 - y0

    begin = datetime.datetime(2021, 6, 11)
    end = datetime.datetime(2021, 7, 31)
    i_begin = (begin - model.sim_begin).days
    i_end = (end - model.sim_begin).days + 1  # inclusiv last day

    y0 = np.sum(y0[..., i_begin:i_end, :], axis=-2)
    y1 = np.sum(y1[..., i_begin:i_end, :], axis=-2)
    y2 = np.sum(y2[..., i_begin:i_end, :], axis=-2)

    infections_no.append(y0)
    infections_primary.append(y1)
    infections_secondary.append(y2)

0it [00:00, ?it/s]

In [30]:
data_primary  = pd.DataFrame()
means = []
for i,country in enumerate(countries):
    # compute fraction of infected people
    t_numpy = infections_primary[i] / (infections_no[i] + infections_primary[i] + infections_secondary[i]) * 100

    male = np.stack(
        (t_numpy[:, 0], np.zeros(t_numpy[:, 0].shape)), axis=1
    )
    female = np.stack(
        (t_numpy[:, 1], np.ones(t_numpy[:, 1].shape)), axis=1
    )
    # Create dataframe for plotting
    temp = pd.DataFrame(np.concatenate((male, female)), columns=["percentage_primary", "gender"])
    temp["gender"] = pd.cut(
        temp["gender"], bins=[-1, 0.5, 1], labels=["male", "female"]
    )
    temp["country"] = country
    data_primary = pd.concat([data_primary, temp])
    means.append(np.mean(temp["percentage_primary"]))
country_order_primar = np.argsort(means)[::-1]

data_primary_and_subsequent = pd.DataFrame()
means = []
for i, country in enumerate(countries):

    # compute fraction
    t_numpy = (infections_primary[i]+infections_secondary[i])/(infections_no[i] + infections_primary[i] + infections_secondary[i]) * 100

    male = np.stack(
        (t_numpy[:, 0], np.zeros(t_numpy[:, 0].shape)), axis=1
    )
    female = np.stack(
        (t_numpy[:, 1], np.ones(t_numpy[:, 1].shape)), axis=1
    )
    # Create dataframe for plotting
    temp = pd.DataFrame(np.concatenate((male, female)), columns=["percentage_primary_and_subsequent", "gender"])
    temp["gender"] = pd.cut(
        temp["gender"], bins=[-1, 0.5, 1], labels=["male", "female"]
    )
    temp["country"] = country
    data_primary_and_subsequent = pd.concat([data_primary_and_subsequent, temp])
    means.append(np.mean(temp["percentage_primary_and_subsequent"]))
country_order_primary_and_subsequent = np.argsort(means)[::-1]

In [31]:
# Compute interesting values as fraction and as incidence
primary_frac = []
for i, country in enumerate(countries):
    temp = data_primary[data_primary["country"] == country].drop(columns=["country","gender"])
    primary_frac.append(temp.to_numpy()[:,0])


primary_and_subsequent_frac = []
for i, country in enumerate(countries):
    temp = data_primary_and_subsequent[data_primary_and_subsequent["country"] == country].drop(columns=["country","gender"])
    primary_and_subsequent_frac.append(temp.to_numpy()[:,0])


primary_inci = []
for i, country in enumerate(countries):
    temp = infections_primary[i]
    primary_inci.append(temp / dls[i].population[0]*1e6)

primary_and_subsequent_inci = []
for i, country in enumerate(countries):
    temp = infections_primary[i] + infections_secondary[i]
    primary_and_subsequent_inci.append(temp/ dls[i].population[0]*1e6)

primary_cases = []
for i, country in enumerate(countries):
    temp = infections_primary[i]
    primary_cases.append(temp)

primary_and_subsequent_cases = []
for i, country in enumerate(countries):
    temp = infections_primary[i] + infections_secondary[i]
    primary_and_subsequent_cases.append(temp)
    


## SI Table 1: Fractions

In [32]:
table = []
for i, country in enumerate(countries):
    # Primary infections
    data  = data_primary[data_primary["country"] == country]

    CI = np.percentile(data["percentage_primary"], q = (2.5,50,97.5))
    prob_positive = np.sum(data["percentage_primary"] > 0) / data["percentage_primary"].shape[0] * 100

    # Secondary infections and primary
    data  = data_primary_and_subsequent[data_primary_and_subsequent["country"] == country]
    CI_sub = np.percentile(data["percentage_primary_and_subsequent"], q = (2.5,50,97.5))
    
    # Create table entries
    if prob_positive > 99.9:
        text_prob_pos = "$> 99.9$\%"
    else:
        text_prob_pos = f"{prob_positive:.1f}\%"

    table.append([country, f"{CI[1]:.1f}\% [{CI[0]:.1f}\%, {CI[2]:.1f}\%]", text_prob_pos,
              f"{CI_sub[1]:.1f}\% [{CI_sub[0]:.1f}\%, {CI_sub[2]:.1f}\%]"])

In [33]:
from tabulate import tabulate
headers = ['Country', 'median fraction soccer related cases', 'probability soccer increased cases', "fraction secondary infections until 31.07.2021"]
print(tabulate(table, headers, tablefmt="latex_raw"))

\begin{tabular}{llll}
\hline
 Country     & median fraction soccer related cases   & probability soccer increased cases   & fraction secondary infections until 31.07.2021   \\
\hline
 England     & 9.2\% [4.4\%, 16.8\%]                  & $> 99.9$\%                           & 41.8\% [29.3\%, 53.9\%]                          \\
 Czechia     & 4.3\% [0.4\%, 13.9\%]                  & 98.9\%                               & 39.3\% [5.3\%, 63.7\%]                           \\
 Italy       & 2.3\% [-2.6\%, 12.4\%]                 & 87.0\%                               & 31.4\% [-95.3\%, 74.0\%]                         \\
 Scotland    & 4.2\% [1.4\%, 10.2\%]                  & $> 99.9$\%                           & 43.3\% [32.9\%, 53.2\%]                          \\
 Spain       & 1.0\% [-0.3\%, 4.0\%]                  & 94.6\%                               & 16.1\% [-9.1\%, 33.0\%]                          \\
 Germany     & 0.5\% [-1.4\%, 3.4\%]                  & 79.5\%                    

## SI Table 2: Totals

In [36]:
table_2 = []
for i, country in enumerate(countries):
    
    c = country_order_primar[i]

    row = []
    
    primary_mean, primary_lower, primary_upper = np.percentile(primary_inci[c], q = (50,2.5,97.5), axis=0)
    subs_mean, subs_lower, subs_upper = np.percentile(primary_and_subsequent_inci[c], q = (50,2.5,97.5), axis=0)
    
    row.append(country)
    for m,l,u in zip(primary_mean, primary_lower, primary_upper):
        row.append(f"{m:.0f} [{l:.0f}, {u:.0f}]")
    for m,l,u in zip(subs_mean, subs_lower, subs_upper):
        row.append(f"{m:.0f} [{l:.0f}, {u:.0f}]")    
    
    table_2.append(row)

In [37]:
from tabulate import tabulate
headers = ['Country', 'Primary cases per $10^6$ inhibitants (male)', 'Primary cases per $10^6$ inhibitants (female)', "Primary and subsequent cases per $10^6$ inhibitants (male)","Primary and subsequent cases per $10^6$ inhibitants (female)"]
print(tabulate(table_2, headers, tablefmt="latex_raw"))

\begin{tabular}{lllll}
\hline
 Country     & Primary cases per $10^6$ inhibitants (male)   & Primary cases per $10^6$ inhibitants (female)   & Primary and subsequent cases per $10^6$ inhibitants (male)   & Primary and subsequent cases per $10^6$ inhibitants (female)   \\
\hline
 England     & 3091 [2174, 4092]                             & 1342 [890, 1972]                                & 10326 [7438, 12919]                                          & 8568 [6091, 11004]                                             \\
 Czechia     & 1760 [1342, 2191]                             & 405 [253, 609]                                  & 9623 [7924, 11293]                                           & 7616 [6052, 9272]                                              \\
 Italy       & 59 [9, 119]                                   & 17 [2, 52]                                      & 320 [45, 513]                                                & 287 [37, 466]                                                

In [38]:
table_2_5 = []
owd = cov19.data_retrieval.OWD(True)

for i, country in enumerate(countries):
    row = []
    c = country_order_primar[i]
    
    if country in ["Scotland","England"]:
        df = owd.data[owd.data["country"] == "United Kingdom"]
    else:
        df = owd.data[owd.data["country"] == country]
    df["ifr"] = df["new_deaths"]/df["new_cases"]

    primary_mean, primary_lower, primary_upper = np.percentile(primary_cases[c], q = (50,2.5,97.5), axis=0)
    subs_mean, subs_lower, subs_upper = np.percentile(np.sum(primary_and_subsequent_cases[c],axis=-1), q = (50,2.5,97.5), axis=0)
    fat_mean, fat_lower, fat_upper = np.percentile(np.sum(primary_and_subsequent_cases[c],axis=-1)*df["ifr"][begin:end].mean(), q = (50,2.5,97.5), axis=0)
    
    row.append(country)
    for m,l,u in zip(primary_mean, primary_lower, primary_upper):
        row.append(f"{m:.0f} [{l:.0f}, {u:.0f}]")

    row.append(f"{subs_mean:.0f} [{subs_lower:.0f}, {subs_upper:.0f}]")    
    
    # Fatility
    row.append(f"{fat_mean:.0f} [{fat_lower:.0f}, {fat_upper:.0f}]")
    
    table_2_5.append(row)
    
def reduce_samples(curr_list):
    number_of_samples = 2000
    for i, array in enumerate(curr_list):
        array = array[np.random.choice(array.shape[0], number_of_samples, replace = False),...]
        curr_list[i] = array
    return np.array(curr_list)

row= []
pc = reduce_samples(primary_cases)
psc = reduce_samples(primary_and_subsequent_cases)

primary_mean, primary_lower, primary_upper = np.percentile(np.sum(pc,axis=0), q = (50,2.5,97.5), axis=0)
subs_mean, subs_lower, subs_upper = np.percentile(np.sum(psc,axis=(0,-1)), q = (50,2.5,97.5), axis=0)
fat_mean, fat_lower, fat_upper = np.percentile(np.sum(psc,axis=(0,-1))*0.002, q = (50,2.5,97.5), axis=0)

row.append("Total")
for m,l,u in zip(primary_mean, primary_lower, primary_upper):
    row.append(f"{m:.0f} [{l:.0f}, {u:.0f}]")

row.append(f"{subs_mean:.0f} [{subs_lower:.0f}, {subs_upper:.0f}]")    

# Fatility
row.append(f"{fat_mean:.0f} [{fat_lower:.0f}, {fat_upper:.0f}]")

table_2_5.append(row)

INFO     [covid19_inference.data_retrieval.retrieval] Successfully loaded OurWorldinData.csv.gz from /tmp/covid19_data/, skipping download.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ifr"] = df["new_deaths"]/df["new_cases"]
INFO     [covid19_inference.data_retrieval.retrieval] Successfully loaded OurWorldinData.csv.gz from /tmp/covid19_data/, skipping download.
INFO     [covid19_inference.data_retrieval.retrieval] Successfully loaded OurWorldinData.csv.gz from /tmp/covid19_data/, skipping download.
INFO     [covid19_inference.data_retrieval.retrieval] Successfully loaded OurWorldinData.csv.gz from /tmp/covid19_data/, skipping download.
INFO     [covid19_inference.data_retrieval.retrieval] Successfully loaded OurWorldinData.csv.gz from /tmp/covid19_data/, skipping 

In [40]:
from tabulate import tabulate
headers = ['Country', 'Primary cases (male)', 'Primary cases (female)', "Primary and subsequent cases", "Number of associated deaths"]
print(tabulate(table_2_5, headers, tablefmt="latex_raw"))

\begin{tabular}{lllll}
\hline
 Country     & Primary cases (male)   & Primary cases (female)   & Primary and subsequent cases   & Number of associated deaths   \\
\hline
 England     & 80573 [56679, 106686]  & 34982 [23204, 51410]     & 492559 [353465, 623793]        & 671 [481, 849]                \\
 Czechia     & 4517 [3446, 5625]      & 1040 [649, 1564]         & 44246 [36001, 52726]           & 883 [718, 1052]               \\
 Italy       & 308 [47, 628]          & 89 [11, 275]             & 3193 [432, 5144]               & 69 [9, 111]                   \\
 Scotland    & 2702 [-2150, 7773]     & 674 [-575, 2387]         & 32536 [-98310, 76318]          & 44 [-134, 104]                \\
 Spain       & 8465 [-2648, 17197]    & 2153 [-725, 6200]        & 125910 [-70945, 256619]        & 378 [-213, 770]               \\
 Germany     & 379 [-647, 1235]       & 100 [-197, 414]          & 8163 [-17272, 22823]           & 384 [-812, 1073]              \\
 France      & 12 [-92, 89]     

## SI Table 3: Number of games hosted and played

In [27]:
table_3 = []
df = pd.read_csv("../../data/em_game_data.csv",header=2)
df = df[~df["id"].str.contains("a")] # Filter extra games we added for validation these are suffixed with a


country2location = {
    "GB-ENG":"London",
    "IT":"Rome",
    "AZ":"Baku",
    "DE":"Munich",
    "RU":"Saint Petersburg",
    "HU":"Budapest",
    "ES":"Seville",
    "RO":"Bucharest",
    "NL":"Amsterdam",
    "GB-SCT":"Glasgow",
    "DK":"Copenhagen"
}
country_order_primar = list(range(len(countries)))
for i, country in enumerate(np.array(countries)):
    c = country_order_primar[i]
    iso2 = dls[c].countries_iso2[0]
    gamesByTeam = df[df[' team1'].str.contains(iso2) | df[' team2'].str.contains(iso2)] 
    
    
    if iso2 in country2location:
        location = country2location[iso2]
        gamesHosted = df[df[' location'].str.contains(location)]
    else:
        location = "placeholder"
        gamesHosted = np.zeros((0,8))
    
    union = df[df[' team1'].str.contains(iso2) | df[' team2'].str.contains(iso2) |  df[' location'].str.contains(location)]
    row = []
    row.append(country)
    row.append(gamesByTeam.shape[0])
    row.append(gamesHosted.shape[0])
    row.append(union.shape[0])
    
    table_3.append(row)

In [28]:
from tabulate import tabulate
headers = ['Country', 'Matches played', 'Matches hosted',"Union"]
print(tabulate(table_3, headers, tablefmt="latex_raw"))

\begin{tabular}{lrrr}
\hline
 Country     &   Matches played &   Matches hosted &   Union \\
\hline
 England     &                7 &                8 &       9 \\
 Czechia     &                5 &                0 &       5 \\
 Italy       &                7 &                4 &       8 \\
 Scotland    &                3 &                4 &       5 \\
 Spain       &                6 &                4 &       7 \\
 Germany     &                4 &                4 &       5 \\
 France      &                4 &                0 &       4 \\
 Slovakia    &                3 &                0 &       3 \\
 Austria     &                4 &                0 &       4 \\
 Belgium     &                5 &                0 &       5 \\
 Portugal    &                4 &                0 &       4 \\
 Netherlands &                4 &                4 &       5 \\
\hline
\end{tabular}
