# Tables

This notebook creates all tables included in the publication.

In [1]:
import pickle
import sys
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import pandas as pd
import datetime
import matplotlib.ticker as ticker
sys.path.append("../../")
sys.path.append("../../covid19_inference")
sys.path.append("../")

import covid19_soccer
from covid19_soccer.plot.utils import get_from_trace
import covid19_inference as cov19
from header_plotting import *

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to the project packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
def load(fstr):
    """Unpickle and return the object stored in the file at path ``fstr``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(fstr, "rb") as handle:
        payload = pickle.load(handle)
    return payload

countries = ["England","Scotland","Germany","France","Spain","Slovakia","Portugal","Netherlands","Italy","Czechia","Belgium","Austria"]

# Sampler settings (tune, draws, max_treedepth), tried in this order; the first
# run that exists on disk and unpickles successfully is used.
RUN_SETTINGS = [(4000, 8000, 12), (2000, 4000, 12), (1000, 1500, 12)]


def trace_path(country, tune, draws, max_treedepth):
    """Return the path of the pickled ``(model, trace)`` pair for one run.

    Parameters
    ----------
    country : str
        Country name as used in the file name.
    tune, draws, max_treedepth : int
        Sampler settings encoded in the file name.
    """
    return (
        "/data.nst/smohr/covid19_soccer_data/main_traces/"
        "-beta=False"
        f"-country={country}"
        "-offset_data=0"
        "-prior_delay=-1"
        "-width_delay_prior=0.1"
        "-sigma_incubation=-1.0"
        "-median_width_delay=1.0"
        "-interval_cps=10.0"
        "-f_fem=0.2"
        "-uc=True"
        "-len=normal"
        f"-t={tune}"
        f"-d={draws}"
        f"-max_treedepth={max_treedepth}.pkl"
    )


traces, models, dls = [], [], []
loaded_countries = []
for country in tqdm(countries):
    model = None
    for tune, draws, max_treedepth in RUN_SETTINGS:
        path = trace_path(country, tune, draws, max_treedepth)
        if not os.path.exists(path):
            continue
        try:
            # NOTE: unpickling executes code from the file; only load trusted runs.
            model, trace = load(path)
        except Exception as err:
            # Best effort: report the unreadable file and fall back to the next run.
            print(f"Could not load {path}: {err!r}")
            model = None
            continue
        print(f"Use {draws} sample runs for {country}")
        break
    if model is None:
        print(f"{country} not found")
        continue

    # Remove chains whose mean `lp` sample statistic is larger than -200,
    # should only be the case for 2 chains in France.
    # NOTE(review): only the posterior group is filtered here — confirm that no
    # other group of the trace is expected to be filtered as well.
    mask = (np.mean(trace.sample_stats.lp, axis=1)>-200)
    trace.posterior = trace.posterior.sel(chain=~mask)

    dl = covid19_soccer.dataloader.Dataloader_gender(countries=[country])
    models.append(model)
    traces.append(trace)
    dls.append(dl)
    loaded_countries.append(country)

# Later cells index `models`, `traces` and `dls` by position in `countries`;
# drop countries without a usable run so all four lists stay aligned.
countries = loaded_countries

  0%|          | 0/12 [00:00<?, ?it/s]

Use 4000 sample runs for England
Use 8000 sample runs for Scotland
Use 8000 sample runs for Germany
Use 4000 sample runs for France
Use 8000 sample runs for Spain
Use 8000 sample runs for Slovakia
Use 4000 sample runs for Portugal
Use 8000 sample runs for Netherlands
Use 8000 sample runs for Italy
Use 8000 sample runs for Czechia
Use 8000 sample runs for Belgium
Use 8000 sample runs for Austria


In [4]:
# Per-country case totals inside the summation window, split by origin.
# Each entry is what remains after summing over the second-to-last (day) axis.
infections_no = []         # scenario without soccer
infections_primary = []    # primary-soccer scenario minus the no-soccer scenario
infections_secondary = []  # full posterior minus the two parts above

# Summation window; both days are included.
begin = datetime.datetime(2021, 6, 11)
end = datetime.datetime(2021, 7, 31)

for i, country in enumerate(tqdm(countries)):
    trace_without_soccer, trace_primary_soccer = load(f"/data.nst/smohr/covid19_soccer_data/primary_and_subsequent/{country}.pkl")
    shape = trace_without_soccer.predictions["new_cases"].to_numpy().shape
    # Merge the three leading axes into a single sample axis; the last two axes are kept.
    flat_shape = (shape[0]*shape[1]*shape[2], shape[3], shape[4])
    y0 = trace_without_soccer.predictions["new_cases"].to_numpy().reshape(flat_shape)
    y1 = trace_primary_soccer.predictions["new_cases"].to_numpy().reshape(flat_shape) - y0
    y2 = traces[i].posterior["new_cases"].to_numpy().reshape(flat_shape) - y1 - y0

    # Day offsets relative to the start of *this* country's simulation
    # (not the model left over from the loading loop).
    sim_begin = models[i].sim_begin
    i_begin = (begin - sim_begin).days
    i_end = (end - sim_begin).days + 1  # inclusive last day

    y0 = np.sum(y0[..., i_begin:i_end, :], axis=-2)
    y1 = np.sum(y1[..., i_begin:i_end, :], axis=-2)
    y2 = np.sum(y2[..., i_begin:i_end, :], axis=-2)

    infections_no.append(y0)
    infections_primary.append(y1)
    infections_secondary.append(y2)

0it [00:00, ?it/s]

In [5]:
def _gender_percentage_frame(percentages, value_column, country):
    """Stack the male and female columns of ``percentages`` into one long frame.

    Parameters
    ----------
    percentages : np.ndarray
        2-D array; column 0 is used as male, column 1 as female.
    value_column : str
        Name of the column that receives the percentage values.
    country : str
        Value written to the ``country`` column of every row.

    Returns
    -------
    pd.DataFrame
        Columns ``value_column``, ``gender`` (ordered categorical
        male < female) and ``country``; male rows first, then female rows.
    """
    n_samples = percentages.shape[0]
    return pd.DataFrame(
        {
            value_column: np.concatenate((percentages[:, 0], percentages[:, 1])),
            "gender": pd.Categorical.from_codes(
                np.repeat([0, 1], n_samples),
                categories=["male", "female"],
                ordered=True,
            ),
            "country": country,
        }
    )


def _percentage_table(numerators, value_column):
    """Build the long percentage frame for all countries plus their ranking.

    ``numerators[i]`` is divided by the total infections of country ``i``
    (none + primary + secondary) and scaled to percent.

    Returns
    -------
    (pd.DataFrame, list, np.ndarray)
        The concatenated frame, the per-country mean percentage, and the
        country indices sorted by descending mean.
    """
    frames, country_means = [], []
    for i, country in enumerate(countries):
        total = infections_no[i] + infections_primary[i] + infections_secondary[i]
        frame = _gender_percentage_frame(numerators[i] / total * 100, value_column, country)
        frames.append(frame)
        country_means.append(np.mean(frame[value_column]))
    # Concatenate once instead of growing a frame inside the loop.
    data = pd.concat(frames) if frames else pd.DataFrame()
    return data, country_means, np.argsort(country_means)[::-1]


# Fraction of cases that are primary soccer-related infections
data_primary, means, country_order_primar = _percentage_table(
    infections_primary, "percentage_primary"
)

# Fraction of cases that are primary or subsequent soccer-related infections
data_primary_and_subsequent, means, country_order_primary_and_subsequent = _percentage_table(
    [primary + secondary for primary, secondary in zip(infections_primary, infections_secondary)],
    "percentage_primary_and_subsequent",
)

In [7]:
# Compute interesting values as fraction and as incidence

# Percentage samples per country (first remaining column once the label
# columns are dropped).
primary_frac = [
    data_primary[data_primary["country"] == country]
    .drop(columns=["country", "gender"])
    .to_numpy()[:, 0]
    for country in countries
]

primary_and_subsequent_frac = [
    data_primary_and_subsequent[data_primary_and_subsequent["country"] == country]
    .drop(columns=["country", "gender"])
    .to_numpy()[:, 0]
    for country in countries
]

# Case counts divided by `population[0]` of the matching dataloader, times 1e6.
# NOTE(review): presumably `population[0]` holds the per-gender population of
# the single loaded country — confirm against Dataloader_gender.
primary_inci = [
    infections_primary[idx] / dls[idx].population[0]*1e6
    for idx in range(len(countries))
]

primary_and_subsequent_inci = [
    (infections_primary[idx] + infections_secondary[idx]) / dls[idx].population[0]*1e6
    for idx in range(len(countries))
]
    


## SI Table 1: Fractions

In [None]:
# Rows of SI Table 1, ordered by descending mean primary percentage.
table = []
for country in np.array(countries)[country_order_primar]:
    # Primary infections: median with 95% interval, and share of samples above zero
    primary = data_primary.loc[data_primary["country"] == country, "percentage_primary"]
    CI = np.percentile(primary, q = (2.5,50,97.5))
    prob_positive = np.sum(primary > 0) / primary.shape[0] * 100

    # Primary and subsequent infections
    combined = data_primary_and_subsequent.loc[
        data_primary_and_subsequent["country"] == country,
        "percentage_primary_and_subsequent",
    ]
    CI_sub = np.percentile(combined, q = (2.5,50,97.5))

    # Create table entries; raw strings keep the LaTeX `\%` without relying on
    # Python passing an invalid escape sequence through unchanged.
    if prob_positive > 99.9:
        text_prob_pos = r"$> 99.9$\%"
    else:
        text_prob_pos = rf"{prob_positive:.1f}\%"

    table.append([
        country,
        rf"{CI[1]:.1f}\% (CI: [{CI[0]:.1f}\%, {CI[2]:.1f}\%])",
        text_prob_pos,
        rf"{CI_sub[1]:.1f}\% (CI: [{CI_sub[0]:.1f}\%, {CI_sub[2]:.1f}\%])",
    ])

In [None]:
from tabulate import tabulate

# Column titles for SI Table 1, in the order the row entries were appended.
# NOTE(review): the last column is filled from the primary-and-subsequent
# percentages — confirm that the "secondary infections" title is intended.
headers = [
    'Country',
    'median fraction soccer related cases',
    'probability soccer increased cases',
    "fraction secondary infections until 31.07.2021",
]
latex_table_1 = tabulate(table, headers, tablefmt="latex_raw")
print(latex_table_1)

## SI Table 2: Totals

In [53]:
# Rows of SI Table 2: incidence per country, ordered like SI Table 1.
table_2 = []
ordered_names = np.array(countries)[country_order_primar]
for c, country in zip(country_order_primar, ordered_names):
    # Each result has three rows (median, 2.5th and 97.5th percentile) and one
    # column per entry of the incidence array's last axis (male, female).
    primary_stats = np.percentile(primary_inci[c], q = (50,2.5,97.5), axis=0)
    subs_stats = np.percentile(primary_and_subsequent_inci[c], q = (50,2.5,97.5), axis=0)

    row = [country]
    for stats in (primary_stats, subs_stats):
        row.extend(
            f"{m:.0f} (CI: [{l:.0f}, {u:.0f}])" for m, l, u in zip(*stats)
        )

    table_2.append(row)

In [54]:
from tabulate import tabulate

# Column titles for SI Table 2; the row entries are appended as primary
# (male, female) followed by primary and subsequent (male, female).
headers = [
    'Country',
    'Primary cases per $10^6$ inhabitants (male)',
    'Primary cases per $10^6$ inhabitants (female)',
    "Primary and subsequent cases per $10^6$ inhabitants (male)",
    "Primary and subsequent cases per $10^6$ inhabitants (female)",
]
print(tabulate(table_2, headers, tablefmt="latex_raw"))

\begin{tabular}{lllll}
\hline
 Country     & Primary cases per $10^6$ inhibitants (male)   & Primary cases per $10^6$ inhibitants (female)   & Primary and subsequent cases per $10^6$ inhibitants (male)   & Primary and subsequent cases per $10^6$ inhibitants (female)   \\
\hline
 England     & 3091 (CI: [2174, 4092])                       & 1342 (CI: [890, 1972])                          & 10326 (CI: [7438, 12919])                                    & 8568 (CI: [6091, 11004])                                       \\
 Scotland    & 1760 (CI: [1342, 2191])                       & 405 (CI: [253, 609])                            & 9623 (CI: [7924, 11293])                                     & 7616 (CI: [6052, 9272])                                        \\
 Czechia     & 59 (CI: [9, 119])                             & 17 (CI: [2, 52])                                & 320 (CI: [45, 513])                                          & 287 (CI: [37, 466])                                          

## SI Table 3: Number of games hosted and played

In [47]:
# Rows of SI Table 3: matches played by and hosted in each country.
table_3 = []

df = pd.read_csv("../../data/em_game_data.csv",header=2)
# Filter extra games we added for validation, these are suffixed with "a"
df = df[~df["id"].str.contains("a")]

# Host city per country code; countries without an entry count zero hosted matches.
country2location = {
    "GB-ENG":"London",
    "IT":"Rome",
    "AZ":"Baku",
    "DE":"Munich",
    "RU":"Saint Petersburg",
    "HU":"Budapest",
    "ES":"Seville",
    "RO":"Bucharest",
    "NL":"Amsterdam",
    "GB-SCT":"Glasgow",
    "DK":"Copenhagen"
}

ordered_names = np.array(countries)[country_order_primar]
for c, country in zip(country_order_primar, ordered_names):
    iso2 = dls[c].countries_iso2[0]

    # Matches in which the country's team appears on either side
    # (column names in the CSV carry a leading space).
    is_playing = df[' team1'].str.contains(iso2) | df[' team2'].str.contains(iso2)
    n_played = df[is_playing].shape[0]

    # Matches whose location mentions the country's host city
    if iso2 in country2location:
        n_hosted = df[df[' location'].str.contains(country2location[iso2])].shape[0]
    else:
        n_hosted = 0

    table_3.append([country, n_played, n_hosted])

In [52]:
from tabulate import tabulate

# Column titles for SI Table 3, matching the order of each row's entries.
headers = [
    'Country',
    'Matches played',
    'Matches hosted',
]
latex_table_3 = tabulate(table_3, headers, tablefmt="latex_raw")
print(latex_table_3)

\begin{tabular}{lrr}
\hline
 Country     &   Matches played &   Matches hosted \\
\hline
 England     &                7 &                8 \\
 Scotland    &                3 &                4 \\
 Czechia     &                5 &                0 \\
 Italy       &                7 &                4 \\
 Spain       &                6 &                4 \\
 Germany     &                4 &                4 \\
 Slovakia    &                3 &                0 \\
 Belgium     &                5 &                0 \\
 Austria     &                4 &                0 \\
 France      &                4 &                0 \\
 Portugal    &                4 &                0 \\
 Netherlands &                4 &                4 \\
\hline
\end{tabular}
