<a href="https://colab.research.google.com/github/SahputraS/Outbreak-Simulation-and-Detection-Testing/blob/main/InfoDengue_vs_Genomic_SP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install geobr
!pip install unidecode
!pip install rapidfuzz



In [10]:
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import plotly.express as px
import requests
import plotly.graph_objects as go

import time
from tqdm import tqdm

from geobr import read_municipality, read_state
import geopandas as gpd
import gc

from statsmodels.tsa.stattools import ccf
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrix
from statsmodels.gam.api import GLMGam, BSplines

from sklearn.preprocessing import StandardScaler
import seaborn as sns
from unidecode import unidecode

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import GISAID Data

In [12]:
sero1 = pd.read_csv('/content/drive/MyDrive/GISAID/gisaid_arbo_2025_09_23_14_sp.tsv', sep='\t')
location_split = sero1['Location'].str.split(' / ', expand=True)
location_split.columns = ['Continent', 'Country', 'State', 'City']
sero1 = pd.concat([sero1, location_split], axis=1)

sero1 = sero1[['Serotype', 'Genotype','Collection date', 'City']]

sero1['ym'] = pd.to_datetime(sero1['Collection date'], errors='coerce').dt.strftime('%Y-%m')
sero1.head(3)

Unnamed: 0,Serotype,Genotype,Collection date,City,ym
0,DENV2,II,2022-01-30,Piracicaba,2022-01
1,DENV2,II,2022-02-05,Piracicaba,2022-02
2,DENV2,III,2022-03-17,Tremembe,2022-03


In [13]:
# Check Nan
sero1.isna().sum()

Unnamed: 0,0
Serotype,0
Genotype,0
Collection date,0
City,353
ym,15


In [14]:
# Rename city with NaN as no-name
sero1['City'] = sero1['City'].fillna("no-name")

# Check the data without date
sero1[sero1['ym'].isna()]

Unnamed: 0,Serotype,Genotype,Collection date,City,ym
158,DENV3,Outgroup,unknown,Sao Jose do Rio Preto,
159,DENV4,Outgroup,unknown,no-name,
160,DENV4,Outgroup,unknown,no-name,
161,DENV4,Outgroup,unknown,no-name,
162,DENV4,Outgroup,unknown,no-name,
163,DENV4,Outgroup,unknown,no-name,
164,DENV4,Outgroup,unknown,no-name,
501,DENV1,V,2023,no-name,
502,DENV1,V,2023,no-name,
503,DENV1,V,2023,no-name,


In [15]:
# Drop information without month data and unknown (because aggregate data to yearly is too coarse)
sero1 = sero1.dropna(subset=['ym'])
sero1[sero1['ym'].isna()]

Unnamed: 0,Serotype,Genotype,Collection date,City,ym


In [16]:
print("the data starts from",sero1['ym'].min(), 'to', sero1['ym'].max())

the data starts from 2022-01 to 2025-07


## Get IBGE

In [17]:
def norm_name(s):
    s = str(s).strip().lower()
    s = s.replace("''", "'")
    s = s.replace(" d'oeste", " do oeste")
    s = unidecode(s)
    for ch in "-'.,":
        s = s.replace(ch, ' ')
    return ' '.join(s.split())

In [18]:
extra = {
    "santa barbara d'oeste": "santa barbara do oeste",
    "santa barbara d oeste": "santa barbara do oeste",
    "lencoes paulista": "lencois paulista",
    "lencois paulista": "lencois paulista",
    "franco da rocha": "franco da rocha",
    "mogi guacu": "mogi guacu",
    "mogi mirim": "mogi mirim",
    "guaruja": "guaruja",
    "jau": "jau",
    "itanhaem": "itanhaem",
    "ribeirao preto": "ribeirao preto",
    "sao jose dos campos": "sao jose dos campos",
    "sao jose do rio preto": "sao jose do rio preto",
    "sao caetano do sul": "sao caetano do sul",
    "santo andre": "santo andre",
    "sao paulo": "sao paulo",
    "sao luis do paraitinga": "sao luis do paraitinga",
}

In [19]:
def attach_ibge(sero_df, ref_df, city_col="City", out_col="IBGE"):

    ref_df = ref_df.copy()
    ref_df = ref_df.rename(columns=str.lower)
    ref_df["name_key"] = ref_df["name"].apply(norm_name)

    sero_df = sero_df.copy()
    sero_df["_city_norm"] = sero_df[city_col].astype(str).apply(norm_name)
    sero_df["_city_key"]  = sero_df["_city_norm"].map(extra).fillna(sero_df["_city_norm"])

    merged = sero_df.merge(ref_df[["name_key", "ibge_code", "name"]], left_on="_city_key", right_on="name_key", how="left")

    merged[out_col] = merged["ibge_code"]
    merged = merged.drop(columns=["_city_norm", "_city_key", "name_key", "ibge_code"])

    return merged

In [20]:
# take ibge data
mun = read_municipality(year=2020)
mun_sp = mun[mun['code_state'] == 35].copy()

ref_sp = mun_sp[['name_muni','code_muni']].drop_duplicates()
ref_sp = ref_sp.rename(columns={'name_muni':'name', 'code_muni':'ibge_code'})
ref_sp['ibge_code'] = ref_sp['ibge_code'].astype(int).astype(str)

ref_sp.head()

Unnamed: 0,name,ibge_code
3267,Adamantina,3500105
3268,Adolfo,3500204
3269,Aguaí,3500303
3270,Águas Da Prata,3500402
3271,Águas De Lindóia,3500501


In [21]:
sero1_ibge = attach_ibge(sero1, ref_sp)
sero1_ibge.loc[sero1_ibge['City'] == 'Sao Luis do Paraitinga', 'IBGE'] = 3550506 # Manually fix

In [22]:
sero1_ibge.head(3)

Unnamed: 0,Serotype,Genotype,Collection date,City,ym,name,IBGE
0,DENV2,II,2022-01-30,Piracicaba,2022-01,Piracicaba,3538709
1,DENV2,II,2022-02-05,Piracicaba,2022-02,Piracicaba,3538709
2,DENV2,III,2022-03-17,Tremembe,2022-03,Tremembé,3554805


In [23]:
sero1_ibge.isna().sum()

Unnamed: 0,0
Serotype,0
Genotype,0
Collection date,0
City,0
ym,0
name,342
IBGE,341


In [24]:
sero1_ibge = sero1_ibge.copy()

sero1_ibge['Collection date'] = pd.to_datetime(sero1_ibge['Collection date'], errors='coerce')
sero1_ibge = sero1_ibge.dropna(subset=['Collection date', 'Serotype'])           # keep only valid rows
sero1_ibge['month'] = sero1_ibge['Collection date'].values.astype('datetime64[M]')

sero_m = (sero1_ibge.groupby(['month','Serotype'], as_index=False)
                       .size()
                       .rename(columns={'size':'n'}))

sero_mw = (sero_m.pivot(index='month', columns='Serotype', values='n')
                   .fillna(0)
                   .astype(int)
                   .sort_index())

sero_mw.head(3)

Serotype,DENV1,DENV2,DENV3,DENV4
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-01,13,1,0,0
2022-02-01,58,1,0,0
2022-03-01,115,11,0,0


In [25]:
sero_mw.tail(3)

Serotype,DENV1,DENV2,DENV3,DENV4
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-05-01,5,10,4,0
2025-06-01,0,20,6,0
2025-07-01,0,11,1,0


## Import Dengue Data

In [26]:
def data_donwload(geocode, y_start, y_end):
  url = "https://info.dengue.mat.br/api/alertcity"
  geocode = geocode
  disease = "dengue"
  format = "csv"
  ew_start = 1
  ew_end = 53
  ey_start = y_start
  ey_end = y_end

  params =(
      "&disease="
      + f"{disease}"
      + "&geocode="
      + f"{geocode}"
      + "&disease="
      + f"{disease}"
      + "&format="
      + f"{format}"
      + "&ew_start="
      + f"{ew_start}"
      + "&ew_end="
      + f"{ew_end}"
      + "&ey_start="
      + f"{ey_start}"
      + "&ey_end="
      + f"{ey_end}"
  )

  url_resp = "?".join([url, params])

  dados = pd.read_csv(url_resp, index_col='SE')
  dados['ibge'] = geocode
  return dados

In [27]:
downdload_data = False # Manual switch to download data
if downdload_data == True:
  all_data = []
  y_start= 2021
  y_end = 2025
  for geocode in tqdm(codes_sp, desc="Downloading"):
      try:
          df = data_donwload(geocode, y_start=y_start, y_end=y_end)
          df['ibge'] = geocode
          all_data.append(df)
      except Exception as e:
          print(f"Failed for {geocode}: {e}")
      time.sleep(0.1)

  data_sp = pd.concat(all_data)
  data_sp.head()

  del all_data
  gc.collect()

  # Save in my drive so i don't need to redownload
  data_sp2 = data_sp[['data_iniSE', 'ibge', 'nivel', 'casos', 'pop']]
  save_path = "/content/drive/MyDrive/GISAID/data_sp_incidence.csv"
  data_sp2.to_csv(save_path, index=False)
  print(f"File saved to: {save_path}")

else:
  data_sp2 = pd.read_csv('/content/drive/MyDrive/GISAID/data_sp_incidence.csv')
  data_sp2 = data_sp2[['data_iniSE', 'ibge', 'nivel', 'casos', 'pop']]

data_sp2.head(5)

Unnamed: 0,data_iniSE,ibge,nivel,casos,pop
0,2025-09-07,3500105,1,15,34357.0
1,2025-08-31,3500105,1,22,34357.0
2,2025-08-24,3500105,1,10,34357.0
3,2025-08-17,3500105,1,12,34357.0
4,2025-08-10,3500105,1,7,34357.0


In [28]:
data_sp2['nivel'].unique()

array([1, 4, 2, 3])

In [29]:
# Take the municipality that is on the sero data
ibge_sp_target = sero1_ibge['IBGE'].unique()

ibge_int = (pd.to_numeric(pd.Series(ibge_sp_target), errors='coerce')
              .dropna()
              .astype(int)
              .to_numpy())
len(ibge_int)

186

In [30]:
inc_sp = data_sp2[data_sp2['ibge'].isin(ibge_int)]
inc_sp.tail()

Unnamed: 0,data_iniSE,ibge,nivel,casos,pop
158020,2021-01-31,3557303,2,1,11407.0
158021,2021-01-24,3557303,1,1,11407.0
158022,2021-01-17,3557303,1,1,11407.0
158023,2021-01-10,3557303,1,0,11407.0
158024,2021-01-03,3557303,1,0,11407.0


In [31]:
len(inc_sp['ibge'].unique())

186

In [32]:
print("the data starts from",inc_sp['data_iniSE'].min(), 'to', inc_sp['data_iniSE'].max())

the data starts from 2021-01-03 to 2025-09-07


In [33]:
# Take the data from 2022-01 and 2025-07
inc_sp = inc_sp[(inc_sp['data_iniSE'] >= '2022-01-01') & (inc_sp['data_iniSE'] < '2025-08-01')]
print("the data starts from",inc_sp['data_iniSE'].min(), 'to', inc_sp['data_iniSE'].max())
inc_sp.tail()

the data starts from 2022-01-02 to 2025-07-27


Unnamed: 0,data_iniSE,ibge,nivel,casos,pop
157968,2022-01-30,3557303,1,0,11407.0
157969,2022-01-23,3557303,1,0,11407.0
157970,2022-01-16,3557303,1,0,11407.0
157971,2022-01-09,3557303,1,1,11407.0
157972,2022-01-02,3557303,1,1,11407.0


### Aggregate Info Dengue

In [34]:
# Simple method
inc_sp = inc_sp.copy()
inc_sp['month'] = pd.to_datetime(inc_sp['data_iniSE']).values.astype('datetime64[M]')

inc_sp = inc_sp.copy()

inc_sp['nivel2'] = inc_sp['nivel'].astype('object')

m1 = (inc_sp['nivel'] == 1) & (inc_sp['casos'] == 0)   # green & zero cases
m2 = (inc_sp['nivel'] == 1) & (inc_sp['casos']  > 0)   # green & some cases

inc_sp.loc[m1, 'nivel2'] = 'non_case'
inc_sp.loc[m2, 'nivel2'] = 'endemic'

nivel_c = (inc_sp.groupby(['month','nivel2'], as_index=False)
                 .size()
                 .rename(columns={'size':'n_rows'}))

nivel_cw = nivel_c.pivot(index='month', columns='nivel2', values='n_rows').fillna(0).astype(int)
nivel_cw['alarm'] = nivel_cw[2]+ nivel_cw[3]+nivel_cw[4] # Combine the 2,3,4 alert level into just 'alarm'
nivel_cw.drop(columns=[2,3,4], inplace=True)

nivel_cw = nivel_cw.copy()
nivel_cw.index = pd.to_datetime(nivel_cw.index)

# Convert to percentage
nivel_perc = nivel_cw.div(nivel_cw.sum(axis=1), axis=0).mul(100)
nivel_perc.tail(3)

nivel2,endemic,non_case,alarm
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-05-01,29.704301,4.301075,65.994624
2025-06-01,60.645161,12.580645,26.774194
2025-07-01,74.865591,22.177419,2.956989


In [35]:
nivel_perc.head(3)

nivel2,endemic,non_case,alarm
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,46.344086,24.193548,29.462366
2022-02-01,42.069892,20.16129,37.768817
2022-03-01,33.198925,7.795699,59.005376


In [36]:
## Recheck if the sum is 100

row_sums = nivel_perc.sum(axis=1, skipna=True)
ok = np.isclose(row_sums.to_numpy(), 100.0, atol=1e-6)
ok_all = bool(np.all(ok))
ok_all

True

In [37]:
nivel_perc_plot = nivel_perc.copy()
nivel_perc_plot.columns = nivel_perc_plot.columns.map(str)
nivel_perc_plot['month'] = nivel_perc_plot.index
df_long = nivel_perc_plot.melt(id_vars='month', var_name='nivel', value_name='percent')

fig = px.area(
    df_long,
    x='month', y='percent', color='nivel',
    category_orders={'nivel': ['endemic','non_case','2','3','4']},
    labels={'percent':'Proportion (%)', 'month':'Month', 'nivel':'nivel'},
    title='Monthly proportion of the labels'
)

fig.update_layout(
    template='plotly_white',
    hovermode='x unified',

    title=dict(text='Monthly proportion of the labels', font=dict(size=28)),
    font=dict(size=16),
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='left', x=0,
                title_text='nivel', font=dict(size=18)),
    hoverlabel=dict(font_size=14))


fig.update_xaxes(title_text='Month',
                 title_font=dict(size=20),
                 tickfont=dict(size=18))
fig.update_yaxes(title_text='Proportion (%)',
                 title_font=dict(size=20),
                 tickfont=dict(size=18),
                 range=[0,100], ticksuffix='%')

fig.show()



## Import the Epi-Quark

In [38]:
try:
    from epiquark import conf_matrix, score, timeliness
except ImportError:
    import sys
    !{sys.executable} -m pip install git+https://github.com/aauss/epi-quark.git
    from epiquark import conf_matrix, score, timeliness

Collecting git+https://github.com/aauss/epi-quark.git
  Cloning https://github.com/aauss/epi-quark.git to /tmp/pip-req-build-_wo3lfee
  Running command git clone --filter=blob:none --quiet https://github.com/aauss/epi-quark.git /tmp/pip-req-build-_wo3lfee
  Resolved https://github.com/aauss/epi-quark.git to commit 9e6ca56145f63730b2a62dc2b93191f9884e3c39
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: epi-quark
  Building wheel for epi-quark (pyproject.toml) ... [?25l[?25hdone
  Created wheel for epi-quark: filename=epi_quark-0.1.0-py3-none-any.whl size=13005 sha256=6674f761ea3fff7a92dec640cb8d911b62ee2914a8674a0ed4b81399b9b6c606
  Stored in directory: /tmp/pip-ephem-wheel-cache-oeung1st/wheels/ca/e2/d7/40d6b6b9b7873b820dbb0f3833f59fa5fac00f237710b2069b
Successfully built epi-quark
Installing collected packages: epi-quark
S

Reformat the serotype data

In [39]:
sero_mw['endemic'] = 0 # Add endemic category
sero_mw.tail(3)

Serotype,DENV1,DENV2,DENV3,DENV4,endemic
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-05-01,5,10,4,0,0
2025-06-01,0,20,6,0,0
2025-07-01,0,11,1,0,0


In [40]:
# Check if the time step are complete (all months exists)
full = pd.date_range(sero_mw.index.min(), sero_mw.index.max(), freq='MS')

missing_months = full.difference(sero_mw.index)
is_complete = len(missing_months) == 0

print(f"Complete monthly series: {is_complete}")
if not is_complete:
    print("Missing months:", missing_months.strftime('%Y-%m').tolist())


Complete monthly series: False
Missing months: ['2022-09', '2022-10', '2022-11', '2022-12', '2024-08', '2024-09']


In [41]:
sero_mw2 = sero_mw.reindex(full, fill_value=0)
sero_mw2.index.name = 'month'

In [42]:
sero_ml = (sero_mw2.reset_index()  # bring month out of the index, and make into long format
                    .melt(id_vars='month',
                          var_name='Serotype',
                          value_name='value')
                    .sort_values(['month','Serotype'])
                    .reset_index(drop=True))

sero_ml.rename(columns={'Serotype': 'data_label'}, inplace=True)

sero_ml.tail()

Unnamed: 0,month,data_label,value
210,2025-07-01,DENV1,0
211,2025-07-01,DENV2,11
212,2025-07-01,DENV3,1
213,2025-07-01,DENV4,0
214,2025-07-01,endemic,0


Reformat the signal data

In [43]:
# Check if the time step are complete (all months exists)
nivel_perc.index= pd.to_datetime(nivel_perc.index)
full = pd.date_range(nivel_perc.index.min(), nivel_perc.index.max(), freq='MS')

missing_months = full.difference(nivel_perc.index)
is_complete = len(missing_months) == 0

print(f"Complete monthly series: {is_complete}")
if not is_complete:
    print("Missing months:", missing_months.strftime('%Y-%m').tolist())

Complete monthly series: True


In [44]:
nivel_ml = (nivel_perc.reset_index()  # bring month out of the index and make to long format
                    .melt(id_vars='month',
                          var_name='nivel2',
                          value_name='value')
                    .sort_values(['month','nivel2'])
                    .reset_index(drop=True))

nivel_ml.rename(columns={'nivel2': 'signal_label'}, inplace=True)
nivel_ml['value'] = nivel_ml['value']/100 # Make to decimal

nivel_ml.tail(6)

Unnamed: 0,month,signal_label,value
123,2025-06-01,alarm,0.267742
124,2025-06-01,endemic,0.606452
125,2025-06-01,non_case,0.125806
126,2025-07-01,alarm,0.02957
127,2025-07-01,endemic,0.748656
128,2025-07-01,non_case,0.221774


**Epi-Quark**

In [45]:
cases = sero_ml
signals = nivel_ml
dl = sero_ml['data_label'].unique()
disease_list = np.concatenate([dl, ['non_case']])

In [46]:
cases.head(5)

Unnamed: 0,month,data_label,value
0,2022-01-01,DENV1,13
1,2022-01-01,DENV2,1
2,2022-01-01,DENV3,0
3,2022-01-01,DENV4,0
4,2022-01-01,endemic,0


In [47]:
signals.head(6)

Unnamed: 0,month,signal_label,value
0,2022-01-01,alarm,0.294624
1,2022-01-01,endemic,0.463441
2,2022-01-01,non_case,0.241935
3,2022-02-01,alarm,0.377688
4,2022-02-01,endemic,0.420699
5,2022-02-01,non_case,0.201613


In [48]:
metrics_epi_quark = {
    "precision": score(cases, signals, "precision", 0.2, 0.2),
    "recall": score(cases, signals, "recall", 0.2, 0.2),
    "f1": score(cases, signals, "f1", 0.2, 0.2),
}

epi_quark_df = pd.DataFrame(metrics_epi_quark, index=disease_list)
epi_quark_df = epi_quark_df.round(2)
epi_quark_df


The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.


The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.


invalid value encountered in scalar divide


invalid value encountered in scalar divide


The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.



Unnamed: 0,precision,recall,f1
DENV1,0.75,0.1,0.17
DENV2,0.75,0.18,0.29
DENV3,0.25,0.17,0.2
DENV4,0.0,,0.0
endemic,0.0,,0.0
non_case,0.29,0.67,0.4


**In the example case where there is no non-case (non_case label has 0 occurences in cases and signal dataframes)**

In [82]:
ser_cols = ['DENV1','DENV2','DENV3','DENV4', 'endemic']

df = sero_mw2.copy()
mask = (df[ser_cols] == 0)

# Fill random number between 1 to 10 on where it is 0s on the denv 1,2,3,4 and endemic (therefore non_case is 0)
rng = np.random.default_rng(22)
rand_vals = pd.DataFrame(
    rng.integers(1, 10, size=df[ser_cols].shape),
    index=df.index, columns=ser_cols
)
df.loc[:, ser_cols] = df[ser_cols].where(~mask, rand_vals)

df.head(3)

Serotype,DENV1,DENV2,DENV3,DENV4,endemic
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01,13,1,6,2,9
2022-02-01,58,1,6,1,5
2022-03-01,115,11,3,8,3


In [83]:
test_df = (df.reset_index()  # bring month out of the index, and make into long format
                    .melt(id_vars='month',
                          var_name='Serotype',
                          value_name='value')
                    .sort_values(['month','Serotype'])
                    .reset_index(drop=True))

test_df.rename(columns={'Serotype': 'data_label'}, inplace=True)
test_df.tail()

Unnamed: 0,month,data_label,value
210,2025-07-01,DENV1,4
211,2025-07-01,DENV2,11
212,2025-07-01,DENV3,1
213,2025-07-01,DENV4,5
214,2025-07-01,endemic,2


In [84]:
sig_test = nivel_perc.copy()
# Make another alarm label for a place holder of original non_case values
# This is to keep the total is 100%

sig_test['alarm1'] = sig_test['non_case']
sig_test['non_case'] = 0 #therefore non_case is 0
sig_test.head(3)

nivel2,endemic,non_case,alarm,alarm1
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-01,46.344086,0,29.462366,24.193548
2022-02-01,42.069892,0,37.768817,20.16129
2022-03-01,33.198925,0,59.005376,7.795699


In [85]:
sig_test2 = (sig_test.reset_index()  # bring month out of the index and make to long format
                    .melt(id_vars='month',
                          var_name='nivel2',
                          value_name='value')
                    .sort_values(['month','nivel2'])
                    .reset_index(drop=True))

sig_test2.rename(columns={'nivel2': 'signal_label'}, inplace=True)
sig_test2['value'] = sig_test2['value']/100 # Make to decimal

sig_test2.tail(6)

Unnamed: 0,month,signal_label,value
166,2025-06-01,endemic,0.606452
167,2025-06-01,non_case,0.0
168,2025-07-01,alarm,0.02957
169,2025-07-01,alarm1,0.221774
170,2025-07-01,endemic,0.748656
171,2025-07-01,non_case,0.0


In [86]:
cases2 = test_df
signals2 = sig_test2
dl2 = cases2['data_label'].unique()
disease_list = np.concatenate([dl2, ['non_case']])

In [87]:
print('Check if cases dataframe contains all non-zero values (therefore non_case is 0) : ', np.all(cases2['value'] > 0))
print('Check if signal data, label "non_case", contains all zero values : ', np.all((signals2[signals2['signal_label'] == 'non_case']['value']) == 0.0))

Check if cases dataframe contains all non-zero values (therefore non_case is 0) :  True
Check if signal data, label "non_case", contains all zero values :  True


In [88]:
# Data from Jan-22 to July-25, therefore 43 months
# The cases dataframe should have 5 labels (i.e. DENV1,2,3,4, endemic)
# Therefore should exists 5 x 43 data points

print('The length of cases dataset is as expected:', len(cases2)==5*43)

# Data from Jan-22 to July-25, therefore 43 months
# The cases dataframe should have 4 labels (i.e. alarm, alarm1 endemic, non_case)
# Therefore should exists 4 x 43 data points

print('The length of cases dataset is as expected:', len(signals2)==4*43)

The length of cases dataset is as expected: True
The length of cases dataset is as expected: True


In [89]:
# When both signal and cases have non-zero of non-case, the evaluator crash
metrics_epi_quark2 = {
    "precision": score(cases2, signals2, "precision", 0.2, 0.2),
    "recall": score(cases2, signals2, "recall", 0.2, 0.2),
    "f1": score(cases2, signals2, "f1", 0.2, 0.2),
}

epi_quark_df2 = pd.DataFrame(metrics_epi_quark2, index=disease_list2)
epi_quark_df2 = epi_quark_df2.round(2)
epi_quark_df2


The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



ValueError: not enough values to unpack (expected 4, got 1)