<a href="https://colab.research.google.com/github/Uli-Z/ESICMDatathon2026/blob/main/ExtractingData_20260114.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-requisites for Amsterdam UMC DB


In [3]:
# Sets *your* own Google Cloud project id. It is exported as GOOGLE_CLOUD_PROJECT
# and passed to the `%%bigquery --project` magics and the BigQuery client below.
PROJECT_ID = "datathon-484308" #@param {type:"string"}


In [4]:
# Sets the default dataset for AmsterdamUMCdb: the hosting project and the dataset
# inside it are combined into "<project>.<dataset>" for the default query config.
# NOTE(review): LOCATION is not referenced by any cell visible in this notebook.
DATASET_PROJECT_ID = 'amsterdamumcdb' #@param {type:"string"}
DATASET_ID = 'van_gogh_2026_datathon' #@param {type:"string"}
LOCATION = 'eu' #@param {type:"string"}

In [5]:
import os
from google.colab import auth

# All Google Cloud libraries check this environment variable, so set it
# before any client is created:
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

# Authenticate the current Colab user (google.colab.auth).
auth.authenticate_user()
print('Authenticated')


Authenticated


In [6]:
%load_ext google.colab.data_table
from google.colab.data_table import DataTable

# Change the default display limits of Colab's DataTable
# (maximum number of columns and rows):
DataTable.max_columns = 50
DataTable.max_rows = 50000


In [7]:
from google.cloud.bigquery import magics
from google.cloud import bigquery

# Sets the default query job configuration for the `%%bigquery` magics:
# the default dataset is "<DATASET_PROJECT_ID>.<DATASET_ID>", so the queries
# below can use unqualified table names (measurement, concept, ...).
def_config = bigquery.job.QueryJobConfig(default_dataset=DATASET_PROJECT_ID + "." + DATASET_ID)
magics.context.default_query_job_config = def_config


In [8]:
import pandas as pd
import numpy as np

# `plt` must be the pyplot interface: the top-level `matplotlib` package has no
# plt.plot / plt.subplots / plt.show, so `import matplotlib as plt` breaks the
# first plotting call.
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's dark grid as the default figure style for the notebook.
sns.set_style('darkgrid')

# All possible ventilator patients
This selection defines the person ids used as the inclusion criterion for all subsequent extractions; it is repeated in each query below as the `vlist` CTE.

In [None]:
%%bigquery ventpatlist --project $PROJECT_ID
-- Cohort: every person with at least one measurement whose concept name
-- contains 'ventila'.
SELECT DISTINCT person_id
FROM measurement AS m
INNER JOIN concept AS c
  ON m.measurement_concept_id = c.concept_id
WHERE concept_name LIKE '%ventila%'

# Death

In [None]:
%%bigquery died --project $PROJECT_ID
-- Death timestamps for the ventilated cohort (vlist).
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
)
SELECT
  person_id,
  death_datetime
FROM death
WHERE person_id IN (SELECT person_id FROM vlist)
ORDER BY person_id, death_datetime

In [None]:
# Show the death records returned by the query above.
died

# Age
Only the year of birth is available, so the age has to be calculated later.

In [None]:
%%bigquery age --project $PROJECT_ID
-- Gender concept and year of birth for the ventilated cohort (vlist).
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
)
SELECT
  person_id,
  gender_concept_id,
  year_of_birth
FROM person
WHERE person_id IN (SELECT person_id FROM vlist)


In [None]:
# Summary statistics of the numeric columns in the demographics frame.
age.describe()

# BMI

In [None]:
%%bigquery BMI --project $PROJECT_ID
-- Body mass index measurements over time for the ventilated cohort (vlist).
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
)
SELECT
  person_id,
  measurement_datetime,
  value_as_number AS bmi
FROM measurement
WHERE measurement_concept_id = 3038553  -- Body mass index (BMI) [Ratio]
  AND person_id IN (SELECT person_id FROM vlist)
ORDER BY person_id, measurement_datetime

In [None]:
# Show the BMI measurements returned by the query above.
BMI

# Diagnosis: surgical or non surgical
All entries in `condition_occurrence` are admission diagnoses. They are uncoded, so a free-text search is necessary.


In [None]:
%%bigquery surg --project $PROJECT_ID
-- Admission diagnoses of the ventilated cohort, flagged as surgical or
-- non-surgical by a free-text search on condition_source_value, then pivoted
-- into one count column per flag.
with vlist as (
  select distinct person_id
  from measurement m inner join concept c on m.measurement_concept_id = c.concept_id
  where concept_name like '%ventila%'
), surg as (
  select person_id, condition_start_datetime, condition_source_value
    -- every row that passes the filter below and is not 'non...surg' counts as surgical
    , case when lower(condition_source_value) like '%non%surg%' then 'non-surgical' else 'surgical' end as condition_type
  from condition_occurrence
  -- lower-cased like the CASE above, so capitalised entries ('Surgical ...')
  -- are not silently dropped; matches surgical and non surgical
  where lower(condition_source_value) like '%surg%'
    and person_id in (select person_id from vlist)
)
select * from surg
PIVOT ( count(*) for condition_type in ('surgical', 'non-surgical'))
order by person_id, condition_start_datetime

In [None]:
# Preview the first 20 pivoted diagnosis rows.
surg.head(20)

In [None]:
# Count distinct patients per admission-diagnosis text and keep the ten most frequent.
patients_per_condition = surg.groupby('condition_source_value')['person_id'].nunique()
top_10_conditions = patients_per_condition.nlargest(10)
print(top_10_conditions)

# Heart rate, SpO2, Temp, GCS, RASS, cardiac index and cardiac output

In [None]:
%%bigquery hf_ekg --project $PROJECT_ID
-- Monitoring values for the ventilated cohort, one row per person and timestamp
-- with one column per measurement type (max value if several share a timestamp).
with vlist as (
  select distinct person_id
  from measurement m inner join concept c on m.measurement_concept_id = c.concept_id
  where concept_name like '%ventila%'
), hf as (
  select person_id, measurement_datetime
    -- short labels become the pivoted column names
    , case measurement_concept_id
        when 21490872 then 'hf_ekg'
        when 40762499 then 'SpO2'
        when 3020891 then 'temp'
        when 3007194 then 'GCS'
        when 2000000016 then 'RASS'
        when 21490712 then 'LVCI'
        when 3005555 then 'LVCO'
      end as measurement_type
    , value_as_number
  from measurement m
  where person_id in (select person_id from vlist)
    and measurement_concept_id in (
        21490872 -- Heart rate.beat-to-beat by EKG
      , 40762499 -- Oxygen saturation in Arterial blood by Pulse oximetry
      , 3020891 -- Body temperature
      , 3007194 -- Glasgow coma score total
      , 2000000016 -- Richmond agitation-sedation scale score
      , 21490712 -- Left ventricular Cardiac index
      , 3005555 -- Left ventricular Cardiac output
    )
)
select * from hf
PIVOT ( max(value_as_number) for measurement_type in ('hf_ekg', 'SpO2', 'temp', 'GCS', 'RASS', 'LVCI', 'LVCO'))
order by person_id, measurement_datetime

In [None]:
# Spot-check the pivoted monitoring values of a single patient.
hf_ekg[hf_ekg['person_id']==65396]  # random check


# Arterial blood pressure

In [None]:
%%bigquery bp --project $PROJECT_ID
-- Arterial blood pressure (systolic / diastolic / mean) and mean central venous
-- pressure for the ventilated cohort, pivoted to one column per type.
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
),
bp AS (
  SELECT
    person_id,
    measurement_datetime,
    value_as_number,
    CASE measurement_concept_id
      WHEN 21490853 THEN 'ABPs'
      WHEN 21490851 THEN 'ABPd'
      WHEN 21490852 THEN 'ABPm'
      WHEN 3000333 THEN 'CVPm'
    END AS measure_type
  FROM measurement AS m
  WHERE person_id IN (SELECT person_id FROM vlist)
    AND measurement_concept_id IN (
      21490851,  -- ABP d
      21490852,  -- ABP m
      21490853,  -- ABPs
      3000333    -- CVP m
    )
)
SELECT * FROM bp
PIVOT (MAX(value_as_number) FOR measure_type IN ('ABPs', 'ABPd', 'ABPm', 'CVPm'))
ORDER BY person_id, measurement_datetime

In [None]:
# Keep only rows that carry a positive mean central venous pressure.
bp[bp['CVPm']>0]  # plausibility checks

# Breath rate or respiratory rate

Breath rate from the ventilator and respiratory rate from the monitor get mixed up over the years of data recording.

In [None]:
%%bigquery rr --project $PROJECT_ID
-- Breath / respiratory rates for the ventilated cohort, pivoted to one column
-- per source (ventilator mechanical, ventilator spontaneous, spontaneous, monitor).
with vlist as (
  select distinct person_id
  from measurement m inner join concept c on m.measurement_concept_id = c.concept_id
  where concept_name like '%ventila%'
)
, rr as (
  select person_id, measurement_datetime, value_as_number
    , case measurement_concept_id
        when 3043148 then 'BRvent'
        when 3026892 then 'BRspontv'
        when 1175625 then 'BRspont'
        when 3024171 then 'RR'
      end as measure_type
  from measurement m
  where person_id in (select person_id from vlist)
    and measurement_concept_id in (
        3043148 -- Breath rate mechanical
      , 3026892 -- Breath rate spontaneous --on ventilator
      , 1175625 -- Breath rate spontaneous
      , 3024171 -- Respiratory rate
    )
)
select * from rr
PIVOT ( max(value_as_number) for measure_type in ('BRvent', 'BRspontv','BRspont', 'RR'))
order by person_id, measurement_datetime

In [None]:
# Ad-hoc checks on the respiratory-rate frame; only the last line is active,
# the others are kept as alternatives to switch between.
# rr[rr['BRvent']>90] # plausibility check
# rr[rr['person_id']==23444] # testing purpose, random person

# rr[rr['BRspont']>90] # plausibility check
rr[rr['person_id']==39444] # testing purpose, random person


# Blood gas analysis (BGA)

In [None]:
%%bigquery bga --project $PROJECT_ID
-- Blood gas values for the ventilated cohort, pivoted to one column per analyte.
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
),
br AS (
  SELECT
    person_id,
    measurement_datetime,
    CASE
      WHEN measurement_concept_id = 3010421 THEN 'pH'
      WHEN measurement_concept_id IN (3027315, 3027801) THEN 'PaO2'
      WHEN measurement_concept_id IN (3027946, 3013290) THEN 'PaCO2'
      WHEN measurement_concept_id IN (3006576, 3008152) THEN 'HCO3'
      WHEN measurement_concept_id IN (3012501, 3003396) THEN 'BE'
      WHEN measurement_concept_id IN (3024928, 1616373) THEN 'SvO2'
    END AS concept_name,
    value_as_number
  FROM measurement AS m
  WHERE measurement_concept_id IN (
      3010421,  -- pH of Blood
      3027315,  -- Oxygen [Partial pressure] in Blood
      3027801,  -- Oxygen [Partial pressure] in Arterial blood
      3013290,  -- Carbon dioxide [Partial pressure] in Blood
      3027946,  -- Carbon dioxide [Partial pressure] in Arterial blood
      3006576,  -- Bicarbonate [Moles/volume] in Blood
      3008152,  -- Bicarbonate [Moles/volume] in Arterial blood
      3012501,  -- Base excess in Blood by calculation
      3003396,  -- Base excess in Arterial blood by calculation
      3024928,  -- Oxygen saturation in Venous blood
      1616373   -- Oxygen saturation in Central venous blood
    )
    AND value_as_number IS NOT NULL
    -- There are PaO2 entries that are converted to mmHg, but more values are in kPa.
    -- Ignoring those that are mmHg? For the two PaO2 concepts only kPa rows are
    -- kept; all other concepts pass regardless of unit.
    AND (
      measurement_concept_id NOT IN (3027315, 3027801)
      OR unit_source_value = 'kPa'
    )
    AND person_id IN (SELECT person_id FROM vlist)
)
SELECT * FROM br
PIVOT (MAX(value_as_number) FOR concept_name IN ('pH', 'PaO2', 'PaCO2', 'HCO3', 'BE', 'SvO2'))
ORDER BY person_id, measurement_datetime

In [None]:
# Summary statistics per blood gas column, as a quick range check.
bga.describe()

# PEEP, TV, MV, Lung compliance, Resistance

In [None]:
%%bigquery peep --project $PROJECT_ID
-- Ventilator settings and respiratory mechanics for the ventilated cohort,
-- pivoted to one column per parameter.
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
),
vent AS (
  SELECT
    person_id,
    measurement_datetime,
    CASE
      WHEN measurement_concept_id = 3012410 THEN 'TV'
      WHEN measurement_concept_id = 36303816 THEN 'TVinsp'
      WHEN measurement_concept_id = 2000000222 THEN 'MVspont'
      WHEN measurement_concept_id = 3045410 THEN 'MVset'
      WHEN measurement_concept_id IN (3022875, 3016226) THEN 'PEEP'
      WHEN measurement_concept_id = 3035822 THEN 'IntrinsicPEEP'
      WHEN measurement_concept_id = 2000000211 THEN 'IPEEP'
      WHEN measurement_concept_id = 21490581 THEN 'COMPL'
      WHEN measurement_concept_id = 21490740 THEN 'RESIS'
    END AS concept_name,
    value_as_number
  FROM measurement AS m
  WHERE measurement_concept_id IN (
      3012410,     -- Tidal volume setting Ventilator
      36303816,    -- Tidal volume.inspired
      3022875,     -- Positive end expiratory pressure setting Ventilator
      3016226,     -- PEEP Respiratory system
      2000000222,  -- Minute volume spontaneous
      3045410,     -- Minute volume setting Ventilator
      3035822,     -- Intrinsic PEEP Respiratory system
      2000000211,  -- Inspiratory Pressure Above PEEP
      21490581,    -- Lung compliance
      21490740     -- Airway resistance
    )
    AND value_as_number IS NOT NULL
    AND person_id IN (SELECT person_id FROM vlist)
)
SELECT * FROM vent
PIVOT (
  MAX(value_as_number) FOR concept_name IN (
    'MVspont', 'MVset', 'TV', 'TVinsp',
    'PEEP', 'IntrinsicPEEP', 'IPEEP', 'COMPL', 'RESIS'
  )
)
ORDER BY person_id, measurement_datetime

In [None]:
# Keep only rows that carry a positive airway resistance value.
peep[peep['RESIS']>0]   # random eyeball check


# FiO2

FiO2 exists for NIV too.

In [None]:
%%bigquery FiO2 --project $PROJECT_ID
-- Inspired oxygen fraction for the ventilated cohort, with separate columns for
-- the ventilator measurement and the NIV value.
with vlist as (
  select distinct person_id
  from measurement m inner join concept c on m.measurement_concept_id = c.concept_id
  where concept_name like '%ventila%'
),
fio2 as (
  select person_id, measurement_datetime
    -- 2000000204 is mapped here, but the filter below currently excludes it;
    -- that branch only takes effect if the commented-out id is re-enabled.
    , case when measurement_concept_id in ( 3025408, 2000000204) then 'FiO2'
           when measurement_concept_id = 2000000203 then 'FiO2niv'
      end as concept_name
    , value_as_number
  from measurement m
  where measurement_concept_id in (
        2000000203 -- FiO2 NIV
      , 3025408 -- Oxygen/Inspired gas Respiratory system by O2 Analyzer --on ventilator
      --, 2000000204 -- fio2 setting
    )
    and person_id in (select person_id from vlist)
    and value_as_number > 0
)
select * from fio2
PIVOT (max(value_as_number) for concept_name in ('FiO2', 'FiO2niv'))
order by person_id, measurement_datetime

In [None]:
# Show the FiO2 values returned by the query above.
FiO2

# Labs: creatinine, bilirubin, hemoglobin and co.


In [None]:
%%bigquery lab --project $PROJECT_ID
-- Laboratory values for the ventilated cohort, pivoted to one column per analyte.
-- Only strictly positive values are kept.
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
),
lab_values AS (
  SELECT
    person_id,
    measurement_datetime,
    CASE
      WHEN measurement_concept_id = 3020564 THEN 'Cr'
      WHEN measurement_concept_id IN (3043995, 3005772) THEN 'Br'
      WHEN measurement_concept_id = 40762351 THEN 'Hb'
      WHEN measurement_concept_id = 3007461 THEN 'Plt'
      WHEN measurement_concept_id = 3010813 THEN 'wbc'
      WHEN measurement_concept_id IN (3000285, 3019550) THEN 'Na'
      WHEN measurement_concept_id = 3020779 THEN 'Urea'
      WHEN measurement_concept_id = 3024561 THEN 'Alb'
      WHEN measurement_concept_id = 3009542 THEN 'Hct'
      WHEN measurement_concept_id = 3005456 THEN 'K'
      WHEN measurement_concept_id IN (3047181, 3018405) THEN 'Lactate'
    END AS concept_name,
    value_as_number
  FROM measurement AS m
  WHERE measurement_concept_id IN (
      3020564,   -- Creatinine [Moles/volume] in Serum or Plasma
      3005772,   -- Bilirubin.conjugated [Moles/volume] in Serum or Plasma
      3043995,   -- Bilirubin.conjugated+indirect [Moles/volume] in Serum or Plasma
      40762351,  -- Hemoglobin [Moles/volume] in Blood
      3007461,   -- Platelets [#/volume] in Blood
      3010813,   -- Leukocytes [#/volume] in Blood
      3000285,   -- Sodium [Moles/volume] in Blood
      3019550,   -- Sodium [Moles/volume] in Serum or Plasma
      3020779,   -- Urea [Moles/volume] in Serum or Plasma
      3024561,   -- Albumin [Mass/volume] in Serum or Plasma
      3009542,   -- Hematocrit [Volume Fraction] of Blood by calculation
      3005456,   -- Potassium [Moles/volume] in Blood
      3047181,   -- Lactate [Moles/volume] in Blood
      3018405    -- Lactate [Moles/volume] in Arterial blood
    )
    AND person_id IN (SELECT person_id FROM vlist)
    AND value_as_number > 0
)
SELECT * FROM lab_values
PIVOT (
  MAX(value_as_number) FOR concept_name IN (
    'Cr', 'Br', 'Hb', 'Plt', 'wbc', 'Na',
    'Urea', 'Alb', 'Hct', 'K', 'Lactate'
  )
)
ORDER BY person_id, measurement_datetime


In [None]:
# Summary statistics per lab column, as a quick range check.
lab.describe()

# Vasopressors



In [None]:
%%bigquery vp --project $PROJECT_ID
-- Vasopressor / inotrope exposures for the ventilated cohort: one row per person
-- and exposure interval, one quantity column per drug.
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement AS m
  INNER JOIN concept AS c
    ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
),
pressors AS (
  SELECT
    person_id,
    drug_exposure_start_datetime,
    drug_exposure_end_datetime,
    CASE drug_concept_id
      WHEN 19006871 THEN 'argipressin'
      WHEN 1321341 THEN 'norepinephrine'
      WHEN 1337720 THEN 'dobutamine'
      WHEN 1337860 THEN 'dopamine'
    END AS concept_name,
    quantity
  FROM drug_exposure
  WHERE drug_concept_id IN (
      19006871,  -- argipressin
      1321341,   -- norepinephrine
      1337720,   -- dobutamine
      1337860    -- dopamine
    )
    AND person_id IN (SELECT person_id FROM vlist)
)
SELECT * FROM pressors
PIVOT (MAX(quantity) FOR concept_name IN ('argipressin', 'norepinephrine', 'dobutamine', 'dopamine'))
ORDER BY person_id, drug_exposure_start_datetime, drug_exposure_end_datetime

In [None]:
# Show the vasopressor exposures returned by the query above.
vp

# Fluid balance

In [9]:
import pandas as pd
import gc

# BigQuery client for PROJECT_ID; unqualified table names in the query below
# resolve against "<DATASET_PROJECT_ID>.<DATASET_ID>" via `default_dataset`.
dataset_ref = f"{DATASET_PROJECT_ID}.{DATASET_ID}"
client = bigquery.Client(project=PROJECT_ID)
job_config = bigquery.QueryJobConfig(default_dataset=dataset_ref)

# --- 1. Parameters & Configuration ---
# Optional date bounds to limit the data for testing (Format: YYYY-MM-DD).
# Each bound is applied on its own: START_DATE is inclusive, END_DATE is exclusive.
# Leave both as empty string "" to fetch ALL data (Warning: High Memory Usage).
START_DATE = "" #@param {type:"string"}
END_DATE = ""   #@param {type:"string"}

# measurement_concept_id -> (direction, column label) for every fluid concept.
FLUID_CONFIG = {
    3037253: ('Input',  'In: IV'),
    3010494: ('Input',  'In: Enteral'),
    3006552: ('Input',  'In: Oral'),
    3014315: ('Output', 'Out: Urine'),
    3011087: ('Output', 'Out: Stool'),
    3026556: ('Output', 'Out: Chest Tube'),
    3018767: ('Output', 'Out: Ventricle'),
    21491183: ('Output', 'Out: GI Drain'),
    3020433: ('Output', 'Out: Misc')
}

fluid_ids_list = ", ".join(map(str, FLUID_CONFIG.keys()))

# --- 2. Dynamic SQL Construction ---
# Build one predicate per bound that is actually set. Previously a single bound
# was silently ignored (and reported as "no filter") unless both were given.
date_filter_parts = []
for param_name, raw_value, comparison in (
    ("START_DATE", START_DATE, ">="),
    ("END_DATE", END_DATE, "<"),
):
    if not raw_value:
        continue
    # Parse before interpolating: this rejects typos and any text that is not a
    # date, and only the normalised timestamp ever reaches the SQL string.
    try:
        parsed_bound = pd.Timestamp(raw_value)
    except (ValueError, TypeError) as exc:
        raise ValueError(
            f"{param_name}={raw_value!r} is not a valid date (expected YYYY-MM-DD)"
        ) from exc
    if pd.isna(parsed_bound):
        raise ValueError(f"{param_name}={raw_value!r} is not a valid date (expected YYYY-MM-DD)")
    date_filter_parts.append(
        f"AND m.measurement_datetime {comparison} '{parsed_bound:%Y-%m-%d %H:%M:%S}'"
    )

date_filter_sql = "\n  ".join(date_filter_parts)
if date_filter_parts:
    print(f"Applying date filter: {START_DATE or '(open)'} to {END_DATE or '(open)'}")
else:
    print("WARNING: No date filter set. Fetching ALL data.")

# Raw fluid measurements of the ventilated cohort, attributed to the visit whose
# start/end window contains the measurement timestamp.
query_text = f"""
WITH vlist AS (
  SELECT DISTINCT person_id
  FROM measurement
  JOIN concept ON measurement_concept_id = concept_id
  WHERE concept_name LIKE '%ventila%'
)
SELECT
  m.person_id,
  v.visit_occurrence_id,
  m.measurement_datetime,
  m.measurement_concept_id,
  CAST(m.value_as_number AS INT64) as value_ml
FROM measurement m
JOIN visit_occurrence v
  ON m.person_id = v.person_id
  AND m.measurement_datetime >= v.visit_start_datetime
  AND (v.visit_end_datetime IS NULL OR m.measurement_datetime <= v.visit_end_datetime)
WHERE m.measurement_concept_id IN ({fluid_ids_list})
  AND m.value_as_number IS NOT NULL
  AND m.person_id IN (SELECT person_id FROM vlist)
  -- dynamic date filter (empty when no bound is set)
  {date_filter_sql}
"""

print("Fetching raw fluid data...")
fluid_raw = client.query(query_text, job_config=job_config).to_dataframe()

Fetching raw fluid data...


In [1]:
# --- 3. Safety Check & Processing ---
# Turns the raw per-measurement rows in `fluid_raw` into one row per
# (person, visit, report day) with total input, total output, daily balance and
# the running balance per visit.
# NOTE: `fluid_raw` is deleted below to free memory, so the fetch cell must be
# re-run before this cell can be executed again.
if fluid_raw.empty:
    print("WARNING: No fluid data found for this period/cohort.")
    fluid_balance = pd.DataFrame()
else:
    # Take over the frame instead of copying it: the old name is dropped right
    # away, so a copy would only double peak memory.
    df_f = fluid_raw
    del fluid_raw
    gc.collect()

    df_f['measurement_datetime'] = pd.to_datetime(df_f['measurement_datetime'])

    # 07:00 AM Logic: shift back 7 hours so a "day" runs 07:00 -> 07:00, then
    # label it with the following calendar date (the day the period ends).
    df_f['report_date'] = (df_f['measurement_datetime'] - pd.Timedelta(hours=7)).dt.date + pd.Timedelta(days=1)
    df_f['label'] = df_f['measurement_concept_id'].map(lambda x: FLUID_CONFIG[x][1]).astype('category')

    # Aggregation
    # observed=True is essential with a categorical `label`: without it, pandas
    # versions whose default is observed=False build the cartesian product of
    # all person/visit/date/label values before discarding the empty
    # combinations, which can exhaust memory.
    fluid_balance = df_f.pivot_table(
        index=['person_id', 'visit_occurrence_id', 'report_date'],
        columns='label',
        values='value_ml',
        aggfunc='sum',
        fill_value=0,
        observed=True
    ).reset_index()

    del df_f
    gc.collect()

    # Totals & Cleanup: only labels that actually occur in the data have a column.
    in_cols  = [lbl for _, (cat, lbl) in FLUID_CONFIG.items() if cat == 'Input' and lbl in fluid_balance.columns]
    out_cols = [lbl for _, (cat, lbl) in FLUID_CONFIG.items() if cat == 'Output' and lbl in fluid_balance.columns]

    fluid_balance['total_input']   = fluid_balance[in_cols].sum(axis=1).astype('int32')
    fluid_balance['total_output']  = fluid_balance[out_cols].sum(axis=1).astype('int32')
    fluid_balance['daily_balance'] = (fluid_balance['total_input'] - fluid_balance['total_output']).astype('int32')

    # Running balance within each visit; relies on the pivot output being sorted
    # by (person_id, visit_occurrence_id, report_date).
    fluid_balance['cumulative_balance'] = (
        fluid_balance.groupby(['person_id', 'visit_occurrence_id'])['daily_balance'].cumsum().astype('int32')
    )

    # Drop the per-label columns by rebinding rather than mutating in place.
    fluid_balance = fluid_balance.drop(columns=in_cols + out_cols)
    gc.collect()

    print(f"Success. Calculated balances for {fluid_balance['visit_occurrence_id'].nunique()} visits.")
    display(fluid_balance.head())

NameError: name 'fluid_raw' is not defined

In [17]:
# @title Fluid Balance Extraction (Daily Aggregation)
import pandas as pd
from google.cloud import bigquery

# --- 1. Client & Dataset Setup ---
# Initialize BigQuery client using project and dataset variables defined globally
client = bigquery.Client(project=PROJECT_ID)
dataset_ref = f"{DATASET_PROJECT_ID}.{DATASET_ID}"
job_config = bigquery.QueryJobConfig(default_dataset=dataset_ref)

print(f"Fluid Balance: Client configured for dataset '{dataset_ref}'")


# --- 2. Concept ID Configuration ---
# Define which concepts constitute Input (Intake) vs. Output (Losses)
INPUT_IDS = [
    3037253, # Intravascular
    3010494, # Enteral
    3006552  # Oral
]

OUTPUT_IDS = [
    3014315, # Urine
    3011087, # Stool
    3026556, # Chest Tube
    3018767, # Ventricle Drain
    21491183, # GI Drain
    3020433  # Misc
]

# --- 3. Query Construction ---
# Format lists for SQL IN clauses
input_sql_list = ", ".join(map(str, INPUT_IDS))
output_sql_list = ", ".join(map(str, OUTPUT_IDS))
all_ids_sql_list = ", ".join(map(str, INPUT_IDS + OUTPUT_IDS))

# --- 4. SQL Execution ---
# Logic: Aggregates fluid data by Person, Visit, and Calendar Date
# (midnight-to-midnight via DATE(), not a 07:00 reporting day).
# Calculates Daily Balance (Input - Output) and Cumulative Balance per Visit.
# The balances are computed from the unrounded sums and cast afterwards, so
# daily_balance can differ by 1 from (total_input - total_output) as displayed.
query_text = f"""
WITH vlist AS (
  -- Identify target cohort (ventilated patients)
  SELECT DISTINCT person_id
  FROM measurement m
  JOIN concept c ON m.measurement_concept_id = c.concept_id
  WHERE concept_name LIKE '%ventila%'
),
labeled_data AS (
  SELECT
    m.person_id,
    v.visit_occurrence_id,
    m.measurement_datetime,
    m.value_as_number,
    -- Extract Calendar Date from timestamp
    DATE(m.measurement_datetime) as report_date,
    -- Classify measurements based on configured IDs
    CASE
      WHEN m.measurement_concept_id IN ({input_sql_list}) THEN 'Input'
      WHEN m.measurement_concept_id IN ({output_sql_list}) THEN 'Output'
      ELSE NULL
    END as category
  FROM measurement m
  -- Attribute each measurement to the visit whose time window contains it
  JOIN visit_occurrence v
    ON m.person_id = v.person_id
    AND m.measurement_datetime >= v.visit_start_datetime
    AND (v.visit_end_datetime IS NULL OR m.measurement_datetime <= v.visit_end_datetime)
  WHERE m.measurement_concept_id IN ({all_ids_sql_list})
  -- Same NULL filter as the raw fluid fetch: a day whose only rows have NULL
  -- values would otherwise yield NULL totals and a NULL balance.
  AND m.value_as_number IS NOT NULL
  AND m.person_id IN (SELECT person_id FROM vlist)
),
daily_agg AS (
  SELECT
    person_id,
    visit_occurrence_id,
    report_date,
    SUM(CASE WHEN category = 'Input' THEN value_as_number ELSE 0 END) as total_input,
    SUM(CASE WHEN category = 'Output' THEN value_as_number ELSE 0 END) as total_output
  FROM labeled_data
  GROUP BY person_id, visit_occurrence_id, report_date
)
SELECT
  person_id,
  visit_occurrence_id,
  report_date,
  CAST(total_input AS INT64) as total_input,
  CAST(total_output AS INT64) as total_output,
  CAST((total_input - total_output) AS INT64) as daily_balance,
  -- Calculate Cumulative Sum partitioned by Visit
  CAST(SUM(total_input - total_output) OVER (
    PARTITION BY person_id, visit_occurrence_id
    ORDER BY report_date
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS INT64) as cumulative_balance
FROM daily_agg
-- visit_occurrence_id is part of the sort key so the rows of one visit stay
-- together and the order is deterministic when visits share a date.
ORDER BY person_id, visit_occurrence_id, report_date
"""

print("Executing BigQuery extraction...")
fluid_balance = client.query(query_text, job_config=job_config).to_dataframe()

# --- 5. Result Processing ---
if fluid_balance.empty:
    print("WARNING: No data returned for the specified criteria.")
else:
    # Ensure correct datetime format
    fluid_balance['report_date'] = pd.to_datetime(fluid_balance['report_date'])

    print(f"Extraction complete. Retrieved {len(fluid_balance)} records.")
    display(fluid_balance.head())

Fluid Balance: Client configured for dataset 'amsterdamumcdb.van_gogh_2026_datathon'
Executing BigQuery extraction...
Extraction complete. Retrieved 132589 records.


Unnamed: 0,person_id,visit_occurrence_id,report_date,total_input,total_output,daily_balance,cumulative_balance
0,4,27285,2017-01-01,2904,2115,789,789
1,4,27285,2017-01-02,317,630,-314,475
2,8,34050,2017-01-01,2672,2425,247,247
3,8,34050,2017-01-02,3451,2535,916,1162
4,8,34050,2017-01-03,2987,1675,1312,2474


# Merge and do your machine learning