In [1]:
import polars as pl
import matplotlib.pyplot as plt

from common.constants.column_types import CPZP_SCHEMA, OZP_SCHEMA
from common.constants.column_names import SHARED_COLUMNS, OZP_COLUMNS

# Display limits for polars tables printed in this notebook:
# at most 6 columns and 20 rows per rendered table.
pl.Config.set_tbl_cols(6)
pl.Config.set_tbl_rows(20)


def read_preskladane_data(file_path: str, schema: pl.Schema) -> pl.DataFrame:
    """Read one CSV file into a DataFrame using an explicit schema.

    Args:
        file_path: Location of the CSV file to read.
        schema: Schema handed to ``pl.read_csv`` unchanged.

    Returns:
        The parsed DataFrame; cells equal to "NA" or the empty string are
        read as null.
    """
    csv_options = {"schema": schema, "null_values": ["NA", ""]}
    return pl.read_csv(file_path, **csv_options)


# Load both insurer exports (CPZP first, then OZP), each with its own schema.
cpzp_df, ozp_df = (
    read_preskladane_data(csv_path, csv_schema)
    for csv_path, csv_schema in (
        ("./DATACON_data/CPZP_preskladane.csv", CPZP_SCHEMA),
        ("./DATACON_data/OZP_preskladane.csv", OZP_SCHEMA),
    )
)

In [3]:
# Distinct insured persons per insurer: first over all events, then only over
# rows whose event type is "vakcinace".
# Series.n_unique() counts distinct values directly instead of materialising
# the de-duplicated Series just to take its length; like len(unique()), it
# counts a null as one distinct value, so the printed numbers are unchanged.
insurer_frames = {"CPZP": cpzp_df, "OZP": ozp_df}

for insurer_label, insurer_frame in insurer_frames.items():
    print(f"{insurer_label}: {insurer_frame['Id_pojistence'].n_unique()}")


is_vaccination = pl.col("Typ_udalosti") == "vakcinace"

for insurer_label, insurer_frame in insurer_frames.items():
    vaccinated_ids = insurer_frame.filter(is_vaccination)["Id_pojistence"]
    print(f"{insurer_label}: {vaccinated_ids.n_unique()}")

CPZP: 1570780
OZP: 977485
CPZP: 765972
OZP: 502908


In [2]:
def split_vax_or_prescription(df: pl.DataFrame) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Split ``df`` into vaccination/prescription rows and all remaining rows.

    Args:
        df: Frame with a ``Typ_udalosti`` column.

    Returns:
        A ``(matching, remaining)`` pair. ``matching`` holds the rows whose
        ``Typ_udalosti`` is "vakcinace" or "předpis"; ``remaining`` holds every
        other row, including rows where ``Typ_udalosti`` is null. Together the
        two frames contain each row of ``df`` exactly once.
    """
    event_type = pl.col("Typ_udalosti")
    # ``==`` against a null yields null and ``filter`` drops rows with a null
    # predicate. Mapping null to False makes the mask and its negation exact
    # complements, so null event types land in the "remaining" frame.
    is_vax_or_prescription = (
        (event_type == "vakcinace") | (event_type == "předpis")
    ).fill_null(False)
    return df.filter(is_vax_or_prescription), df.filter(~is_vax_or_prescription)


# Sanity check per insurer (CPZP, then OZP): print the total row count and the
# size of both parts, and verify that the parts add up to the whole frame.
# After the loop the two result names hold the OZP split.
for insurer_df in (cpzp_df, ozp_df):
    vax_or_prescription, non_vax_and_non_prescription = split_vax_or_prescription(
        insurer_df
    )

    print(len(insurer_df))
    print(len(vax_or_prescription))
    print(len(non_vax_and_non_prescription))
    assert len(vax_or_prescription) + len(non_vax_and_non_prescription) == len(
        insurer_df
    ), "vaccination/prescription split does not cover every row exactly once"

5478462
4847962
630500
3642969
2871013
771956
