In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd

from src.idspy.data.split import random_split
from src.idspy.data.tabular_data import TabularData, TabularSchema
from src.idspy.data.tabular_repository import TabularDataRepository
from src.idspy.services.logger import setup_logging

# Configure logging through the project's helper before the first message is
# emitted; what it installs (handlers, level, format) is defined in
# src.idspy.services.logger, not in this notebook.
setup_logging()
# Logger used by every cell below, named after the running module.
logger = logging.getLogger(__name__)

In [2]:
# Synthetic demo dataset. The generator is seeded so every run draws the same
# values.
rng = np.random.default_rng(42)
n = 12

# Dict values are evaluated top to bottom, so the generator is consumed in the
# order age -> income -> city -> label; reordering the entries would change
# the generated data.
df = (
    pd.DataFrame(
        {
            "id": range(1, n + 1),
            "age": rng.integers(18, 70, size=n),
            "income": rng.normal(30000, 8000, size=n).round(0),
            "city": rng.choice(["Roma", "Milano", "Torino"], size=n),
            "label": rng.choice([0, 1], size=n, p=[0.6, 0.4]),
        }
    )
    # Promote "id" to the index so only features and the label remain as columns.
    .set_index("id")
)

In [3]:
# Column roles for the demo frame: one target column plus the numeric and
# categorical feature columns.
numeric_columns = ("age", "income")
categorical_columns = ("city",)

schema = TabularSchema(
    target="label",
    numeric=numeric_columns,
    categorical=categorical_columns,
)

In [4]:
# Wrap the raw frame together with its column-role schema.
# NOTE(review): the keyword names carry a leading underscore, which suggests
# they are the container's own fields rather than a public constructor API —
# confirm this is the intended way to build a TabularData.
tab = TabularData(_base=df, _schema=schema)

# Log the numeric and categorical accessors to show what each one exposes.
logger.info(f"numeric:\n{tab.numeric}")
logger.info(f"categorical:\n{tab.categorical}")

In [5]:
# Numeric features: shift every age by two and raise incomes by 5% (rounded to
# whole units), then hand the updated frame back through the property setter.
# assign() returns a new frame, so the object returned by tab.numeric is not
# modified in place.
# NOTE(review): assuming the setter writes into `tab`, re-running this cell
# applies the bump again on top of the previous run — confirm and restart the
# kernel before re-executing.
num = tab.numeric.assign(
    age=lambda frame: frame["age"] + 2,
    income=lambda frame: (frame["income"] * 1.05).round(0),
)
tab.numeric = num

# Categorical features: upper-case the "Milano" label only.
cat = tab.categorical.assign(
    city=lambda frame: frame["city"].replace({"Milano": "MILANO"})
)
tab.categorical = cat

logger.info(f"features:\n{tab.features}")

In [6]:
# Select rows with a single query expression that combines both conditions.
combined_query = "city == 'MILANO' and age > 50"
tab_view = tab.view_from_query(combined_query)

logger.info(f"features:\n{tab_view.features}")

In [7]:
# Build one view per condition (city first, then age), then combine them.
tab_view1, tab_view2 = (
    tab.view_from_query("city == 'MILANO'"),
    tab.view_from_query("age > 50"),
)

# NOTE(review): intersect() presumably keeps only rows present in both views,
# which would match the single combined query — confirm against the logged
# output.
tab_view = tab_view1.intersect(tab_view2)
logger.info(f"features:\n{tab_view.features}")

In [8]:
# Restore the original spelling of the city label for the rows in the view and
# assign the result back through the view's setter.
# NOTE(review): whether assigning through the view also updates `tab` is not
# visible from this notebook — confirm.
cat = tab_view.categorical.assign(
    city=lambda frame: frame["city"].replace({"MILANO": "Milano"})
)
tab_view.categorical = cat

logger.info(f"features:\n{tab_view.features}")

In [9]:
# Request a 70% / 15% / 15% split with a fixed random_state so the split is
# meant to be repeatable. With only 12 rows the fractions do not divide evenly;
# the actual sizes depend on random_split's rounding and are logged below.
train_idx, val_idx, test_idx = random_split(
    tab.data,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=123,
)

# One view per partition, created in train / val / test order.
train_view, val_view, test_view = (
    tab.view(part) for part in (train_idx, val_idx, test_idx)
)

logger.info(f"train size: {len(train_view)}, val size: {len(val_view)}, test size: {len(test_view)}")

In [10]:
# Convert the training view into a standalone object.
# NOTE(review): presumably materialize() copies the selected rows — confirm.
train_tab = train_view.materialize()
# Sanity check: the materialized result is a TabularData instance.
assert isinstance(train_tab, TabularData)

In [11]:
# Round trip through the repository: write `tab` to disk, then read it back.
# The path is relative, so the file is created in the current working directory.
out_path = Path("example_dataset.parquet")

TabularDataRepository.save(
    tab,
    out_path,
    include_schema=True,
    index=True,
)
tab_loaded = TabularDataRepository.load(out_path)

# Log what came back from disk: the schema as a dict and the first rows.
logger.info(f"schema: {tab_loaded.schema.to_dict()}")
logger.info(f"data:\n{tab_loaded.data.head()}")