In [1]:
import logging

import numpy as np
import pandas as pd

from src.idspy.core.pipeline import FittedPipeline, Pipeline
from src.idspy.core.state import State
from src.idspy.data.tabular_data import TabularData, TabularSchema
from src.idspy.services.logger import setup_logging
from src.idspy.steps.preprocessing.adjust import DropNulls
from src.idspy.steps.preprocessing.map import TargetMap, FrequencyMap
from src.idspy.steps.preprocessing.scale import StandardScale
from src.idspy.steps.preprocessing.split import RandomSplit

# Logging is configured through the project helper before the module-level
# logger is fetched.
setup_logging()
logger = logging.getLogger(__name__)

# Seeded NumPy generator.
# NOTE(review): `rng` is not referenced by the cells visible here — confirm it
# is still needed or wire it into the stochastic steps.
SEED = 42
rng = np.random.default_rng(seed=SEED)

In [2]:
# Toy dataset: 20 rows with four numeric features, three categorical features
# and a three-class string target. A NaN and a +inf are planted in "age" and a
# -inf in "income".
data = dict(
    age=[
        25, 37, 45, 29, 52, 33, 41, np.nan, 60, 22,
        38, 47, 31, 26, 55, 43, 39, np.inf, 68, 34,
    ],
    income=[
        32000, 58000, 72000, 40000, 90000, 51000, 67000, 45000, 110000, 28000,
        60000, 80000, 80000, 35000, 95000, 70000, 62000, 76000, -np.inf, 54000,
    ],
    score=[
        0.81, 0.55, 0.72, 0.49, 0.91, 0.63, 0.70, 0.58, 0.96, 0.40,
        0.40, 0.77, 0.60, 0.52, 0.88, 0.69, 0.66, 0.74, 0.45, 0.61,
    ],
    city=[
        "Roma", "Milano", "Napoli", "Roma", "Torino",
        "Bologna", "Roma", "Milano", "Roma", "Firenze",
        "Roma", "Napoli", "Cagliari", "Bari", "Roma",
        "Genova", "Roma", "Roma", "Trieste", "Roma",
    ],
    device=[
        "mobile", "desktop", "mobile", "tablet", "desktop",
        "mobile", "mobile", "desktop", "tablet", "mobile",
        "desktop", "tablet", "mobile", "mobile", "desktop",
        "mobile", "tablet", "desktop", "mobile", "smart_tv",
    ],
    target=[
        "suspicious", "suspicious", "fraud", "benign", "fraud",
        "fraud", "benign", "suspicious", "benign", "benign",
        "fraud", "suspicious", "suspicious", "benign", "suspicious",
        "fraud", "benign", "fraud", "suspicious", "benign",
    ],
    amount=[
        120.5, 45.0, 300.0, 89.9, 450.0, 200.0, 150.0, 99.0, 800.0, 20.0,
        60.0, 250.0, 130.0, 75.0, 500.0, 220.0, 140.0, 310.0, 55.0, 180.0,
    ],
    is_new_user=[
        "yes", "no", "no", "yes", "no", "no", "yes", "yes", "no", "no",
        "no", "yes", "yes", "no", "no", "yes", "no", "no", "yes", "no",
    ],
)

# Build the frame and shuffle all rows in one chain: random_state pins the
# permutation and reset_index(drop=True) replaces the shuffled labels with a
# fresh 0..n-1 index.
df = (
    pd.DataFrame(data)
    .sample(frac=1.0, random_state=123)
    .reset_index(drop=True)
)
# Log the first rows of the shuffled frame as a quick sanity check.
preview = df.head()
logger.info(preview)

In [3]:
# Column roles for the toy frame, spelled out once and handed to the schema.
numeric_columns = ("age", "income", "score", "amount")
categorical_columns = ("city", "device", "is_new_user")

schema = TabularSchema(
    target="target",
    numeric=numeric_columns,
    categorical=categorical_columns,
)

# Pair the shuffled frame with its schema.
tab = TabularData(df, schema)

In [4]:
# Initial pipeline state: the tabular dataset is stored under the "data" key.
initial_entries = {"data": tab}
s = State(initial_entries)

In [5]:
# Keep a handle on the frequency mapper so its `cat_types` can be inspected
# after the pipeline has run.
cat_mapper = FrequencyMap(max_levels=3)

# Steps wrapped by the fitted sub-pipeline, in execution order.
fitted_steps = [StandardScale(), cat_mapper, TargetMap()]
fit_pipeline = FittedPipeline(steps=fitted_steps)

# Bare expression: a throwaway TargetMap whose repr becomes the cell output.
TargetMap()

TargetMap(name='target_map', requires={'train', 'data'}, provides={'data'})

In [6]:
# Outer pipeline: null handling, then the split, then the fitted
# sub-pipeline as the final step.
# NOTE(review): RandomSplit() is built with no arguments and the seeded `rng`
# from the setup cell is not passed to it — confirm the split is reproducible.
pipeline_steps = [DropNulls(), RandomSplit(), fit_pipeline]
pipeline = Pipeline(steps=pipeline_steps)

In [7]:
# Execute the pipeline against the state. The return value is not captured;
# the following cells read their results back from `s`.
# NOTE(review): this relies on run() updating `s` in place — confirm.
pipeline.run(s)

In [8]:
# Show the category mapping held by the frequency mapper after the run.
# The value is passed as a lazy %-style argument so logging only formats it
# when an INFO record is actually emitted; the resulting text is unchanged.
logger.info("Category mapping:\n%s", cat_mapper.cat_types)

# Per-column level counts for every categorical feature in the state's frame.
# The frame lookup does not depend on the loop variable, so it is done once.
frame = s["data"].df
for c in schema.categorical:
    logger.info(frame[c].value_counts())

In [9]:
# Log the original target next to its "<target>_encoded" counterpart.
# NOTE(review): presumably the encoded column is added by TargetMap — confirm.
target_col = schema.target
encoded_col = target_col + "_encoded"
logger.info(s["data"].df[[target_col, encoded_col]])