### Setup

In [155]:
import sys
import os

# Add the project root to Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [None]:
import numpy as np
import pandas as pd

from src.idspy.core.state import State
from src.idspy.data.schema import Schema, ColumnRole
from src.idspy.data.tab_accessor import TabAccessor

np.random.seed(42)
TabAccessor
n = 1000      #numero di righe del dataset simulato

# Helper lists --- servono come valori casuali per le colonne categoriche
races = ["Human", "Elf", "Dwarf", "Hobbit", "Wizard", "Orc", "Uruk-hai"]
weapons = ["Sword", "Bow", "Axe", "Dagger", "Staff", "Spear", "None"]
regions = ["Gondor", "Rohan", "Mordor", "Shire", "Rivendell", "Moria", "Lothlorien", "Isengard"]
mounts = ["Horse", "Pony", "Warg", "Great Eagle", "Oliphaunt", "None"]

# Non-binary label with imbalanced class proportions
# Target distribution: Warrior ~50%, Support ~30%, Scout ~13%, Leader ~7%
labels = (["Warrior"] * 270) + (["Support"] * 270) + (["Scout"] * 230) + (["Leader"] * 230)
np.random.shuffle(labels)   # mescola i ruoli memorizzati in etichette
labels = labels[:n]         # assicura ceh la lista abbia esattamente 30 elementi

# Create base data di pandas simulando dati numerici e categoriali, con alcune colonne categoriali e altre numeriche
data = {
    "entity_id": [f"ENT-{i:03d}" for i in range(1, n + 1)],  # identifier
    "character_race": np.random.choice(races, size=n),
    "weapon_type": np.random.choice(weapons, size=n),
    "home_region": np.random.choice(regions, size=n),
    "age_years": np.random.randint(12, 4000, size=n),
    "height_cm": np.random.normal(170, 20, size=n).round(1),
    "gold_coins": np.random.lognormal(mean=5, sigma=1.0, size=n).round(2),
    "quests_completed": np.random.poisson(lam=4, size=n),
    "courage_score": np.clip(np.random.beta(2, 1, size=n), 0, 1),
    "ring_influence": np.random.exponential(scale=50, size=n),
    "distance_traveled_km": np.random.randint(0, 15000, size=n),
    "magic_affinity": np.random.randint(0, 101, size=n),
    "threat_perception_db": np.random.uniform(30, 120, size=n).round(2),
    "year_third_age": np.random.randint(2900, 3020, size=n),
    "alliance_loyalty": np.random.choice(["Free Peoples", "Neutral", "Sauron"], size=n, p=[0.6, 0.25, 0.15]),
    "mount_type": np.random.choice(mounts, size=n),
    "role_label": labels
}

df = pd.DataFrame(data)

############CAMBIAMENTO: inserimento più aggressivo
nan_frac = 0.60  #10% di NaN
num_nans = max(1, int(n * nan_frac))

# Inject missing values (NaNs) into numeric and categorical columns
nan_numerical_cols = ["height_cm", "gold_coins", "courage_score", "ring_influence"]
nan_categorical_cols = ["weapon_type", "mount_type", "home_region"]

# Inseriamo valori mancanti (NaN) casuali nelle colonne numeriche e categoriali. per testare le funzioni di pulizia dati
for col in nan_numerical_cols:
    idx = np.random.choice(n, size=num_nans, replace=False)
    df.loc[idx, col] = np.nan

for col in nan_categorical_cols:
    idx = np.random.choice(n, size=num_nans, replace=False)
    df.loc[idx, col] = None  # missing categorical

# Inject infinite values in a couple of numeric columns
# inseriamo valori infiniti per testare funzioni di normalizzazione e pulizia che devono gestire Inf
inf_idx_1, inf_idx_2 = np.random.choice(n, size=2, replace=False)
df.loc[inf_idx_1, "ring_influence"] = np.inf
df.loc[inf_idx_2, "gold_coins"] = -np.inf

df
#DALLE MODIFICHE EFFETTUATE, NELLE COLONNE NUMERICHE E CATEGORICHE SONO AUMENTATI ANCHE SE DI POCO, 
# COME GIUSTO CHE SIA VISTO CHE è SOLO 0.1, I VALORI DI NaN E INF.

Unnamed: 0,entity_id,character_race,weapon_type,home_region,age_years,height_cm,gold_coins,quests_completed,courage_score,ring_influence,distance_traveled_km,magic_affinity,threat_perception_db,year_third_age,alliance_loyalty,mount_type,role_label
0,ENT-001,Hobbit,,,947,,97.48,2,0.594090,15.465850,1085,61,30.87,2957,Neutral,,Support
1,ENT-002,Elf,,,1119,,522.01,3,,110.506772,10649,83,36.60,2914,Free Peoples,,Scout
2,ENT-003,Wizard,,Isengard,1148,,,2,,193.700180,12587,83,50.10,2915,Free Peoples,Great Eagle,Scout
3,ENT-004,Elf,Sword,,3820,,534.68,1,0.745137,,11458,4,33.00,2996,Free Peoples,,Scout
4,ENT-005,Hobbit,Dagger,,343,,,4,,,8645,9,75.26,2946,Free Peoples,Great Eagle,Support
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ENT-996,Human,Bow,Rohan,3174,,,6,0.888222,,14617,11,31.44,2902,Free Peoples,Pony,Warrior
996,ENT-997,Hobbit,,Rivendell,1117,,,4,,,8450,7,106.77,2988,Neutral,,Support
997,ENT-998,Orc,,,2383,,,3,0.725606,,867,73,76.89,2991,Free Peoples,Great Eagle,Leader
998,ENT-999,Human,Bow,Lothlorien,3020,,,4,,,10798,15,85.92,2975,Free Peoples,,Support


In [157]:
# Lo schema serve a definire il ruolo di ogni colonna:
# TARGET → colonna target (label)
# NUMERICAL → variabili continue
# CATEGORICAL → variabili discrete
# Questo aiuta le pipeline a sapere quali trasformazioni applicare.
schema = Schema()
schema.add(["role_label"], ColumnRole.TARGET)
schema.add(["age_years", "height_cm", "gold_coins", "quests_completed", "courage_score", "ring_influence",
            "distance_traveled_km", "magic_affinity", "threat_perception_db", "year_third_age"], ColumnRole.NUMERICAL)
schema.add(["character_race", "weapon_type", "home_region", "alliance_loyalty", "mount_type"],
           ColumnRole.CATEGORICAL)

schema

Schema(roles={<ColumnRole.NUMERICAL: 'numerical'>: ['age_years', 'height_cm', 'gold_coins', 'quests_completed', 'courage_score', 'ring_influence', 'distance_traveled_km', 'magic_affinity', 'threat_perception_db', 'year_third_age'], <ColumnRole.CATEGORICAL: 'categorical'>: ['character_race', 'weapon_type', 'home_region', 'alliance_loyalty', 'mount_type'], <ColumnRole.TARGET: 'target'>: ['role_label'], <ColumnRole.FEATURES: 'features'>: ['age_years', 'height_cm', 'gold_coins', 'quests_completed', 'courage_score', 'ring_influence', 'distance_traveled_km', 'magic_affinity', 'threat_perception_db', 'year_third_age', 'character_race', 'weapon_type', 'home_region', 'alliance_loyalty', 'mount_type']}, strict=False)

In [158]:
state = State()             #è un contenitore globale dei dati e del flusso di lavoro
df.tab.set_schema(schema)   #associa lo schema al dataframe.
state["data.root"] = df     #memorizza il dataframe iniziale.
#Serve perché tutte le funzioni di pre-processing interagiscono con lo State, non direttamente con df

### Drop rows with Null values

In [159]:
# aggiunta la seguente riga per evitare errore in DropNulls causa versione precedente di python; ho 3.11 la richiesta è 3.12
from typing_extensions import override

from src.idspy.steps.transforms.adjust import DropNulls

print(
    f"Total null values before cleaning: {((df == np.inf) | (df == -np.inf) | df.isnull()).any(axis=1).sum()} | Rows: {df.shape[0]}")

step = DropNulls()
step(state)

df = state["data.root"]
print(
    f"Total null values after cleaning: {((df == np.inf) | (df == -np.inf) | df.isnull()).any(axis=1).sum()} | Rows: {df.shape[0]}")


# DropNulls rimuove tutte le righe con valori nulli (NaN) o infiniti.
# Si applica sullo State, modificando state["data.root"].
# È fondamentale per pulire il dataset prima di standardizzazione o encoding.


#CAMBIANO I VALORI DI OUTPUT: DA
# Total null values before cleaning: 8 | Rows: 30
# Total null values after cleaning: 0 | Rows: 22 

#A
#Total null values before cleaning: 16 | Rows: 30
#Total null values after cleaning: 0 | Rows: 14

Total null values before cleaning: 998 | Rows: 1000
Total null values after cleaning: 0 | Rows: 2


### Train and Test split

In [160]:
# from src.idspy.steps.transforms.split import RandomSplit

# step = RandomSplit(train_size=0.8, test_size=0.2, val_size=0)
# step(state)
# df = state["data.root"]

# print(f"Train size: {df.tab.train.shape}")
# print(f"Test size: {df.tab.test.shape}")

from src.idspy.steps.transforms.split import RandomSplit

step = RandomSplit(train_size=0.6, test_size=0.4, val_size=0)
step(state)      # Apply the split transformation
df = state["data.root"]

print(f"Train size: {df.tab.train.shape}")
print(f"Test size: {df.tab.test.shape}")

#MANTENENDO INVARIATI I VALORI DI TRAIN E SIZE I VALORI DI OUTPUT CAMBIANO: DA
#Train size: (17, 17)
#Test size: (5, 17)

#A
#Train size: (11, 17)
#Test size: (3, 17)


Train size: (1, 17)
Test size: (1, 17)


### Standardize numerical values

In [None]:
from src.idspy.steps.transforms.scale import StandardScale

### Standardize numerical values

mean_before = df.tab.numerical.values.mean()
std_before = df.tab.numerical.values.std()
print(f"Before standardization -> Mean: {mean_before:.4f}, Std: {std_before:.4f}")

# Apply scaling step
step = StandardScale()
step.fit(state)         #calcola la media e std sui dati di train
step(state)             #applica la standardizzazione anche sul test set
#Serve perché tutte le funzioni di pre-processing interagiscono con lo State, non direttamente
#è necessario applicare la standardizzazione anche sul test set per evitare data leakage
# After standardization
df = state["data.root"]
mean_after = df.tab.numerical.values.mean()
std_after = df.tab.numerical.values.std()
print(f"After standardization -> Mean: {mean_after:.4f}, Std: {std_after:.4f}")

Before standardization -> Mean: 1161.5588, Std: 2036.3322
After standardization -> Mean: -908750163.9703, Std: 1995277979.2507


### Encoding Categorical Values

In [162]:
from src.idspy.steps.transforms.map import FrequencyMap

prev = df
for col in df.tab.categorical.columns:
    print(f"Unique values of '{col}':\n{df[col].unique()}")

step = FrequencyMap(max_levels=4)
step.fit(state)
step(state)
df = state["data.root"]
# Trasforma le colonne categoriali in numeri basandosi sulla frequenza.
# Le categorie meno frequenti vengono raggruppate in “Other”.

print(f"Categorical mapping:\n{state['mapping.categorical']}")

for col in df.tab.categorical.columns:
    print(f"Unique values of '{col}':\n{df[col].unique()}")

Unique values of 'character_race':
['Elf' 'Hobbit']
Unique values of 'weapon_type':
['Bow' 'Dagger']
Unique values of 'home_region':
['Gondor' 'Shire']
Unique values of 'alliance_loyalty':
['Neutral']
Unique values of 'mount_type':
['Oliphaunt' 'Great Eagle']
Categorical mapping:
{'character_race': CategoricalDtype(categories=['Hobbit'], ordered=True, categories_dtype=object), 'weapon_type': CategoricalDtype(categories=['Dagger'], ordered=True, categories_dtype=object), 'home_region': CategoricalDtype(categories=['Shire'], ordered=True, categories_dtype=object), 'alliance_loyalty': CategoricalDtype(categories=['Neutral'], ordered=True, categories_dtype=object), 'mount_type': CategoricalDtype(categories=['Great Eagle'], ordered=True, categories_dtype=object)}
Unique values of 'character_race':
[0 1]
Unique values of 'weapon_type':
[0 1]
Unique values of 'home_region':
[0 1]
Unique values of 'alliance_loyalty':
[1]
Unique values of 'mount_type':
[0 1]


In [None]:
#questo ciclo confronta i valori delle colonne categoriali prima e dopo la trasformazione: ove df è un valore numerico che mi indica la frequenza della categoria, prev 
for col in df.tab.categorical.columns:
    print(f"\n--- Column: {col} ---")
    comparison = pd.DataFrame({
        "df": df.tab.categorical[col],
        "prev": prev.tab.categorical[col]
    })
    print(comparison)


--- Column: character_race ---
     df    prev
338   0     Elf
460   1  Hobbit

--- Column: weapon_type ---
     df    prev
338   0     Bow
460   1  Dagger

--- Column: home_region ---
     df    prev
338   0  Gondor
460   1   Shire

--- Column: alliance_loyalty ---
     df     prev
338   1  Neutral
460   1  Neutral

--- Column: mount_type ---
     df         prev
338   0    Oliphaunt
460   1  Great Eagle


### Encoding Target Values

In [164]:
from src.idspy.steps.transforms.map import LabelMap

target = df.tab.target.columns[0]

step = LabelMap()
step.fit(state)
step(state)
df = state["data.root"]
#Trasforma il target (role_label) in numeri.
#state['mapping.target'] memorizza la corrispondenza tra etichette originali e numeri.

print(f"Target mapping:\n{state['mapping.target'].categories}")

print(f"Unique values of 'original_{target}':\n{df['original_' + target].unique()}")
print(f"Unique values of '{target}':\n{df[target].unique()}")

Target mapping:
Index(['Warrior'], dtype='object')
Unique values of 'original_role_label':
['Leader' 'Warrior']
Unique values of 'role_label':
[-1  1]
