In [13]:
# data_processing.py
from typing import List, Sequence, Union
from pymongo import MongoClient
import numpy as np
from feature_lists import DATASET_TO_FEATURE

def get_feature_list(dataset: Union[str, Sequence[str]]) -> List[str]:
    """
    Resolve a feature selection into a concrete, ordered list of feature names.

    Args:
        dataset: Either a preset name (a key of ``DATASET_TO_FEATURE``) or an
            explicit ordered sequence of feature keys.

    Returns:
        A new list of feature names in the order they should be used.

    Raises:
        ValueError: If ``dataset`` is a string that is not a known preset.
    """
    # Strings are sequences too, so the str case must be singled out before
    # treating the argument as an explicit list of fields.
    if not isinstance(dataset, str):
        return list(dataset)

    if dataset in DATASET_TO_FEATURE:
        # Return a copy so callers cannot mutate the shared preset table.
        return list(DATASET_TO_FEATURE[dataset])

    raise ValueError(
        f"Unknown feature selection '{dataset}'. "
        f"Use one of {list(DATASET_TO_FEATURE.keys())} or pass a list of fields."
    )


def _default_value_for(feature_name: str) -> float:
    """
    Choose a safe numeric default for missing values.
    - RSSI-like quantities default to -100 dBm-ish.
    - Shares/ratios/power ratios and scalars default to 0.0.
    """
    name = feature_name.lower()
    if "_rssi" in name or "_rssi_1m" in name or "residual" in name:
        return -100.0
    return 0.0


def get_dataset(
    collection_name: str,
    db_name: str,
    features: Union[str, Sequence[str]],
    mongo_uri: str = "mongodb://localhost:28910/",
):
    """
    Load data from MongoDB and build a NumPy array where each row is:
        [ <features...>, location_x, location_y ]

    Args:
        collection_name: Name of the collection to read.
        db_name: Name of the database containing the collection.
        features: Preset name (resolved through ``get_feature_list``) or an
            explicit list of fields (order = model input order).
        mongo_uri: Connection string of the MongoDB server.

    Returns:
        A tuple ``(first_doc, data)`` where ``first_doc`` is the first document
        yielded by the aggregation (even if that document was later skipped as
        malformed) and ``data`` is a ``float32`` array with one row per valid
        document.

    Raises:
        ValueError: If no document could be converted into a numeric row, or
            if ``features`` is an unknown preset name.
    """
    # A preset name must be expanded first; iterating a str directly would
    # treat every character as a separate feature.
    if isinstance(features, str):
        feature_names = get_feature_list(features)
    else:
        feature_names = list(features)

    # Build projection: missing/null features are replaced server-side via
    # $ifNull so every projected feature field is present in the result.
    projection = {
        "location_x": 1,
        "location_y": 1,
    }
    for name in feature_names:
        projection[name] = {"$ifNull": [f"${name}", _default_value_for(name)]}

    # Keep only rows whose labels are numeric.
    pipeline = [
        {"$project": projection},
        {
            "$match": {
                "location_x": {"$type": "number"},
                "location_y": {"$type": "number"},
            }
        },
    ]

    rows = []
    first_doc = None

    # The context manager closes the client (and its connection pool) even
    # when the aggregation or the row conversion raises.
    with MongoClient(
        mongo_uri,
        connectTimeoutMS=30000,
        socketTimeoutMS=30000,
        maxPoolSize=20,
    ) as client:
        collection = client[db_name][collection_name]
        cursor = collection.aggregate(pipeline, allowDiskUse=True, batchSize=50000)

        for doc in cursor:
            if first_doc is None:
                first_doc = doc
            try:
                x = [float(doc[name]) for name in feature_names]
                y = [float(doc["location_x"]), float(doc["location_y"])]
            except (KeyError, TypeError, ValueError, OverflowError):
                # Skip malformed rows (missing key or non-numeric value).
                continue
            rows.append(tuple(x + y))

    if not rows:
        raise ValueError(f"No valid data found in collection '{collection_name}' of DB '{db_name}'.")

    return first_doc, np.array(rows, dtype=np.float32)


def split_combined_data(
    combined_array: np.ndarray,
    features: Union[str, Sequence[str]],
):
    """
    Separate a stacked array into model inputs and labels.

    The first ``len(features)`` columns are returned as the inputs; every
    remaining column (location_x, location_y) is returned as the labels.
    """
    split_at = len(features)
    inputs = combined_array[:, :split_at]
    targets = combined_array[:, split_at:]
    return inputs, targets


def combine_arrays(arrays: List[np.ndarray]) -> np.ndarray:
    """Stack the given arrays on top of each other (row-wise) into one array."""
    stacked = np.vstack(arrays)
    return stacked


def shuffle_array(arr: np.ndarray, random_state: int = None) -> np.ndarray:
    """
    Return a copy of ``arr`` with its rows (first axis) in random order.

    ``random_state`` seeds the generator; the same seed gives the same order.
    The input array is not modified.
    """
    generator = np.random.default_rng(random_state)
    # permutation(n) shuffles arange(n), yielding a random row order.
    order = generator.permutation(arr.shape[0])
    return arr[order]


In [14]:
# Notebook cell: load one collection and inspect its first sample.
# The database name doubles as the preset key passed to get_feature_list.
db_name = "wifi_fingerprinting_data_extra_features"

# Ordered feature names for this preset (order = column order of the array).
feature_list = get_feature_list(db_name)

# Collection names available for loading.
# NOTE(review): this list is not used anywhere in this cell; only the first
# entry is loaded below — presumably later cells iterate over it.
all_collections = [
    "equilatero_grande_garage",
    "equilatero_grande_outdoor",
    "equilatero_medio_garage",
    "equilatero_medio_outdoor",
    "isosceles_grande_indoor",
    "isosceles_grande_outdoor",
    "isosceles_medio_outdoor",
    "obtusangulo_grande_outdoor",
    "obtusangulo_pequeno_outdoor",
    "reto_grande_garage",
    "reto_grande_indoor",
    "reto_grande_outdoor",
    "reto_medio_garage",
    "reto_medio_outdoor",
    "reto_n_quadrado_grande_indoor",
    "reto_n_quadrado_grande_outdoor",
    "reto_n_quadrado_pequeno_outdoor",
    "reto_pequeno_garage",
    "reto_pequeno_outdoor",
]

print(f"🧰 Database in use: {db_name}")
# Show the exact feature order used for the model inputs.
print("Features:", feature_list)

# ---- Training data
# get_dataset returns (first raw document, stacked array); despite its plural
# name, train_datasets here is the single array for one collection.
first_entry, train_datasets = get_dataset("equilatero_grande_garage", db_name, feature_list)
# Combining/shuffling is disabled while only one collection is loaded:
#combined_train = combine_arrays(train_datasets)
#shuffled_train = shuffle_array(combined_train)
# Split columns into model inputs (features) and [location_x, location_y] labels.
features, labels = split_combined_data(train_datasets, feature_list)


# Sanity check: print the raw first document next to the first parsed row.
print("First entry")
print(first_entry)
print("For first data entry we can see features and labels")
print(features[0])
print(labels[0])

🧰 Database in use: wifi_fingerprinting_data_extra_features
Features: ['freind1_rssi_rssi_1m', 'freind2_rssi_rssi_1m', 'freind3_rssi_rssi_1m', 'freind1_rssi_residual', 'freind2_rssi_residual', 'freind3_rssi_residual', 'freind1_rssi', 'freind2_rssi', 'freind3_rssi', 'freind1_rssi_power_over_freind2_rssi', 'freind1_rssi_power_over_freind3_rssi', 'freind2_rssi_power_over_freind1_rssi', 'freind2_rssi_power_over_freind3_rssi', 'freind3_rssi_power_over_freind1_rssi', 'freind3_rssi_power_over_freind2_rssi', 'freind1_rssi_share', 'freind2_rssi_share', 'freind3_rssi_share', 'beta1_log10d', 'n_est', 'freind1_rssi_over_freind2_rssi', 'freind1_rssi_over_freind3_rssi', 'freind2_rssi_over_freind1_rssi', 'freind2_rssi_over_freind3_rssi', 'freind3_rssi_over_freind1_rssi', 'freind3_rssi_over_freind2_rssi']
First entry
{'_id': ObjectId('68af04c4c6f1e3ca30a27ead'), 'location_x': 0.5, 'location_y': -0.03333333333333327, 'freind1_rssi_rssi_1m': -59.95480661269403, 'freind2_rssi_rssi_1m': -78.26277706703974,