In [1]:
from pymongo import MongoClient
import numpy as np

def get_dataset(collection_name, db_name, feature_mode: str = "rssi"):
    """
    Load a collection from MongoDB as a float32 matrix of features + labels.

    Feature columns are chosen by ``feature_mode`` (case-insensitive):
      - "rssi"   -> freind1/2/3_rssi (3 features)
      - "ratios" -> the 6 freindA_rssi_over_freindB_rssi fields
      - "both"   -> 3 RSSI + 6 ratios (9 features)
    The last two columns are always the labels location_x, location_y.

    Null/missing RSSI values are replaced by -100 and null/missing ratios by 0
    in the ``$project`` stage; documents whose labels or selected features are
    still not numeric are dropped by the ``$match`` stage.

    Args:
        collection_name: name of the MongoDB collection to read.
        db_name: name of the MongoDB database holding the collection.
        feature_mode: "rssi", "ratios" or "both".

    Returns:
        A ``(first_doc, data)`` tuple. ``first_doc`` is the first document
        yielded by the pipeline (kept even if that document is later skipped
        as malformed); ``data`` is a ``np.float32`` array with one row per
        document, laid out as ``[features..., location_x, location_y]``.

    Raises:
        AssertionError: if ``feature_mode`` is not one of the accepted values.
        ValueError: if no document could be converted to a row.
    """
    feature_mode = feature_mode.lower()
    assert feature_mode in {"rssi", "ratios", "both"}, "feature_mode must be 'rssi', 'ratios', or 'both'"

    label_fields = ['location_x', 'location_y']
    rssi_fields = ['freind1_rssi', 'freind2_rssi', 'freind3_rssi']
    ratio_fields = [
        'freind1_rssi_over_freind2_rssi',
        'freind1_rssi_over_freind3_rssi',
        'freind2_rssi_over_freind1_rssi',
        'freind2_rssi_over_freind3_rssi',
        'freind3_rssi_over_freind1_rssi',
        'freind3_rssi_over_freind2_rssi',
    ]

    # (field name, value substituted when the field is null/missing), in the
    # exact order the feature columns appear in the output matrix.
    selected = []
    if feature_mode in {"rssi", "both"}:
        selected.extend((name, -100) for name in rssi_fields)
    if feature_mode in {"ratios", "both"}:
        selected.extend((name, 0) for name in ratio_fields)
    feature_fields = [name for name, _ in selected]

    # Labels are passed through untouched; features get their fill value.
    projection = {name: 1 for name in label_fields}
    for name, fill_value in selected:
        projection[name] = {'$ifNull': ['$' + name, fill_value]}

    # Require labels and whichever features we selected to be numeric.
    match_stage = {name: {'$type': 'number'} for name in label_fields + feature_fields}

    pipeline = [
        {'$project': projection},
        {'$match': match_stage},
    ]

    # Features first, labels last -- split_combined_data relies on this layout.
    columns = feature_fields + label_fields

    client = MongoClient('mongodb://localhost:28910/',
                         connectTimeoutMS=30000,
                         socketTimeoutMS=30000,
                         maxPoolSize=20)
    try:
        collection = client[db_name][collection_name]
        cursor = collection.aggregate(pipeline, allowDiskUse=True, batchSize=50000)

        first_doc = None
        data = []
        for doc in cursor:
            if first_doc is None:
                first_doc = doc
            try:
                data.append(tuple(float(doc[name]) for name in columns))
            except (KeyError, TypeError, ValueError, OverflowError):
                # Skip documents with a missing or non-convertible value;
                # anything else is a real error and should surface.
                continue
    finally:
        # Always release the connection pool, including on the error paths.
        client.close()

    if not data:
        raise ValueError(f"No valid data found in collection {collection_name}")

    return first_doc, np.array(data, dtype=np.float32)

def validate_dataset(collection_name, db_name, feature_mode="rssi", sample_size=5):
    """
    Debug helper: show original docs, processed rows, and final numpy array.

    Prints up to ``sample_size`` raw documents from the collection, then loads
    the collection through ``get_dataset`` and prints the first
    ``sample_size`` processed rows and the array shape.

    Args:
        collection_name: name of the MongoDB collection to inspect.
        db_name: name of the MongoDB database holding the collection.
        feature_mode: forwarded to ``get_dataset``.
        sample_size: how many raw documents / processed rows to print.

    Returns:
        The processed NumPy array produced by ``get_dataset``.
    """
    client = MongoClient('mongodb://localhost:28910/',
                         connectTimeoutMS=30000,
                         socketTimeoutMS=30000,
                         maxPoolSize=20)
    try:
        collection = client[db_name][collection_name]

        # Show a few original documents directly from MongoDB
        print("=== Raw MongoDB docs (before projection) ===")
        for doc in collection.find().limit(sample_size):
            print(doc)
    finally:
        # This client is only needed for the raw preview; get_dataset opens its own.
        client.close()

    # Use get_dataset to process into numpy
    print("\n=== Processed dataset rows ===")
    # get_dataset returns (first_doc, array); only the array is needed here.
    _first_doc, np_data = get_dataset(collection_name, db_name, feature_mode=feature_mode)
    for row in np_data[:sample_size]:
        print(row)

    print("\n=== NumPy array shape ===")
    print(np_data.shape)

    return np_data


def split_combined_data(combined_array, feature_mode: str = "rssi"):
    """
    Split a combined 2-D array into its feature columns and label columns.

    The leading columns are treated as features (3 for "rssi", 6 for
    "ratios", 9 for "both") and every remaining column as labels.

    Args:
        combined_array: 2-D array laid out as ``[features..., labels...]``.
        feature_mode: "rssi", "ratios" or "both" (case-insensitive).

    Returns:
        ``(features, labels)`` as column slices of ``combined_array``.
    """
    mode = feature_mode.lower()
    assert mode in {"rssi", "ratios", "both"}

    rssi_width, ratio_width = 3, 6
    # Anything that is neither "rssi" nor "ratios" is handled as "both".
    split_at = (
        rssi_width if mode == "rssi"
        else ratio_width if mode == "ratios"
        else rssi_width + ratio_width
    )

    return combined_array[:, :split_at], combined_array[:, split_at:]

def combine_arrays(arrays):
    """Stack the given arrays on top of each other, row-wise, into one array."""
    stacked = np.vstack(arrays)
    return stacked

def shuffle_array(arr, random_state=None):
    """
    Return a copy of ``arr`` shuffled along its first axis.

    The input array is left untouched. A private ``RandomState`` is used so
    the process-wide NumPy RNG is neither reseeded nor advanced; for a given
    seed the resulting order is the same as seeding the global RNG with that
    value and calling ``np.random.shuffle``.

    Args:
        arr: array to shuffle.
        random_state: seed for the private generator; ``None`` draws fresh
            entropy, giving a different order on each call.

    Returns:
        A shuffled copy of ``arr``.
    """
    rng = np.random.RandomState(random_state)
    shuffled_arr = arr.copy()
    rng.shuffle(shuffled_arr)
    return shuffled_arr

#get_dataset("equilatero_grande_garage", "wifi_fingerprinting_data", "both")

# Fixed seed so the shuffled order (and the sample printed below) is the same
# on every Restart & Run All.
SHUFFLE_SEED = 42

first_entry, val_datasets = get_dataset(
    "equilatero_grande_garage",
    "wifi_fingerprinting_data_exponential",
    feature_mode="both",
)
# combine_arrays expects a sequence of arrays; wrap the single dataset so more
# collections can be appended to the list later.
combined_val = combine_arrays([val_datasets])
shuffled_val = shuffle_array(combined_val, random_state=SHUFFLE_SEED)
features, labels = split_combined_data(shuffled_val, feature_mode="both")

# first_entry is the first document from the database, while features[0] and
# labels[0] come from the shuffled array, so they usually describe different rows.
print("First entry")
print(first_entry)
print("For first data entry we can see features and labels")
print(features[0])
print(labels[0])

First entry
{'_id': ObjectId('68add02b3edd4c4f14df6555'), 'location_x': 0.5, 'location_y': -0.03333333333333327, 'freind1_rssi': -60, 'freind2_rssi': -77, 'freind3_rssi': -75, 'freind1_rssi_over_freind2_rssi': 0.7792207792207793, 'freind1_rssi_over_freind3_rssi': 0.8, 'freind2_rssi_over_freind1_rssi': 1.2833333333333334, 'freind2_rssi_over_freind3_rssi': 1.0266666666666666, 'freind3_rssi_over_freind1_rssi': 1.25, 'freind3_rssi_over_freind2_rssi': 0.974025974025974}
For first data entry we can see features and labels
[-83.         -80.         -63.           1.0375       1.3174603
   0.96385545   1.2698413    0.7590361    0.7875    ]
[0.4       0.6666667]
