# Normalization

In the experiment previously made we used 2 axis systems.

One relative to the APs, which was 4x4
One relative to the picos, which was 10x10

To this effect our first step to relativize our points will be to create 2 functions:
- One for the AP positions, which passes 4x4 to a normal value
- One for the pico positions, which passes 10x10 to a normal value

A normal value in our situation will consist of a value between -1.0 and 1.0
The origin for our axis system will be one of the APs
The maximum and minimum of our normalization depend on the space between samples, therefore we will take our axis that goes from 0 to 1 and subdivide it into 10 segments, given we have 10 pico samples.
The size of one of the segments will be our unit in the final normalized space (both in length and width)
With this in mind we will have 100 areas with 1 segment in length and width
In each triangle configuration we will then move the origin to one of its vertices and map out the datapoints according to where they land in regards to the origin, taking into account the coordinate system made from the segments/areas.
These will be our normalized points

# Collect data

In [16]:
from pymongo import MongoClient
from IPython.display import display, Markdown
from datetime import datetime

# Client for the MongoDB server at localhost:28910; `db` is the database
# handle used by the functions and cells further down.
client = MongoClient("mongodb://localhost:28910/")
db = client["wifi_data_db"]


# Maps the BSSID of each access point of interest to a short label.
# NOTE(review): the labels are spelled "Freind" — presumably this mirrors the
# names as configured on the devices; confirm before changing the spelling.
AP_BSSID = {
    "ec:01:d5:2b:5f:e0": "Freind1",
    "ec:01:d5:27:1d:00": "Freind2",
    "ec:01:d5:28:fa:c0": "Freind3",
}

# Join Collections

In [17]:
from pymongo import MongoClient

def union_collections_preserve_duplicates(collections_to_join, output_collection):
    """Concatenate several collections of the global ``db`` into one collection.

    Every document is copied with its ``_id`` rewritten to
    ``"<source collection>||<original _id>"`` so that documents sharing an
    ``_id`` across the inputs are all kept. The original ``_id`` is stored in
    ``original_id`` and the source collection name in ``source_collection``.

    The first collection is written with ``$out`` (which, per the MongoDB
    documentation, replaces the output collection if it already exists); the
    remaining collections are appended with ``$merge``.

    Args:
        collections_to_join: Sequence of input collection names, processed in
            the given order.
        output_collection: Name of the collection that receives the documents.

    Returns:
        The number of documents in the output collection after the union.

    Raises:
        ValueError: If ``collections_to_join`` is empty or names a collection
            that does not exist in ``db``.
    """
    if not collections_to_join:
        raise ValueError("collections_to_join must contain at least one collection name")

    # Fetch the collection names once instead of once per input collection.
    existing_names = set(db.list_collection_names())

    # Verify input collections exist and get counts
    input_counts = {}
    for coll_name in collections_to_join:
        if coll_name not in existing_names:
            raise ValueError(f"Collection {coll_name} does not exist")
        input_counts[coll_name] = db[coll_name].count_documents({})
        print(f"Collection {coll_name} has {input_counts[coll_name]} documents")

    total = len(collections_to_join)
    for position, coll_name in enumerate(collections_to_join, start=1):
        # Tag each document with its source and make the _id unique per source.
        tag_stage = {"$addFields": {
            "original_id": "$_id",
            "_id": {"$concat": [coll_name, "||", {"$toString": "$_id"}]},
            "source_collection": coll_name,
        }}

        if position == 1:
            # The first collection creates the output collection.
            write_stage = {"$out": output_collection}
        else:
            # Later collections are appended to it.
            write_stage = {"$merge": {
                "into": output_collection,
                "whenMatched": "fail",  # Shouldn't happen with our new _id scheme
                "whenNotMatched": "insert",
            }}

        db[coll_name].aggregate([tag_stage, write_stage])
        print(f"Processed {coll_name} ({position}/{total})")

    # Verify the output
    output_count = db[output_collection].count_documents({})
    expected_total = sum(input_counts.values())
    print(f"Output collection {output_collection} has {output_count} documents")
    print(f"Expected total (sum of inputs): {expected_total}")

    if output_count != expected_total:
        print("Warning: Output count doesn't match sum of input collections")

    return output_count

# Merge the two raw capture collections into "wifi_client_data_global",
# which the transformation loop further down uses as its input collection.
union_collections_preserve_duplicates(
    collections_to_join=["wifi_client_data", "wifi_client_data_1"],
    output_collection="wifi_client_data_global"
)

Collection wifi_client_data has 67340 documents
Collection wifi_client_data_1 has 43360 documents
Processed wifi_client_data (1/2)
Processed wifi_client_data_1 (2/2)
Output collection wifi_client_data_global has 110700 documents
Expected total (sum of inputs): 110700


110700

In [18]:

def normalize_picos_coordinates(x, y, origin_x, origin_y, *, pico_grid_size=10, ap_grid_size=4):
    """Express a pico grid position relative to an origin given on the AP grid.

    Both grids are scaled to the unit interval first: one pico step is
    ``1 / pico_grid_size`` and one AP step is ``1 / ap_grid_size``. The scaled
    origin is then subtracted from the scaled position.

    Args:
        x: Pico x position, in pico grid steps.
        y: Pico y position, in pico grid steps.
        origin_x: Origin x position, in AP grid steps.
        origin_y: Origin y position, in AP grid steps.
        pico_grid_size: Number of pico steps spanning the unit interval
            (10 for the 10x10 pico grid).
        ap_grid_size: Number of AP steps spanning the unit interval
            (4 for the 4x4 AP grid).

    Returns:
        Tuple ``(normalized_x, normalized_y)`` of the position relative to the
        origin, in unit-interval coordinates.
    """
    # Normalized interval sizes
    pico_interval = 1 / pico_grid_size
    ap_interval = 1 / ap_grid_size

    # Scale each coordinate by its own grid step, then shift by the origin.
    return (
        pico_interval * x - ap_interval * origin_x,
        pico_interval * y - ap_interval * origin_y,
    )


def calculate_centroid(point1, point2, point3):
    """Return the centroid ``(cx, cy)`` of the triangle with the given vertices.

    Each point is an indexable ``(x, y)`` pair; the centroid is the mean of
    the three x values and the mean of the three y values.
    """
    vertices = (point1, point2, point3)

    def mean_along(axis):
        # Average one coordinate (0 = x, 1 = y) over the three vertices.
        first, second, third = (vertex[axis] for vertex in vertices)
        return (first + second + third) / 3

    return (mean_along(0), mean_along(1))

In [19]:
from pymongo import MongoClient
from datetime import datetime

def transform_wifi_data(db, origin_x=None, origin_y=None, start_time=None, end_time=None, dry_run=False, output_collection_name="wifi_data_filtered", input_collection_name="wifi_data"):
    """
    Transform wifi scan data into filtered format with normalized coordinates.

    For every (x, y, timestamp) combination in the input, the strongest RSSI
    seen for each of the three known access points is kept. The raw x
    coordinate is read from ``metadata.button_id``; the raw y coordinate is
    derived from the last octet of ``metadata.pico_ip``. Documents whose IP
    ending is not in the lookup table are dropped. The raw position is then
    normalized with ``normalize_picos_coordinates``.

    Unless ``dry_run`` is set, the output collection is emptied and refilled
    with the new documents. If nothing matches, the output collection is left
    untouched.

    Args:
        db: MongoDB database object
        origin_x: Origin x-coordinate for normalization (``None`` means 0)
        origin_y: Origin y-coordinate for normalization (``None`` means 0)
        start_time: datetime object for start of time range (inclusive)
        end_time: datetime object for end of time range (inclusive)
        dry_run: If True, only preview changes without writing to DB
        output_collection_name: Name of the collection to write processed data into
        input_collection_name: Name of the input collection containing raw data

    Returns:
        List of the generated documents (empty if nothing matched). In a dry
        run each document additionally carries ``raw_location_x`` and
        ``raw_location_y``.
    """
    ap_mapping = {
        "ec:01:d5:2b:5f:e0": "AP1_rssi",
        "ec:01:d5:27:1d:00": "AP2_rssi",
        "ec:01:d5:28:fa:c0": "AP3_rssi"
    }

    # Last octet of the pico's IP address -> y position on the pico grid.
    ip_to_y = {
        31: 1, 32: 2, 33: 3, 34: 4, 35: 5,
        36: 6, 37: 7, 38: 8, 39: 9, 30: 10
    }

    # Optional inclusive time window, compared against epoch-second timestamps.
    # NOTE: datetime.timestamp() interprets naive datetimes as local time.
    timestamp_range = {}
    if start_time is not None:
        timestamp_range["$gte"] = start_time.timestamp()
    if end_time is not None:
        timestamp_range["$lte"] = end_time.timestamp()
    match_stage = {"timestamp": timestamp_range} if timestamp_range else {}

    pipeline = [
        # An empty filter matches every document.
        {"$match": match_stage},
        {
            "$addFields": {
                "ip_ending": {
                    "$toInt": {"$arrayElemAt": [{"$split": ["$metadata.pico_ip", "."]}, 3]}
                }
            }
        },
        {
            "$project": {
                "_id": 0,
                "raw_location_x": "$metadata.button_id",
                "raw_location_y": {
                    "$switch": {
                        "branches": [
                            {"case": {"$eq": ["$ip_ending", ip_ending]}, "then": y_position}
                            for ip_ending, y_position in ip_to_y.items()
                        ],
                        "default": None
                    }
                },
                "data": 1,
                "timestamp": 1
            }
        },
        # Drop scans coming from an IP that is not in the lookup table.
        {"$match": {"raw_location_y": {"$ne": None}}},
        {"$unwind": "$data"},
        {"$match": {"data.BSSID": {"$in": list(ap_mapping)}}},
        {
            "$group": {
                "_id": {
                    "raw_location_x": "$raw_location_x",
                    "raw_location_y": "$raw_location_y",
                    "timestamp": "$timestamp"
                },
                # One output field per AP: the strongest RSSI among the
                # entries of that AP ($max skips the None placeholders).
                **{
                    field_name: {
                        "$max": {
                            "$cond": [
                                {"$eq": ["$data.BSSID", bssid]},
                                "$data.RSSI",
                                None
                            ]
                        }
                    }
                    for bssid, field_name in ap_mapping.items()
                }
            }
        }
    ]

    # A missing origin means "do not shift".
    shift_x = origin_x if origin_x is not None else 0
    shift_y = origin_y if origin_y is not None else 0

    normalized_results = []
    for doc in db[input_collection_name].aggregate(pipeline):
        group_key = doc["_id"]
        raw_x = group_key["raw_location_x"]
        raw_y = group_key["raw_location_y"]

        # NOTE(review): assumes metadata.button_id is always present and
        # numeric; a missing value would raise here — confirm against the data.
        norm_x, norm_y = normalize_picos_coordinates(raw_x, raw_y, shift_x, shift_y)

        # The raw grid position is only kept in the dry-run preview.
        new_doc = {"raw_location_x": raw_x, "raw_location_y": raw_y} if dry_run else {}
        new_doc.update({
            "location_x": norm_x,
            "location_y": norm_y,
            "timestamp": group_key["timestamp"],
            **{field: doc.get(field) for field in ap_mapping.values()}
        })
        normalized_results.append(new_doc)

    if dry_run:
        print(f"Dry run: Would process {len(normalized_results)} documents")
        if normalized_results:
            print("Sample documents:")
            sample = normalized_results[0]
            print(f"  Raw location: {sample.get('raw_location_x')},{sample.get('raw_location_y')}")
            print(f"  Normalized location: {sample['location_x']:.4f},{sample['location_y']:.4f}")
            print(f"  timestamp: {datetime.fromtimestamp(sample['timestamp'])}")
            for ap in ap_mapping.values():
                print(f"  {ap}: {sample.get(ap, 'N/A')}")
            print()
        return normalized_results

    if not normalized_results:
        print("No documents matched the criteria")
        return []

    # Replace the previous contents of the output collection.
    db[output_collection_name].delete_many({})
    db[output_collection_name].insert_many(normalized_results)
    print(f"Successfully processed {len(normalized_results)} documents into {output_collection_name}")
    return normalized_results


In [20]:
# 
# Capture window and normalization origin for each triangle configuration.
# "start"/"end" are passed as the time range to transform_wifi_data; "origin"
# is the centroid of three points (presumably the AP positions on the 4x4 AP
# grid for that configuration — confirm).
triangle_dictionary = {
    "reto_grande": {
        "start":datetime(2025, 5, 13, 20, 10),
        "end":datetime(2025, 5, 13, 21, 42),
        "origin":calculate_centroid((0,0),(4,0),(0,4))
    },
    "reto_medio": {
        "start":datetime(2025, 5, 13, 21, 46),
        "end":datetime(2025, 5, 13, 22, 49),
        "origin":calculate_centroid((1,1),(3,1),(1,3))
    },
    "reto_pequeno": {
        "start":datetime(2025, 5, 13, 22, 51),
        "end":datetime(2025, 5, 13, 23, 53),
        "origin":calculate_centroid((1,1),(2,1),(1,2))
    },
    "equilatero_grande": {
        "start":datetime(2025, 6, 28, 19, 45),
        "end":datetime(2025, 6, 28, 21, 15),
        "origin":calculate_centroid((0,0),(4,0),(2,4))
    },
    "equilatero_medio": {
        "start":datetime(2025, 6, 28, 22, 5),
        "end":datetime(2025, 6, 28, 23, 30),
        # NOTE(review): the vertex (2,1) does not follow the shape of
        # "equilatero_grande" ((0,0),(4,0),(2,4)); possibly (3,1) was meant —
        # confirm against the physical setup.
        "origin":calculate_centroid((1,1),(2,1),(2,3))
    },
}
# NOTE(review): `collection` is not used anywhere in this cell.
collection = db["wifi_client_data"]

# Write one normalized collection per configuration, named
# "wifi_data_<triangle name>", reading from the merged global collection.
for triangle_name in triangle_dictionary.keys(): 
    input_collection = "wifi_client_data_global"
    current_triangle = triangle_dictionary[triangle_name]
    start_time  = current_triangle["start"]
    end_time    = current_triangle["end"]
    origin      = current_triangle["origin"]

    transform_wifi_data(db, origin[0], origin[1], start_time, end_time, dry_run=False, input_collection_name=input_collection, output_collection_name=f"wifi_data_{triangle_name}")

Successfully processed 19393 documents into wifi_data_reto_grande
Successfully processed 21140 documents into wifi_data_reto_medio
Successfully processed 23533 documents into wifi_data_reto_pequeno
Successfully processed 19318 documents into wifi_data_equilatero_grande
Successfully processed 21845 documents into wifi_data_equilatero_medio
