# Grid search

In [57]:
import torch
from torch.utils.data import Dataset
from pymongo import MongoClient
import numpy as np


    
def process_data(data):
    """
    Preprocess the MongoDB documents into a single array with 5 columns.
    Columns: AP1_rssi, AP2_rssi, AP3_rssi, location_x, location_y
    
    Handles NaN values by:
    1. Replacing NaN RSSI values with -100 (standard for missing signal)
    2. Ensuring coordinates are always valid numbers
    """
    combined_data = []
    
    for entry in data:
        # Safely extract RSSI values, handling missing/NaN values
        rssi_values = [
            float(entry.get('AP1_rssi', -100)) if entry.get('AP1_rssi', -100) != None else -100,
            float(entry.get('AP2_rssi', -100)) if entry.get('AP2_rssi', -100) != None else -100,
            float(entry.get('AP3_rssi', -100)) if entry.get('AP3_rssi', -100) != None else -100
        ]
        
        # Validate coordinates
        try:
            x_coord = float(entry['location_x'])
            y_coord = float(entry['location_y'])
            if np.isnan(x_coord) or np.isnan(y_coord):
                continue  # Skip this entry if coordinates are invalid
        except (KeyError, ValueError):
            continue  # Skip this entry if coordinates are missing or invalid
            
        # Combine all values into one row
        combined_row = rssi_values + [x_coord, y_coord]
        combined_data.append(combined_row)
    
    # Convert to numpy array and verify no NaNs remain
    result = np.array(combined_data, dtype=np.float32)
    assert not np.isnan(result).any(), "NaN values detected in final output!"
    
    return result

def get_dataset(collection_name, db_name):
    """
    Args:
        collection_name (str): Name of the MongoDB collection to use
        db_name (str): Name of the MongoDB database
    """
    # Connect to MongoDB
    client = MongoClient('mongodb://localhost:28910/')
    db = client[db_name]
    collection = db[collection_name]
    
    # Load all data from the collection
    data = list(collection.find())
    
    # Preprocess the data to extract features and labels
    return process_data(data)


def split_combined_data(combined_array, num_ap=3):

    # Split the array into features (RSSI values) and labels (coordinates)
    features = combined_array[:, :num_ap]  # First num_ap columns are RSSI values
    labels = combined_array[:, num_ap:]    # Last 2 columns are coordinates
    
    return features, labels

def combine_arrays(arrays):
    return np.vstack(arrays)

def shuffle_array(arr, random_state=None):
    np.random.seed(random_state)
    shuffled_arr = arr.copy()
    np.random.shuffle(shuffled_arr)
    return shuffled_arr



In [58]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product
import math
from collections import OrderedDict
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GeneratedModel(nn.Module):
    def __init__(self, input_size, output_size, architecture_config):
        super(GeneratedModel, self).__init__()
        self.layers = nn.ModuleDict()
        self.architecture_config = architecture_config
        
        # Build the network dynamically
        prev_size = input_size
        layer_counter = 1
        
        for layer_spec in architecture_config['hidden_layers']:
            # Add linear layer
            layer_size = layer_spec['units']
            self.layers[f'linear_{layer_counter}'] = nn.Linear(prev_size, layer_size)
            prev_size = layer_size
            
            # Add batch norm if specified
            if layer_spec.get('batch_norm', False):
                self.layers[f'batchnorm_{layer_counter}'] = nn.BatchNorm1d(layer_size)
            
            # Add activation
            activation = layer_spec.get('activation', 'relu')
            if activation == 'relu':
                self.layers[f'activation_{layer_counter}'] = nn.ReLU()
            elif activation == 'leaky_relu':
                self.layers[f'activation_{layer_counter}'] = nn.LeakyReLU(0.1)
            
            # Add dropout if specified
            if 'dropout' in layer_spec:
                self.layers[f'dropout_{layer_counter}'] = nn.Dropout(layer_spec['dropout'])
            
            layer_counter += 1
        
        # Output layer
        self.output_layer = nn.Linear(prev_size, output_size)
        
        # Convert ModuleDict to Sequential
        self.net = nn.Sequential(self.layers)
    
    def forward(self, x):
        features = self.net(x)
        position = self.output_layer(features)
        return position, None  # Returning None for uncertainty

# Generate all models for a config

In [59]:
def generate_model_configs(search_space):
    """
    Generate all possible model configurations from the search space
    """
    # Convert search space dict to lists of options
    keys, values = zip(*search_space.items())
    
    # Generate all combinations
    configs = []
    for combination in product(*values):
        config = dict(zip(keys, combination))
        
        # Build hidden layers specification
        hidden_layers = []
        for i in range(config['num_layers']):
            layer_spec = {
                'units': config['layer_size'],
                'batch_norm': config['batch_norm'],
                'activation': config['activation'],
                'dropout': config['dropout'] if config['use_dropout'] else None
            }
            hidden_layers.append(layer_spec)
        
        # Create final architecture config
        architecture_config = {
            'hidden_layers': hidden_layers,
            'attention': config['attention'],
            'uncertainty_estimation': config['uncertainty_estimation']
        }
        
        configs.append({
            'name': f"Model_{len(configs)+1}",
            'config': architecture_config,
            'params': config
        })
    
    return configs

# Train generated model

In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import numpy as np
import os
from tqdm import tqdm
import math


def setup(rank, world_size):
    """Initialize the distributed environment"""
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Clean up distributed processes"""
    dist.destroy_process_group()


def train_model(rank, world_size, model_config, X_train, y_train, X_val, y_val, 
               epochs=100, batch_size=32, learning_rate=0.001):
    """Train a model on a specific GPU"""
    setup(rank, world_size)
    
    # Split data across GPUs
    train_size = len(X_train)
    indices = list(range(rank, train_size, world_size))
    
    # Convert to tensors and move to current GPU
    X_train_tensor = torch.FloatTensor(X_train[indices]).to(rank)
    y_train_tensor = torch.FloatTensor(y_train[indices]).to(rank)
    X_val_tensor = torch.FloatTensor(X_val).to(rank)
    y_val_tensor = torch.FloatTensor(y_val).to(rank)
    
    # Create DataLoader
    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, 
        batch_size=batch_size // world_size,
        shuffle=True
    )
    
    # Initialize model and wrap with DDP
    model = GeneratedModel(
        input_size=X_train.shape[1],
        output_size=y_train.shape[1],
        architecture_config=model_config['config']
    ).to(rank)
    model = DDP(model, device_ids=[rank])
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    train_loss_history = []
    val_loss_history = []
    
    progress_bar = tqdm(range(epochs), desc=f'GPU {rank}: {model_config["name"]}', position=rank)
    
    for epoch in progress_bar:
        model.train()
        batch_losses = []
        
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        
        # Calculate epoch metrics
        train_loss = np.mean(batch_losses)
        train_loss_history.append(train_loss)
        
        # Validation (only on rank 0)
        if rank == 0:
            model.eval()
            with torch.no_grad():
                val_outputs, _ = model(X_val_tensor)
                val_loss = criterion(val_outputs, y_val_tensor).item()
                val_loss_history.append(val_loss)
            
            progress_bar.set_postfix({
                'train_loss': f'{train_loss:.4f}',
                'val_loss': f'{val_loss:.4f}'
            })
    
    # Final evaluation (only on rank 0)
    if rank == 0:
        model.eval()
        with torch.no_grad():
            # Gather predictions from all GPUs
            all_train_preds = []
            all_val_preds = []
            
            for i in range(world_size):
                idx = list(range(i, len(X_train), world_size))
                X_part = torch.FloatTensor(X_train[idx]).to(rank)
                preds, _ = model(X_part)
                all_train_preds.append(preds.cpu().numpy())
                
                X_val_part = torch.FloatTensor(X_val).to(rank)
                val_preds, _ = model(X_val_part)
                all_val_preds.append(val_preds.cpu().numpy())
            
            train_preds = np.concatenate(all_train_preds)
            val_preds = np.concatenate(all_val_preds)
            
            train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
            train_mae = mean_absolute_error(y_train, train_preds)
            val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
            val_mae = mean_absolute_error(y_val, val_preds)
        
        result = {
            'model': model.module,
            'train_loss_history': train_loss_history,
            'val_loss_history': val_loss_history,
            'train_rmse': train_rmse,
            'train_mae': train_mae,
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'model_name': model_config['name'],
            'params': model_config['params']
        }
    else:
        result = None
    
    cleanup()
    return result

# Run Grid Search

In [61]:
def run_multi_gpu_grid_search(X_train, y_train, X_val, y_val, search_space, epochs=200):
    """Run grid search using all available GPUs"""
    # Generate all model configurations
    model_configs = generate_model_configs(search_space)
    
    world_size = torch.cuda.device_count()
    print(f"Found {world_size} GPUs. Using all of them for training.")
    
    results = []
    
    # Set start method only if not already set
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    
    for config in model_configs:
        print(f"\nTraining {config['name']} with config:")
        print(config['params'])
        
        # Launch training on all GPUs
        mp.spawn(
            train_model,
            args=(world_size, config, X_train, y_train, X_val, y_val, epochs),
            nprocs=world_size,
            join=True
        )
        
        # Only rank 0 returns results
        if torch.cuda.current_device() == 0:
            result = train_model(
                0, world_size, config, X_train, y_train, X_val, y_val, epochs
            )
            results.append(result)
    
    return results

def plot_programmatic_results(results):
    """Plot results from programmatic grid search"""
    num_models = len(results)
    cols = 2
    rows = math.ceil(num_models / cols)
    
    plt.figure(figsize=(15, 5 * rows))
    
    for i, result in enumerate(results, 1):
        plt.subplot(rows, cols, i)
        plt.plot(result['train_loss_history'], label='Train Loss')
        plt.plot(result['val_loss_history'], label='Validation Loss')
        
        params = result['params']
        title = (f"{result['model_name']}\n"
                f"Layers: {params['num_layers']}, Size: {params['layer_size']}\n"
                f"Act: {params['activation']}, BN: {params['batch_norm']}\n"
                f"Dropout: {params['dropout'] if params['use_dropout'] else 'No'}\n"
                f"Val RMSE: {result['val_rmse']:.2f}, Val MAE: {result['val_mae']:.2f}")
        
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Print summary table
    print("\nModel Performance Summary:")
    print("{:<10} {:<10} {:<10} {:<15} {:<15} {:<15} {:<15} {:<15}".format(
        "Model", "Layers", "Size", "Activation", "BatchNorm", "Dropout", 
        "Val RMSE", "Val MAE"))
    
    for result in results:
        params = result['params']
        print("{:<10} {:<10} {:<10} {:<15} {:<15} {:<15} {:<15.4f} {:<15.4f}".format(
            result['model_name'],
            params['num_layers'],
            params['layer_size'],
            params['activation'],
            "Yes" if params['batch_norm'] else "No",
            f"{params['dropout']}" if params['use_dropout'] else "No",
            result['val_rmse'],
            result['val_mae']))

# Execute Code

In [62]:
# Get datasets from all collections
datasets = [
    get_dataset("wifi_data_reto_grande", "wifi_data_db"),
    get_dataset("wifi_data_reto_pequeno", "wifi_data_db"),
    get_dataset("wifi_data_reto_medio", "wifi_data_db")
]

# Combine all datasets into one array
combined_data = combine_arrays(datasets)

# Shuffle the combined data
shuffled_data = shuffle_array(combined_data)

# Split into features and labels
#training_x, training_y = split_combined_data(shuffled_data)
#validation_x, validation_y = split_combined_data(get_dataset("wifi_data_reto_medio", "wifi_data_db"))
global_array_x, global_array_y = split_combined_data(shuffled_data)


# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(global_array_x, global_array_y, test_size=0.2, random_state=42)
#X_train, X_val, y_train, y_val = training_x, validation_x, training_y, validation_y

search_space = {
    'num_layers': [3, 4],
    'layer_size': [128, 256],
    'activation': ['relu', 'leaky_relu'],
    'batch_norm': [True, False],
    'use_dropout': [True],
    'dropout': [0.3],
    'attention': [False],
    'uncertainty_estimation': [False]
}

# Run the multi-GPU grid search
results = run_multi_gpu_grid_search(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    search_space=search_space,
    epochs=200
)

Found 6 GPUs. Using all of them for training.

Training Model_1 with config:
{'num_layers': 3, 'layer_size': 128, 'activation': 'relu', 'batch_norm': True, 'use_dropout': True, 'dropout': 0.3, 'attention': False, 'uncertainty_estimation': False}


Traceback (most recent call last):
  File [35m"<string>"[0m, line [35m1[0m, in [35m<module>[0m
    from multiprocessing.spawn import spawn_main; [31mspawn_main[0m[1;31m(tracker_fd=149, pipe_handle=151)[0m
                                                  [31m~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"/home/admindi/miniforge3/envs/location-env/lib/python3.13/multiprocessing/spawn.py"[0m, line [35m122[0m, in [35mspawn_main[0m
    exitcode = _main(fd, parent_sentinel)
  File [35m"/home/admindi/miniforge3/envs/location-env/lib/python3.13/multiprocessing/spawn.py"[0m, line [35m132[0m, in [35m_main[0m
    self = reduction.pickle.load(from_parent)
[1;35mAttributeError[0m: [35mCan't get attribute 'train_model' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>[0m


KeyboardInterrupt: 

In [None]:
# Plot results (only need to do this once)
if torch.cuda.current_device() == 0:
    plot_programmatic_results(results)