In [10]:
%%capture 
!pip install -r requirements.txt

In [31]:
from enum import Enum

class Scaling(Enum):
    INDEPENDENT = 1
    JOINT = 2


# Global variable to enable debug mode
DEBUG = True

#### Data loading & pre-processing

In [60]:
import pandas as pd
import numpy as np
from sklearn import preprocessing



def preprocess_training_data(train: pd.DataFrame, scaling_strategy: Scaling) -> pd.DataFrame:
    """
    This function preprocesses the training data by:
    1. Replacing all 100 values with -110 (ensures continuity of data)
    2. Separating the RSS values from the labels
    3. Scaling the data to have zero mean and unit variance

    Parameters:
    - train: The training data to be preprocessed
    - scaling_strategy: The scaling strategy to be used (INDEPENDENT or JOINT)
    """
    
    # 1. replace all 100 values with -110 (ensures continuity of data)
    df = train.replace(100, -110)
    
    # 2. Separate the RSS values from the labels
    rssValues = df.iloc[:, :-3]
    labels = df.iloc[:, -3:]
    
    # 3. Scale the data to have zero mean and unit variance
    # This is done either independently for each AP or jointly for all APs
    if scaling_strategy == Scaling.INDEPENDENT:
        scaler = preprocessing.StandardScaler()
        # df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
        # df = df.reset_index(drop=True)
        scaled_rss = scaler.fit_transform(rssValues)
        df_scaled_rss = pd.DataFrame(scaled_rss, columns=rssValues.columns)
        df = pd.concat([df_scaled_rss, labels], axis=1)
    
    elif scaling_strategy == Scaling.JOINT:
        flattened = rssValues.values.flatten()
        global_mean = np.mean(flattened)
        global_std = np.std(flattened)
        
        scaled_rss = (rssValues - global_mean) / global_std
        df = pd.concat([scaled_rss, labels], axis=1)
        df = df.reset_index(drop=True)
    
    else: 
        raise NotImplementedError("Specified scaling strategy is not implemented, use either Scaling.INDEPENDENT or Scaling.JOINT.")
    
    return df

def get_preprocessed_training_data(data_path: str, training_months: list[str], num_APs: int, scaling_strategy: Scaling, floor: int) -> pd.DataFrame:
    """
    This function loads and preprocesses the training data from the specified training months and floor.

    Parameters:
    - data_path: The path to the data
    - training_months: The list of training months to be used
    - num_APs: The number of access points
    - scaling_strategy: The scaling strategy to be used (INDEPENDENT or JOINT)
    - floor: The floor to be used
    """
    # Since the csv files do not have column names, we define these first.
    list_of_APs = ["AP" + str(i) for i in range(0, num_APs)]

    # Load the training data from all specified training sets.  
    df_rss = pd.concat([pd.read_csv(data_path + training_set + 'trn01rss.csv', names=list_of_APs) for training_set in training_months])
    df_rss = df_rss.reset_index(drop=True)
    
    # Get all x,y,floor labels (gotten from data_path + training_month + 'trn01crd.csv')
    df_labels = pd.concat([pd.read_csv(data_path + training_set + 'trn01crd.csv', names=['x', 'y', 'floor']) for training_set in training_months])
    df_labels = df_labels.reset_index(drop=True)

    # Add the labels to the pre-processed data
    df_labeled = pd.concat([df_rss, df_labels], axis=1)
    
    # Filter the data to only include the specified floor
    df_labeled = df_labeled[df_labeled['floor'] == floor]

    # Pre-processing of the training data
    df_train = preprocess_training_data(df_labeled, scaling_strategy)
    
    return df_train

#! We might want to either get a specific testing set (color in blueprint) or a combination
#! Hence we need a separate function for the testing data
#TODO: Extend functionality
def get_preprocessed_test_data(data_path: str, test_months: list[str], test_set: list[str], num_APs: int, scaling_strategy: Scaling, floor: int) -> pd.DataFrame:
    """
    This function loads and preprocesses the training data from the specified training months and floor.

    Parameters:
    - data_path: The path to the data
    - test_months: The list of training months to be used
    - test_set: The list of test sets to be used
    - num_APs: The number of access points
    - scaling_strategy: The scaling strategy to be used (INDEPENDENT or JOINT)
    - floor: The floor to be used
    """
    # Since the csv files do not have column names, we define these first.
    list_of_APs = ["AP" + str(i) for i in range(0, num_APs)]

    # Load the test data from all specified test sets.  
    df_rss = pd.concat([pd.read_csv(data_path + month + 'tst' + type + 'rss.csv', names=list_of_APs) for month in test_months for type in test_set])
    df_rss = df_rss.reset_index(drop=True)
    
    # Get all x,y,floor labels
    df_labels = pd.concat([pd.read_csv(data_path + month + 'tst' + type + 'rss.csv', names=['x', 'y', 'floor']) for month in test_months for type in test_set])
    df_labels = df_labels.reset_index(drop=True)

    # Add the labels to the pre-processed data
    df_labeled = pd.concat([df_rss, df_labels], axis=1)
    
    # Filter the data to only include the specified floor
    df_labeled = df_labeled[df_labeled['floor'] == floor]

    # Pre-processing of the training data
    df_test = preprocess_training_data(df_labeled, scaling_strategy)
    
    return df_test
    

#### SETUP

In [63]:
data_path = './data/'
training_months = ['02/', '03/', '04/', '05/']
num_APs = 620
scaling_strategy = Scaling.JOINT
floor = 3

test_months = ['02/', '03/', '04/', '05/']
test_set = ['01']

df_train_full = get_preprocessed_training_data(data_path, training_months, num_APs, scaling_strategy, floor)
df_train_x = df_train_full.iloc[:, :-3] # Just the RSSI values
df_train_y = df_train_full.iloc[:, -3:-1] # Just the x and y coordinates (no floor)

#TODO: Check warnings that are thrown about empty slices! --> is the data loaded correctly?
df_test_full = get_preprocessed_test_data(data_path, test_months, test_set, num_APs, scaling_strategy, floor)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


#### MLP Networks