In [10]:
%%capture 
!pip install -r requirements.txt

In [31]:
from enum import Enum

class Scaling(Enum):
    INDEPENDENT = 1
    JOINT = 2


# Global variable to enable debug mode
DEBUG = True

### Data loading & pre-processing

In [51]:
import pandas as pd
import numpy as np
from sklearn import preprocessing



def preprocess_training_data(train: pd.DataFrame, scaling_strategy: Scaling) -> pd.DataFrame:
    """
    This function preprocesses the training data by:
    1. Replacing all 100 values with -110 (ensures continuity of data)
    2. Separating the RSS values from the labels
    3. Scaling the data to have zero mean and unit variance

    Parameters:
    - train: The training data to be preprocessed
    - scaling_strategy: The scaling strategy to be used (INDEPENDENT or JOINT)
    """
    
    # 1. replace all 100 values with -110 (ensures continuity of data)
    df = train.replace(100, -110)
    
    # 2. Separate the RSS values from the labels
    rssValues = df.iloc[:, :-3]
    labels = df.iloc[:, -3:]
    
    # 3. Scale the data to have zero mean and unit variance
    # This is done either independently for each AP or jointly for all APs
    if scaling_strategy == Scaling.INDEPENDENT:
        scaler = preprocessing.StandardScaler()
        # df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
        # df = df.reset_index(drop=True)
        scaled_rss = scaler.fit_transform(rssValues)
        df_scaled_rss = pd.DataFrame(scaled_rss, columns=rssValues.columns)
        df = pd.concat([df_scaled_rss, labels], axis=1)
    
    elif scaling_strategy == Scaling.JOINT:
        flattened = rssValues.values.flatten()
        global_mean = np.mean(flattened)
        global_std = np.std(flattened)
        
        scaled_rss = (rssValues - global_mean) / global_std
        df = pd.concat([scaled_rss, labels], axis=1)
        df = df.reset_index(drop=True)
    
    else: 
        raise NotImplementedError("Specified scaling strategy is not implemented, use either Scaling.INDEPENDENT or Scaling.JOINT.")
    
    return df

def get_preprocessed_data(data_path: str, training_months: list[str], num_APs: int, scaling_strategy: Scaling, floor: int) -> pd.DataFrame:
    """
    This function loads and preprocesses the training data from the specified training months and floor.

    Parameters:
    - data_path: The path to the data
    - training_months: The list of training months to be used
    - num_APs: The number of access points
    - scaling_strategy: The scaling strategy to be used (INDEPENDENT or JOINT)
    - floor: The floor to be used
    """
    # Since the csv files do not have column names, we define these first.
    list_of_APs = ["AP" + str(i) for i in range(0, num_APs)]

    # Load the training data from all specified training sets.  
    df_rss = pd.concat([pd.read_csv(data_path + training_set + 'trn01rss.csv', names=list_of_APs) for training_set in training_months])
    df_rss = df_rss.reset_index(drop=True)
    
    # Get all x,y,floor labels (gotten from data_path + training_month + 'trn01crd.csv')
    df_labels = pd.concat([pd.read_csv(data_path + training_set + 'trn01crd.csv', names=['x', 'y', 'floor']) for training_set in training_months])
    df_labels = df_labels.reset_index(drop=True)

    # Add the labels to the pre-processed data
    df_labeled = pd.concat([df_rss, df_labels], axis=1)
    
    # Filter the data to only include the specified floor
    df_labeled = df_labeled[df_labeled['floor'] == floor]

    # Pre-processing of the training data
    df_train = preprocess_training_data(df_labeled, scaling_strategy)
    
    

    return df_train
    

In [50]:
data_pathh = './data/'
training_months = ['02/', '03/', '04/', '05/']
num_APs = 620
scaling_strategy = Scaling.JOINT
floor = 3


df_train = get_preprocessed_data(data_pathh, training_months, num_APs, scaling_strategy, floor)
df_train

Unnamed: 0,AP0,AP1,AP2,AP3,AP4,AP5,AP6,AP7,AP8,AP9,...,AP613,AP614,AP615,AP616,AP617,AP618,AP619,x,y,floor
0,4.526092,11.176402,7.851247,4.134897,5.308481,4.134897,7.851247,3.743702,-0.168245,5.308481,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
1,4.330494,11.763194,9.807220,4.526092,7.068857,4.721689,9.807220,4.721689,-0.168245,6.873260,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
2,4.526092,11.958791,9.416026,4.917286,5.699676,-0.168245,9.416026,4.917286,-0.168245,6.090871,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
3,-0.168245,11.958791,9.416026,4.917286,6.286468,4.526092,9.416026,4.917286,-0.168245,6.286468,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
4,-0.168245,12.154389,9.220428,4.721689,6.090871,4.526092,9.416026,-0.168245,-0.168245,6.286468,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,6.090871,8.242442,6.286468,5.504078,4.526092,6.090871,-0.168245,5.895273,8.438039,4.526092,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
1148,5.895273,8.633636,6.090871,6.090871,-0.168245,6.677663,-0.168245,6.286468,8.438039,-0.168245,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
1149,6.482065,7.068857,5.699676,6.677663,4.917286,6.482065,-0.168245,6.677663,7.068857,-0.168245,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3
1150,6.090871,8.046844,7.655649,5.895273,-0.168245,6.286468,6.677663,6.286468,8.242442,-0.168245,...,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,-0.168245,12.913852,29.216544,3


In [52]:
get_preprocessed_data.__doc__

Help on function get_preprocessed_data in module __main__:

get_preprocessed_data(data_path: str, training_months: list[str], num_APs: int, scaling_strategy: __main__.Scaling, floor: int) -> pandas.core.frame.DataFrame
    This function loads and preprocesses the training data from the specified training months and floor.
    
    Parameters:
    - data_path: The path to the data
    - training_months: The list of training months to be used
    - num_APs: The number of access points
    - scaling_strategy: The scaling strategy to be used (INDEPENDENT or JOINT)
    - floor: The floor to be used

