In [22]:
%%capture cap --no-stderr
# Sample output for the capture demo: the %%capture magic on the line above stores
# this cell's stdout in `cap` (--no-stderr leaves stderr uncaptured).
print("Hello World!")

In [23]:
from datetime import datetime


# Build a timestamped file name (format: YYYY-MM-DD-HH-MM-SS.txt) so that runs
# started in different seconds are written to different files.
now = datetime.now()
filename = now.strftime("%Y-%m-%d-%H-%M-%S") + '.txt'

# Persist the stdout collected by the `%%capture cap` cell. The encoding is set
# explicitly so the written bytes do not depend on the platform's locale default.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(cap.stdout)

# PCA

In [5]:
from enum import Enum, StrEnum

class Scaling(Enum):
    """Standardization strategies accepted by ``preprocess_rssi``."""
    # Standardize every column on its own (done with sklearn's StandardScaler).
    INDEPENDENT = 1
    # Standardize with a single mean/std computed over all values of the frame.
    JOINT = 2

class DatasetType(StrEnum):
    TRAIN = 'trn'
    TEST = 'tst'
    VALIDATION = 'trn'

# Global variable to enable debug mode
DEBUG = True

In [6]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

def load_dataset(paths: list[str], num_APs: int, floor: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load RSS readings and their coordinates, keeping only one floor.

    For every prefix in ``paths`` two CSV files without a header row are read:
    ``<prefix>rss.csv`` (one column per access point) and ``<prefix>crd.csv``
    (columns ``x``, ``y``, ``floor``). Rows of the two files are paired by
    position, so each pair must have the same number of rows.

    Args:
        paths: File-name prefixes; ``'rss.csv'`` and ``'crd.csv'`` are appended
            to each prefix verbatim.
        num_APs: Number of access-point columns. They are named ``AP0`` up to
            ``AP<num_APs - 1>``.
        floor: Value of the ``floor`` column whose rows are kept.

    Returns:
        ``(df_x, df_y)``: the RSS rows and the matching ``x``/``y`` rows of the
        requested floor, both re-indexed from 0. ``df_y`` has no ``floor``
        column.

    Raises:
        ValueError: If ``paths`` is empty, or if an rss/crd file pair does not
            contain the same number of rows.
    """
    if len(paths) == 0:
        raise ValueError("load_dataset needs at least one path prefix.")

    # The csv files do not have column names, so they are defined here.
    ap_columns = ["AP" + str(i) for i in range(0, num_APs)]
    crd_columns = ['x', 'y', 'floor']

    rss_frames = []
    crd_frames = []
    for path in paths:
        rss = pd.read_csv(path + 'rss.csv', names=ap_columns)
        crd = pd.read_csv(path + 'crd.csv', names=crd_columns)
        # Rows are matched purely by position; a length mismatch would pair
        # readings with the wrong coordinates, so fail loudly instead.
        if len(rss) != len(crd):
            raise ValueError(
                f"Row count mismatch for prefix {path!r}: "
                f"{len(rss)} rows in rss.csv vs {len(crd)} rows in crd.csv."
            )
        rss_frames.append(rss)
        crd_frames.append(crd)

    # Stack all files and give both frames the same 0..n-1 index.
    df_x = pd.concat(rss_frames).reset_index(drop=True)
    df_y = pd.concat(crd_frames).reset_index(drop=True)

    # Index labels of the rows that belong to the requested floor.
    floor_indexes = df_y[df_y['floor'] == floor].index

    # Keep only those rows in both frames; df_y also loses its floor column.
    df_x = df_x.loc[floor_indexes].reset_index(drop=True)
    df_y = df_y.loc[floor_indexes].drop(columns=['floor']).reset_index(drop=True)

    return df_x, df_y

def preprocess_rssi(df_rssi: pd.DataFrame, scaling_strategy: Scaling) -> pd.DataFrame:
    """Replace the placeholder value 100 and standardize the RSSI frame.

    Steps:
      1. Take the smallest value found anywhere in the frame and derive a
         replacement value: the largest multiple of 10 strictly below it.
      2. Replace every entry equal to 100 with that replacement value.
      3. Standardize the result according to ``scaling_strategy``.

    The minimum and the replacement value are printed. The input frame is not
    modified; a new frame is returned.

    Args:
        df_rssi: Frame of RSSI values, one column per access point.
        scaling_strategy: ``Scaling.INDEPENDENT`` standardizes each column
            separately; ``Scaling.JOINT`` uses one mean/std over all values.

    Returns:
        The standardized frame, with the same columns and row index as the
        input.

    Raises:
        NotImplementedError: If ``scaling_strategy`` is neither
            ``Scaling.INDEPENDENT`` nor ``Scaling.JOINT``.
    """
    # Minimum rssi found over the whole frame (all columns together).
    min_rssi = np.min(df_rssi.values.flatten())
    print("Minimum RSSI: ", min_rssi)

    # Biggest multiple of 10 strictly smaller than min_rssi
    # (e.g. -96 -> -100, and -100 -> -110).
    replacement_rssi = np.floor((min_rssi - 1) / 10) * 10
    print("Replacement value", replacement_rssi)

    # Replace all 100 values with replacement_rssi.
    df_rssi = df_rssi.replace(100, replacement_rssi)

    # Standardization part
    if scaling_strategy == Scaling.INDEPENDENT:
        scaler = preprocessing.StandardScaler()
        scaled_rss = scaler.fit_transform(df_rssi)

        # Rebuild the frame with the original column labels AND row index, so
        # the result stays aligned with the input exactly like the JOINT branch.
        df_rssi = pd.DataFrame(scaled_rss, columns=df_rssi.columns, index=df_rssi.index)
    elif scaling_strategy == Scaling.JOINT:
        # Statistics are taken over the frame after the replacement above.
        flattened = df_rssi.values.flatten()
        global_mean = np.mean(flattened)
        global_std = np.std(flattened)

        # A constant frame has zero spread and dividing by it would yield NaN.
        # Divide by 1 instead (the convention StandardScaler uses for
        # zero-variance columns), which gives an all-zero result.
        if global_std == 0:
            global_std = 1.0

        df_rssi = (df_rssi - global_mean) / global_std
    else:
        raise NotImplementedError("Specified scaling strategy is not implemented, use either Scaling.INDEPENDENT or Scaling.JOINT.")

    return df_rssi

In [7]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- Configuration -----------------------------------------------------------
# Path prefixes of the data to load; load_dataset appends 'rss.csv' and
# 'crd.csv' to each prefix.
data_paths = [
    './data/V1.0/01/trn01'
]
# Number of access-point columns that load_dataset names AP0..AP447.
num_APs = 448
# JOINT: standardize with one mean/std computed over all RSSI values.
scaling_strategy = Scaling.JOINT
# Only rows whose 'floor' column equals this value are kept.
floor = 3

# df_x holds the RSSI rows, df_y the matching x/y coordinates (df_y is not used
# further in this cell).
df_x, df_y = load_dataset(data_paths, num_APs, floor)

# Replace the placeholder value 100 and standardize the RSSI values.
train_x_scaled = preprocess_rssi(df_x, scaling_strategy)

# Apply PCA to reduce dimensionality to 128 components
pca_128 = PCA(n_components=128)
train_x_pca_128 = pca_128.fit_transform(train_x_scaled)

# Apply PCA to reduce dimensionality to 64 components
# (a second, independent fit on the same scaled data)
pca_64 = PCA(n_components=64)
train_x_pca_64 = pca_64.fit_transform(train_x_scaled)

# Inspect the explained variance ratio: summing the per-component ratios gives
# the fraction of total variance retained by each projection.
explained_variance_128 = np.sum(pca_128.explained_variance_ratio_)
explained_variance_64 = np.sum(pca_64.explained_variance_ratio_)

# NOTE(review): with two decimals both values show as 1.00 in the recorded
# output, which hides any difference between 64 and 128 components.
print(f'Explained variance by 128 components: {explained_variance_128:.2f}')
print(f'Explained variance by 64 components: {explained_variance_64:.2f}')

# Outputs the shapes of the transformed datasets: (rows, components)
print('Shape of train_x_pca_128:', train_x_pca_128.shape)
print('Shape of train_x_pca_64:', train_x_pca_64.shape)

Minimum RSSI:  -96
Replacement value -100.0
Explained variance by 128 components: 1.00
Explained variance by 64 components: 1.00
Shape of train_x_pca_128: (288, 128)
Shape of train_x_pca_64: (288, 64)
