In [26]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import heapq
import cmath
import warnings
from enum import Enum
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score, auc, roc_curve
from scipy.cluster import vq
from collections import Counter

In [27]:
# Environment switches are exported BEFORE TensorFlow is imported: the legacy-Keras
# switch is only honoured when it is already set at import time.
os.environ["TF_USE_LEGACY_KERAS"]= '1' # Use legacy keras for compatibility
os.environ['TF_DETERMINISTIC_OPS'] = '1' # make operations deterministic

random_state = 42
# NOTE(review): PYTHONHASHSEED only affects the running interpreter when it is set
# before Python starts; exporting it here mainly benefits child processes.
os.environ['PYTHONHASHSEED'] = str(random_state) # reproducibility

import tensorflow as tf
import tf_keras

warnings.filterwarnings("ignore")

# Seed every random number generator used in the notebook.
random.seed(random_state)
np.random.seed(random_state) # predictable random numbers, for demonstration only
tf.random.set_seed(random_state) # reproducibility

In [28]:
class Compression_Method(Enum):
    """Selects which representation of the CSI data is compressed with PCA.

    The member chosen in the configuration below is compared against in
    ``data_preprocessing``, ``reconstruct_data`` and ``apply_filtering``.
    """
    XY = 1               #applies PCA on X and Y (real/imaginary CSI parts) then filters (1)
    AmpPhase = 2         #applies PCA on Amplitude and Phase then filters (2)
    AmpPhaseFiltered = 3 #applies PCA on Amplitude and Phase after filtering (3)

#Modify this to change the approach used: XY, AmpPhase, AmpPhaseFiltered
method = Compression_Method.AmpPhase
# Scaler instance shared through the notebook (reconstruct_data expects a fitted scaler argument).
scaler = StandardScaler()
# When True, data_preprocessing drops every column whose name starts with 'Phase'.
ignorePhases = True
# When True, plot_MuStdAmplPaper also saves the comparison figure to disk.
saveCSV = True

# Root directory for generated artefacts; created up front so later writes do not fail.
base_directory = 'results'
os.makedirs(base_directory, exist_ok=True)

# Variable and Function Definitions

In [29]:
# Subcarrier indexes -32..-29, 0 and 29..31: null columns in the dataset (dropped in data_preprocessing)
notInterestedIndexes = list(range(-32,-28)) + list(range(0,1)) + list(range(29,32)) #null columns in the dataset
# Subcarrier indexes -28..-1 and 1..28: non-null columns in the dataset
interestedIndexes = list(range(-28,0)) + list(range(1,29)) #non null columns in the dataset

w1=5 #for filtering
w2=3 #for windows
lambda1=3 #threshold

#build ground truth
# Every (lower, upper) pair is one closed interval that getGT labels as presence (1).
# The second group of intervals is shifted by the offset t2.
# NOTE(review): values are presumably seconds from the start of the capture - confirm.
t2 = 1205
lb1 = [120,360,600,900]
ub1 = [240,480,720,1080]
lb2 = [t2+l for l in [180,540,990,1500]]
ub2 = [t2+u for u in [360,750,1170,1590]]

# Concatenated bounds; lower_bounds[i] pairs with upper_bounds[i].
lower_bounds = lb1+lb2
upper_bounds = ub1+ub2

In [30]:
def getGT(timestamp,lower_bounds,upper_bounds):
    """Return the ground-truth label for ``timestamp``.

    Args:
        timestamp: value to classify.
        lower_bounds: inclusive interval starts.
        upper_bounds: inclusive interval ends; ``upper_bounds[i]`` pairs with
            ``lower_bounds[i]``.

    Returns:
        1 if ``timestamp`` falls inside at least one closed interval, else 0.
    """
    for position, lower in enumerate(lower_bounds):
        upper = upper_bounds[position]
        inside_interval = (timestamp >= lower) & (timestamp <= upper)
        if inside_interval:
            return 1
    # No interval contains the timestamp.
    return 0

def classify_presence(df, ycol="MuStdAmplPaper", plot_roc=False, num_iter=1000, thr=None):
    """Threshold-classify presence from one feature column and report the metrics.

    A sample is predicted as present (1) when ``df[ycol] >= thr``. When ``thr`` is
    None, thresholds from the column minimum to its maximum are swept in
    ``num_iter`` steps and the one selected by ``select_threshold`` is used.

    Args:
        df: frame with a "Label" column (0/1) and the feature column ``ycol``.
        ycol: name of the feature column that is thresholded.
        plot_roc: when True (and the threshold is searched), plot the ROC curve.
        num_iter: number of steps of the threshold sweep.
        thr: fixed threshold; None triggers the sweep.

    Returns:
        Tuple ``(AUC, f1_score, thr, precision, recall)`` for the final threshold.
    """
    # Y are the labels that indicate if i'm passing or not
    Y = df["Label"]
    scores = df[ycol]
    tpr = []
    fpr = []
    # thr is the threshold: if amplitude >= thr, then assign to Y_pred 1 (presence), otherwise 0. Every time update the threshold
    if thr is None:
        lowest, highest = scores.min(), scores.max()
        step = (highest - lowest) / num_iter
        thr = lowest
        thr_list = []
        while thr <= highest:
            # compute the predictions (vectorised; same result as a row-wise comparison)
            Y_pred = (scores >= thr).astype(int)
            # labels=[0, 1] guarantees a 2x2 matrix even when only one class occurs
            tn, fp, fn, tp = confusion_matrix(Y, Y_pred, labels=[0, 1]).ravel()
            # compute True Positive Rate and False Positive rate to plot the roc curve
            tpr.append(tp/(tp+fn))
            fpr.append(fp/(fp+tn))
            thr_list.append(thr)
            if not step > 0:
                # Constant column (or degenerate step): a single threshold is all there
                # is to test, and adding a zero step would loop forever.
                break
            thr += step
        thr = select_threshold(thr_list, df, ycol) # select the threshold that maximizes the f1 score for class 1

        if plot_roc:
            plt.figure(figsize=(3,3),dpi=220)
            plt.plot(fpr, tpr)
            plt.plot([0, 1], [0, 1], color = 'green')
            plt.xlim(-0.05, 1.05)
            plt.ylim(-0.05, 1.05)
            plt.grid()
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.title("ROC curve")
            plt.show()

    Y_pred_final = (scores >= thr).astype(int)
    f1_score, precision, recall = compute_f1(Y, Y_pred_final)
    if len(tpr) < 2 or len(fpr) < 2:
        # No sweep (fixed threshold) or a single-point sweep: auc() needs at least two
        # points, so fall back to scikit-learn's ROC curve.
        fpr, tpr, _ = roc_curve(Y, scores)
    AUC = auc(fpr, tpr)

    print(classification_report(Y, Y_pred_final, target_names=["absent", "present"]))
    print("AUC: ", AUC)
    print("F1 Score: ", f1_score)

    return AUC, f1_score, thr, precision, recall

def compute_f1(Y, Y_pred, label=1):
    """Compute F1, precision and recall for the class ``label``.

    Args:
        Y: true labels.
        Y_pred: predicted labels.
        label: class treated as positive.

    Returns:
        Tuple ``(f1, precision, recall)``.
    """
    prec = precision_score(Y, Y_pred, pos_label=label)
    rec = recall_score(Y, Y_pred, pos_label=label)
    # The small constant keeps the division defined when both metrics are 0.
    harmonic_denominator = prec + rec + 1e-10
    score = 2 * (prec * rec) / harmonic_denominator
    return score, prec, rec

def select_threshold(thr_list, df, ycol, label=1):
    """Pick the threshold from ``thr_list`` with the highest F1 score for ``label``.

    Args:
        thr_list: candidate thresholds.
        df: frame with a "Label" column and the feature column ``ycol``.
        ycol: name of the thresholded feature column.
        label: class whose F1 score is maximised.

    Returns:
        The first candidate that reaches the maximum F1 score.
    """
    truth = df["Label"]

    def _f1_at(candidate):
        # Predict presence wherever the feature reaches the candidate threshold.
        predicted = (df[ycol] >= candidate).astype(int)
        score, _, _ = compute_f1(truth, predicted, label)
        return score

    f1_per_candidate = [_f1_at(candidate) for candidate in thr_list]
    best_position = np.argmax(f1_per_candidate)
    return thr_list[best_position]

def extractWindowedFeatures(data, column_indexes=None):
    """Build the per-time-window feature ``MuStdAmplPaper``.

    For every time window the standard deviation of each amplitude column is
    computed, and those standard deviations are averaged into a single value.

    Args:
        data: frame with a "TimeWindow" column and the amplitude columns.
        column_indexes: column names to consider; only those starting with
            'Ampl' are used. None (default) means all columns of ``data``.

    Returns:
        DataFrame with columns "Time" (window key) and "MuStdAmplPaper", one row
        per time window, ordered by window key.
    """
    if column_indexes is None:
        column_indexes = data.columns
    ampl_columns = [j for j in column_indexes if isinstance(j, str) and j.startswith('Ampl')]

    grouped = data.groupby(by="TimeWindow")
    if ampl_columns:
        # vertical std (per window, per column), then horizontal mean over the columns
        mu_std = grouped[ampl_columns].std().mean(axis=1)
    else:
        # Nothing to average: keep one NaN per window.
        mu_std = pd.Series(np.nan, index=grouped.size().index)

    # Take the window keys from the groupby result itself so every feature value is
    # paired with its own window even if the input rows are not time-ordered.
    featuredDf = pd.DataFrame({
        "Time": mu_std.index.to_numpy(),
        "MuStdAmplPaper": mu_std.to_numpy(),
    })
    return featuredDf

def filterData(df, w1=3, lambda1=3):
    """Replace amplitude outliers with the previous (already filtered) sample.

    For every column whose name contains "Ampl", a sample is an outlier when it
    deviates from the mean of the preceding ``w1`` original samples by more than
    ``lambda1`` standard deviations of those samples. Outliers are replaced by
    the value of the previous row after filtering, so a run of consecutive
    outliers repeats the last accepted value.

    Args:
        df: input frame; it is not modified.
        w1: size of the rolling window of preceding samples.
        lambda1: z-score threshold above which a sample is replaced.

    Returns:
        A filtered copy of ``df`` with the same index and columns.
    """
    data = df.copy()
    col_list = [j for j in data.columns if "Ampl" in j]
    if not col_list or len(data) == 0:
        return data

    amplitudes = data[col_list]
    # Rolling window to calculate means and std deviations, shifted to exclude the current row
    rolling_means = amplitudes.rolling(window=w1, min_periods=1).mean().shift(1)
    rolling_stds = amplitudes.rolling(window=w1, min_periods=1).std().shift(1)

    # Avoid division by zero / undefined statistics: such samples are never outliers.
    # The first row always has an undefined std, so it is never replaced.
    usable_std = rolling_stds.notna() & (rolling_stds != 0)
    deviation = (amplitudes - rolling_means).abs() / rolling_stds
    outliers = (usable_std & (deviation > lambda1)).to_numpy()

    # Work on row positions rather than index labels so that frames without a
    # default RangeIndex are handled as well.
    positions = np.arange(len(data))
    for col_pos, c in enumerate(col_list):
        # For each row, the position of the most recent non-outlier row (itself when
        # it is not an outlier). This reproduces "copy the previous filtered value".
        source_rows = np.where(outliers[:, col_pos], -1, positions)
        source_rows = np.maximum.accumulate(source_rows)
        data[c] = data[c].to_numpy()[source_rows]

    return data

def complex_real(complex_value):
    """Return the real part of ``complex_value`` after converting it with ``complex()``."""
    parsed = complex(complex_value)
    return parsed.real

def complex_imag(complex_value):
    """Return the imaginary part of ``complex_value`` after converting it with ``complex()``."""
    parsed = complex(complex_value)
    return parsed.imag

def complex_rebuild(real,imag):
    """Combine a real and an imaginary part into ``real + 1j*imag``."""
    imaginary_term = 1j*imag
    return real + imaginary_term

#Function to get top N features for each principal component
def get_top_n_features(loadings_df, n):
    """Map every column of ``loadings_df`` to its ``n`` largest-magnitude row labels.

    Args:
        loadings_df: frame with one column per principal component and one row
            per feature.
        n: number of features to keep per component.

    Returns:
        Dict ``{component: [feature, ...]}`` ordered by decreasing absolute loading.
    """
    def _strongest(component):
        by_magnitude = loadings_df[component].abs().sort_values(ascending=False)
        return by_magnitude.head(n).index.tolist()

    return {component: _strongest(component) for component in loadings_df.columns}

In [31]:
def bits_needed(source, df, num_lvls=-1, verbose=True):
    """Estimate how many bits are needed to represent ``source`` per time window.

    With ``num_lvls > 0`` every feature costs ``ceil(log2(num_lvls))`` bits.
    Otherwise each feature costs ``ceil(log2(#distinct values in the window))``
    bits, averaged over the features of the window.

    Args:
        source: frame of features (one row per frame); it is not modified.
        df: frame with a "TimeWindow" column aligned to ``source`` by index.
        num_lvls: number of quantization levels, or a non-positive value to
            derive the bit width from the observed symbols.
        verbose: print a summary when True.

    Returns:
        Tuple ``(bits_per_feature, window_frame_bits, window_feature_bits,
        window_total_bits, dataset_total_bits)``; the first four are averages over
        the windows, the last one is the sum over all windows.
    """
    data = source.copy()
    data["TimeWindow"] = df["TimeWindow"]
    num_features = len(data.columns) - 1 #ignoring the TimeWindow column
    fixed_bits = np.ceil(np.log2(num_lvls)).astype(int) if num_lvls > 0 else None

    bits_per_feature, bits_needed_per_frame, window_feature_bits, window_total_bits = {}, {}, {}, {}

    # A single groupby pass instead of re-filtering the whole frame for every window.
    # sort=False keeps the windows in order of first appearance.
    for window, windowed_data in data.groupby("TimeWindow", sort=False):
        windowed_data = windowed_data.drop("TimeWindow", axis=1)
        window_size = len(windowed_data)
        bits = {}
        for col in windowed_data.columns:
            if fixed_bits is not None:
                bits[col] = fixed_bits
            else:
                num_symbols = len(windowed_data[col].unique())
                bits[col] = np.ceil(np.log2(num_symbols)).astype(int)  # Number of bits to represent each symbol

        avg_bits_per_feature = np.mean(list(bits.values())).round(2)

        bits_per_feature[window] = avg_bits_per_feature
        bits_needed_per_frame[window] = avg_bits_per_feature * num_features
        window_feature_bits[window] = avg_bits_per_feature * window_size #Bits needed for one feature in the window
        window_total_bits[window] = avg_bits_per_feature * num_features * window_size

    def _rounded_mean(per_window):
        # Average over windows; 0.0 when there is no window at all.
        values = list(per_window.values())
        return np.mean(values).round(2) if values else np.float64(0.0)

    dataset_total_bits = np.float64(sum(window_total_bits.values())).round(2)

    #Average it out for all windows
    bits_per_feature = _rounded_mean(bits_per_feature)
    window_frame_bits = _rounded_mean(bits_needed_per_frame)
    window_feature_bits = _rounded_mean(window_feature_bits)
    window_total_bits = _rounded_mean(window_total_bits)

    if verbose:
        # Text fragments are built first: nesting the same quote type inside an
        # f-string is only valid syntax from Python 3.12 onwards.
        prefix = "B" if num_lvls>0 else "Average b"
        frame_scope = "" if num_lvls>0 else " per window"
        print(f"{prefix}its needed per feature: {bits_per_feature} bits")
        print(f"{prefix}its needed per frame{frame_scope}: {window_frame_bits} bits")
        print(f"{prefix}its needed per feature per window: {window_feature_bits} bits")
        print(f"{prefix}its needed per window: {window_total_bits} bits")
        print(f"Total bits needed for the dataset: {dataset_total_bits} bits")

    return bits_per_feature, window_frame_bits, window_feature_bits, window_total_bits, dataset_total_bits

In [32]:
def f1_loss(data, max_accuracy):
    """Add an 'f1_loss' column: percentage points by which 'test_f1' falls short of ``max_accuracy``.

    'test_f1' is capped at ``max_accuracy`` first, so the loss is never negative.
    ``data`` is modified in place and also returned.
    """
    capped_pct = data['test_f1'].clip(upper=max_accuracy) * 100
    reference_pct = max_accuracy * 100
    data['f1_loss'] = reference_pct - capped_pct
    return data

## Data Processing

### CSI Data Creation

In [33]:
class CsiData(tf_keras.utils.Sequence):
    """Keras ``Sequence`` serving batches of sliding windows over CSI samples.

    Sample ``x`` of the sequence is the slice ``csi[x : x + window_size]`` and its
    label is ``labels[x]``.
    """

    def __init__(self, csi, labels, time_windows, batch_size, window_size):
        """Store the tensors and pre-compute the valid window start positions.

        Args:
            csi: tensor whose first axis indexes the samples.
            labels: per-sample labels.
            time_windows: per-sample time-window keys (kept for later inspection).
            batch_size: number of windows per batch.
            window_size: number of consecutive samples per window.
        """
        self.csi = csi
        self.labels = labels
        self.batch_size = batch_size
        self.window_size = window_size
        self.time_windows = time_windows
        # Start positions for which a full window of window_size samples exists.
        self.indices = tf.range(0, csi.shape[0] - self.window_size, dtype=tf.int32)
        self.antennas = 1

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(self.indices.shape[-1] / float(self.batch_size)))

    def __getitem__(self, batch_idx):
        """Return ``(data_batch, labels_batch)`` for batch number ``batch_idx``."""
        num_windows = int(self.indices.shape[-1])
        first_idx = batch_idx * self.batch_size
        # Clamp the last (partial) batch to the valid start positions: going past them
        # would yield truncated windows of unequal length and a label slice whose
        # length no longer matches the number of windows.
        last_idx = min((batch_idx + 1) * self.batch_size, num_windows)

        data_batch = [self.csi[x:x + self.window_size, ...] for x in range(first_idx, last_idx)]
        labels_batch = np.transpose([self.labels[first_idx:last_idx]])

        data_batch = tf.convert_to_tensor(data_batch)
        labels_batch = tf.convert_to_tensor(labels_batch)

        if self.antennas == 1:
            # Single antenna: add the trailing axis explicitly.
            data_batch = tf.expand_dims(data_batch, 3)
            labels_batch = tf.expand_dims(labels_batch, 2)

        return data_batch, labels_batch

In [34]:
def CsiData_generator(train_windows, train_labels, test_windows, test_labels, window_size, batch_size, verbose=False):  
    """Turn windowed train/test arrays into two ``CsiData`` sequences.

    The last feature of every sample is treated as the time-window key: it is
    split off and handed to ``CsiData`` separately. The remaining features are
    flattened to 2-D and divided by the maximum value of the training data.

    Args:
        train_windows, test_windows: 3-D arrays indexed as
            ``[window, sample, feature]`` (they are sliced with ``[:, :, -1]``).
        train_labels, test_labels: per-sample labels.
        window_size: window length forwarded to ``CsiData``.
        batch_size: batch size forwarded to ``CsiData``.
        verbose: print the final tensor shapes when True.

    Returns:
        Tuple ``(train_data, test_data)`` of ``CsiData`` objects.
    """
    def _split_time_column(windows):
        # Convert to a float tensor, then separate the trailing time-window column.
        tensor = tf.convert_to_tensor(windows, dtype=tf.float32)
        time_keys = tf.reshape(tensor[:, :, -1], (-1))
        return tensor[:, :, :-1], time_keys

    train_features, train_time_windows_tf = _split_time_column(train_windows)
    test_features, test_time_windows_tf = _split_time_column(test_windows)

    # Flatten the windows: one row per sample, one column per feature.
    num_features = train_features.shape[2]
    flat_shape = (-1, num_features)
    train_data_tf = tf.reshape(train_features, flat_shape)
    test_data_tf = tf.reshape(test_features, flat_shape)

    # Normalize both splits by the maximum value of the train data.
    max_val = tf.math.reduce_max(train_data_tf, axis=(0, 1))
    train_data_tf = tf.math.divide(train_data_tf, max_val)
    test_data_tf = tf.math.divide(test_data_tf, max_val)

    train_labels_tf = tf.convert_to_tensor(train_labels, dtype=tf.int32)
    test_labels_tf = tf.convert_to_tensor(test_labels, dtype=tf.int32)

    if verbose:
        print(f"Final train_data size: {train_data_tf.shape}")
        print(f"Final train_labels size: {train_labels_tf.shape}")
        print(f"Final test_data size: {test_data_tf.shape}")
        print(f"Final test_labels size: {test_labels_tf.shape}")

    return (
        CsiData(train_data_tf, train_labels_tf, train_time_windows_tf, batch_size, window_size),
        CsiData(test_data_tf, test_labels_tf, test_time_windows_tf, batch_size, window_size),
    )

In [35]:
def time_windowing(data, time_window):
    """Replace the "Timestamp" column with a "TimeWindow" column.

    Each timestamp is mapped to the start of its window, i.e. the largest
    multiple of ``time_window`` not above it. ``data`` is modified in place and
    also returned.
    """
    window_start = np.floor(data["Timestamp"] / time_window) * time_window # Create time windows
    data["TimeWindow"] = window_start
    del data["Timestamp"]
    return data

def data_labeling(data):
    """Build a frame with one row per distinct time window and its ground-truth label.

    Returns:
        DataFrame with columns "TimeWindow" (distinct values of
        ``data["TimeWindow"]``) and "Label" (result of ``getGT`` with the
        notebook-level bounds).
    """
    distinct_windows = data["TimeWindow"].unique() # Get unique time windows
    labels = pd.DataFrame()
    labels["TimeWindow"] = distinct_windows
    #assign the ground-truth to a label
    labels["Label"] = labels["TimeWindow"].apply(lambda window: getGT(window, lower_bounds, upper_bounds))
    return labels

In [36]:
def data_windowing(data, time_window, window_size, step_size, verbose=False):
    """Split the data into overlapping train/test windows per time window.

    Every time window with at least ``window_size`` samples is split
    chronologically (first 60% train, last 40% test); each part is cut into
    windows of ``window_size`` samples taken every ``step_size`` samples.

    Args:
        data: frame with a "Timestamp" column; modified in place by
            ``time_windowing``.
        time_window: duration of a time window.
        window_size: number of samples per window.
        step_size: stride between consecutive windows; also forwarded to
            ``CsiData_generator`` as the batch size.
        verbose: print diagnostics when True.

    Returns:
        Tuple ``(train_data, test_data)`` as produced by ``CsiData_generator``.
    """
    data = time_windowing(data, time_window)
    labels = data_labeling(data)

    # Drop time windows with less than "window_size" samples
    filtered_df = data.groupby('TimeWindow').filter(lambda x: len(x) >= window_size)
    if verbose: print("Number of time windows dropped:", len(data['TimeWindow'].unique()) - len(filtered_df['TimeWindow'].unique()))

    def _overlapping_windows(samples):
        # All full windows of window_size samples, advancing by step_size.
        starts = range(0, len(samples) - window_size + 1, step_size)
        return [samples[start : start + window_size] for start in starts]

    train_windows, train_labels, test_windows, test_labels = [], [], [], []

    for window_key in filtered_df["TimeWindow"].unique():
        label = labels[labels["TimeWindow"] == window_key]["Label"].values[0]
        samples = np.array(filtered_df[filtered_df["TimeWindow"] == window_key])
        train, test = train_test_split(samples, test_size=0.4, random_state=42, shuffle=False, stratify=None)

        for part, part_windows, part_labels in (
            (train, train_windows, train_labels),
            (test, test_windows, test_labels),
        ):
            for window in _overlapping_windows(part):
                part_windows.append(window)
                # One label per sample of the window.
                part_labels.extend([label] * window_size)

    train_windows, train_labels = np.array(train_windows), np.array(train_labels)
    test_windows, test_labels = np.array(test_windows), np.array(test_labels)

    if verbose:
        print("Original number of windows:", len(filtered_df["TimeWindow"].unique()))
        print("Number of train windows:", train_windows.shape)   
        print("Number of train labels:", train_labels.shape)
        print("Number of test windows:", test_windows.shape)
        print("Number of test labels:", test_labels.shape)

    return CsiData_generator(train_windows, train_labels, test_windows, test_labels, window_size, step_size, verbose=verbose)

In [37]:
def get_windows_stats(df):
    """Print how many samples each time window holds, bucketed around the mean.

    The buckets are: below half the mean, half the mean up to the mean, the mean
    up to twice the mean, and twice the mean or more. For every bucket the number
    of windows and their sorted sample counts are printed.
    """
    timewindow_counts = df['TimeWindow'].value_counts(sort=False)

    mean = int(timewindow_counts.mean())
    half_mean, double_mean = mean // 2, mean * 2
    print("average count:",mean)
    print("max count:",timewindow_counts.max())
    print("min count:",timewindow_counts.min())

    # (caption, membership test) for every bucket, in printing order.
    buckets = (
        (f"number of windows with less than {half_mean} samples: ", lambda x: x < half_mean),
        (f"number of windows between {half_mean} and {mean}: ", lambda x: half_mean <= x < mean),
        (f"number of windows between {mean} and {double_mean}: ", lambda x: mean <= x < double_mean),
        (f"number of windows with more than {double_mean} samples: ", lambda x: x >= double_mean),
    )
    for caption, belongs in buckets:
        members = sorted(x for x in timewindow_counts if belongs(x))
        print(caption,len(members))
        print(members)

### PCA Compression

In [38]:
def data_preprocessing(df, method):
    """Prepare the raw CSI frame for compression according to ``method``.

    The timestamps of ``df`` are rounded to 4 decimals in place; everything else
    happens on a copy. Bookkeeping columns and the null subcarriers are removed,
    phases are removed when ``ignorePhases`` is set, and then:

    * ``XY``: X/Y columns (real/imaginary CSI parts) are added and the
      amplitude/phase columns are removed;
    * ``AmpPhaseFiltered``: the data is passed through ``filterData``;
    * otherwise nothing extra is done.

    Finally the raw CSI columns are removed.

    Returns:
        The prepared copy of ``df``.
    """
    df['Timestamp'] = round(df['Timestamp'], 4)
    data = df.copy()

    def _without_prefix(frame, prefixes):
        # Drop every column whose name starts with one of the given prefixes.
        return frame.drop(columns=[col for col in frame.columns if col.startswith(prefixes)])

    columns_to_drop = ['Frame_num', 'Source_address', 'TimeWindow']
    for prefix in ("Phase", "Ampl", "CSI"):
        columns_to_drop += [f"{prefix}{i}" for i in notInterestedIndexes]
    data = data.drop(columns=columns_to_drop)

    if ignorePhases:
        data = _without_prefix(data, 'Phase') #Removes Phase columns

    if method == Compression_Method.XY:
        for j in interestedIndexes:
            csi_column = data[f"CSI{j}"]
            data[f'X{j}'] = csi_column.apply(complex_real)
            data[f'Y{j}'] = csi_column.apply(complex_imag)
        data = _without_prefix(data, ('Ampl', 'Phase')) #Removes Ampl and Phase columns
    elif method == Compression_Method.AmpPhaseFiltered:
        data = filterData(data)

    data = _without_prefix(data, 'CSI') #Removes CSI columns
    print("Number of features:", len(data.columns))

    return data

Apply PCA, check the explained variance ratio and the cumulative explained variance ratio

In [39]:
def analyze_PCA(data, n_components, directory, saveGraph=False, plotGraph=True):
    """Fit a PCA, report the explained variance and optionally plot it.

    Args:
        data: feature matrix to fit and transform.
        n_components: number of principal components to keep.
        directory: output directory (created if missing) for the saved graph.
        saveGraph: save the plot as 'cumulative_explained_variance.png' when True
            (only applies when ``plotGraph`` is True).
        plotGraph: draw the cumulative explained variance when True.

    Returns:
        Tuple ``(reduced_df, pca)``: the transformed data with columns
        ``PC0 .. PC{n_components-1}`` and the fitted PCA object.
    """
    os.makedirs(directory, exist_ok=True)
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data)

    component_names = [f'PC{i}' for i in range(n_components)]
    reduced_df = pd.DataFrame(data=reduced_data, columns=component_names)

    #Explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_
    print("Explained variance ratio:", explained_variance_ratio)

    #Cumulative explained variance
    cumulative_explained_variance = np.cumsum(explained_variance_ratio)
    print("Final Cumulative Explained Variance:", cumulative_explained_variance[-1])

    if not plotGraph:
        return reduced_df, pca

    component_counts = range(1, n_components + 1)
    plt.figure(figsize=(10, 5))
    plt.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
    plt.title('Cumulative Explained Variance by PCA Components')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid()
    if saveGraph:
        graph_path = os.path.join(directory, 'cumulative_explained_variance.png')
        plt.savefig(graph_path)
        print("Graph saved in: ", graph_path)
    plt.show()

    return reduced_df, pca

For each Principal Component, find the top "n" features that contribute most to the variance of that component.

In [40]:
def analyze_PC(data, pca, n_components):
    """Report which features never appear among the top loadings of any component.

    Args:
        data: frame whose columns are the features the PCA was fitted on.
        pca: fitted PCA object exposing ``components_``.
        n_components: number of top features to take per principal component.

    Returns:
        Set of column names of ``data`` that are not among the top
        ``n_components`` features of any principal component.
    """
    loadings = pca.components_
    component_names = [f'PC{i+1}' for i in range(loadings.shape[0])]
    loadings_df = pd.DataFrame(data=loadings.T, index=data.columns, columns=component_names)

    top_n_features = get_top_n_features(loadings_df, n_components)

    # Distinct features over all components, in order of first appearance.
    column = list(dict.fromkeys(
        feature for features in top_n_features.values() for feature in features
    ))
    print("available features: ", len(data.columns))
    print("features used: ", len(column))

    difference = set(data.columns).difference(column)
    print("Unused Features:", difference)

    return difference

### Lloyd-Max Quantization 

In [41]:
def lloyd_max_quantization(data, num_levels=16, max_iter=100, delta=1e-6):
    """Scalar Lloyd-Max quantization of ``data`` with ``num_levels`` levels.

    Centroids start uniformly spaced between the minimum and the maximum of the
    data and are refined by alternating between assigning every value to an
    interval (decision thresholds halfway between neighbouring centroids) and
    moving each centroid to the mean of its interval.

    Args:
        data: array of values (any shape).
        num_levels: number of quantization levels.
        max_iter: maximum number of refinement iterations (0 keeps the uniform
            centroids).
        delta: stop when no centroid moves by ``delta`` or more.

    Returns:
        Tuple ``(quantized_data, centroids, indices)`` where ``indices`` has the
        shape of ``data`` and ``quantized_data == centroids[indices]``.
    """
    data = np.asarray(data)
    min_val = np.min(data)
    max_val = np.max(data)
    centroids = np.linspace(min_val, max_val, num_levels) #Uniformly spaced

    for _ in range(max_iter):
        thresholds = (centroids[:-1] + centroids[1:]) / 2 #Defines intervals of centroids
        indices = np.digitize(data, thresholds) #Assign each data point to a cluster

        # Update centroids to better represent the data; an empty (or undefined)
        # interval keeps its previous centroid.
        new_centroids = centroids.copy()
        for level in range(num_levels):
            members = data[indices == level]
            if members.size:
                level_mean = members.mean()
                if not np.isnan(level_mean):
                    new_centroids[level] = level_mean

        #stop if changes between iterations are small
        if np.max(np.abs(new_centroids - centroids)) < delta:
            break

        centroids = new_centroids

    # Assign with the FINAL centroids: this keeps the indices consistent with the
    # returned codebook when the iteration cap is reached, and defines them when
    # max_iter is 0.
    thresholds = (centroids[:-1] + centroids[1:]) / 2
    indices = np.digitize(data, thresholds).reshape(data.shape)
    quantized_data = centroids[indices]   #Quantize the data based on the final centroids

    return quantized_data, centroids, indices

def dequantize_lloyd_max(quantized_data, clusters, thresholds):
    """Map values back to cluster representatives.

    Each value is placed into the interval defined by ``thresholds`` (right edge
    inclusive) and replaced by the entry of ``clusters`` for that interval.
    """
    interval_ids = np.digitize(quantized_data, thresholds, right=True)
    representatives = clusters[interval_ids]
    return representatives

def apply_quantization(reduced_df, lvls):
    """Quantize all values of ``reduced_df`` with ``lvls`` Lloyd-Max levels.

    Returns:
        Tuple ``(df_quantized, centroids, indices)``; ``df_quantized`` keeps the
        column names of ``reduced_df``.
    """
    values = reduced_df.values
    quantized_values, centroids, indices = lloyd_max_quantization(values, num_levels=lvls)
    return pd.DataFrame(quantized_values, columns=reduced_df.columns), centroids, indices

def apply_existing_quantization(data, centroids):
    """Quantize ``data`` with an already computed set of centroids.

    Decision thresholds lie halfway between neighbouring centroids.

    Returns:
        Tuple ``(df_quantized, indices)``: the quantized values as a DataFrame with
        the columns of ``data``, and the centroid index of every value.
    """
    lower, upper = centroids[:-1], centroids[1:]
    midpoints = (lower + upper) / 2
    level_ids = np.digitize(data, midpoints)
    #Quantize the data based on the final centroids
    df_quantized = pd.DataFrame(centroids[level_ids], columns=data.columns)
    #Reshape indices to match the original data shape
    return df_quantized, level_ids.reshape(data.shape)

In [42]:
def apply_vector_quantization(source, num_levels):
    """Learn a k-means codebook on ``source`` and encode ``source`` with it.

    Returns:
        Tuple ``(quantized_data, codebook)`` where ``quantized_data`` holds the
        codebook index assigned to every row.
    """
    observations = source.copy()
    codebook = vq.kmeans(observations, num_levels, seed=random_state)[0]
    codes = vq.vq(observations, codebook)[0]
    return codes, codebook

def apply_existing_vector_quantization(source, codebook):
    """Encode ``source`` with an existing ``codebook``; returns the code index per row."""
    observations = source.copy()
    codes = vq.vq(observations, codebook)[0]
    return codes

def get_quantized_data(source, codebook):
    """Look up the codebook entry for every code index in ``source``."""
    code_ids = source.copy()
    return codebook[code_ids]

### Entropy Coding (Huffman)

In [43]:
class Node: 
    """Node of a Huffman tree.

    Leaves carry a symbol in ``value``; nodes created by merging two subtrees
    keep ``value`` as None and reference their children in ``left``/``right``.
    """
    def __init__(self, value=None, frequency=0, left=None, right=None):
        self.value = value          # symbol for a leaf, None for a merged node
        self.frequency = frequency  # weight used to order nodes in the heap
        self.left = left            # child reached with bit "0" (see generate_codes)
        self.right = right          # child reached with bit "1" (see generate_codes)

    def __lt__(self, other): #redefined "less than" operator for heapq
        return self.frequency < other.frequency

def build_tree(data):
    """Build a Huffman tree from a ``{symbol: frequency}`` mapping and return its root."""
    heap = []
    for value, frequency in data.items():  #Init heap
        heap.append(Node(value, frequency))
    heapq.heapify(heap)

    #pop two smallest nodes, merge them and push the merged node back
    while len(heap) >= 2:
        lighter = heapq.heappop(heap)
        heavier = heapq.heappop(heap)
        combined_frequency = lighter.frequency + heavier.frequency
        heapq.heappush(heap, Node(frequency=combined_frequency, left=lighter, right=heavier))

    root = heap[0]
    return root

def generate_codes(node, code="", huffman_codes=None):
    """Collect the bit string of every leaf below ``node``.

    Going to the left child appends "0", going to the right child appends "1".

    Args:
        node: root of the (sub)tree; a node is a leaf when ``value`` is not None.
        code: prefix already accumulated for ``node``.
        huffman_codes: dict to fill; a new one is created when None.

    Returns:
        The dict ``{symbol: code}`` (the same object as ``huffman_codes`` if given).
    """
    if huffman_codes is None: 
        huffman_codes = {}

    # Explicit stack instead of recursion; the right child is pushed first so that
    # the left subtree is visited first.
    pending = [(node, code)]
    while pending:
        current, prefix = pending.pop()
        if current.value is not None: #leaf node case
            huffman_codes[current.value] = prefix
            continue
        pending.append((current.right, prefix + "1"))
        pending.append((current.left, prefix + "0"))
    return huffman_codes

def encode_huffman(data, huffman_codes):
    """Concatenate the Huffman codes of all symbols in ``data`` into one bit string."""
    code_words = (huffman_codes[symbol] for symbol in data)
    return "".join(code_words)

def decode_huffman(encoded_data, huffman_codes):
    """Decode a bit string produced with ``huffman_codes``.

    Args:
        encoded_data: iterable of bit characters.
        huffman_codes: mapping ``{symbol: code}``.

    Returns:
        List of decoded symbols. Trailing bits that do not complete a code are
        ignored.
    """
    # Reverse lookup table: one dict access per bit instead of scanning every code.
    # setdefault keeps the first symbol should two symbols share a code.
    symbol_by_code = {}
    for symbol, symbol_code in huffman_codes.items():
        symbol_by_code.setdefault(symbol_code, symbol)

    decoded_data = []
    code = ""
    for bit in encoded_data: #traverse the encoded data and search for the code
        code += bit
        if code in symbol_by_code: #If found, emit the symbol and start over, otherwise add another bit
            decoded_data.append(symbol_by_code[code])
            code = ""

    return decoded_data

def apply_huffman_encode_per_feature(data):
    """Huffman-encode every column of ``data`` with its own code table.

    Returns:
        Tuple ``(encoded_df, huffman_codes)``: the frame of per-value bit strings
        and a dict ``{column: {symbol: code}}``.
    """
    encoded_df = pd.DataFrame()
    huffman_codes = {}

    for col in data.columns:
        symbol_counts = Counter(data[col])
        if len(symbol_counts) != 1:
            code = generate_codes(build_tree(symbol_counts))
        else:
            #If only one unique value, there's no tree, assign it a code of 0
            only_symbol = list(symbol_counts.keys())[0]
            code = {only_symbol: '0'}
        encoded_df[col] = data[col].apply(lambda symbol, table=code: encode_huffman([symbol], table))
        huffman_codes[col] = code

    return encoded_df, huffman_codes

def apply_huffman_decode_per_feature(encoded_data, huffman_codes):
    """Decode every column of ``encoded_data`` with its own code table.

    The bit strings of a column are concatenated and decoded as one stream.
    """
    decoded_df = pd.DataFrame()
    for col in encoded_data.columns:
        bit_stream = ''.join(encoded_data[col])
        decoded_df[col] = decode_huffman(bit_stream, huffman_codes[col])
    return decoded_df

def apply_encoding(df_quantized):
    """Huffman-encode the quantized frame; returns ``(encoded_df, huffman_codes)``."""
    return apply_huffman_encode_per_feature(df_quantized)

def apply_decoding(encoded_df, huffman_codes):
    """Decode a Huffman-encoded frame back into a frame of symbols."""
    return apply_huffman_decode_per_feature(encoded_df, huffman_codes)

In [44]:
def compute_entropy(data, verbose=True):
    """Sum of the Shannon entropies (in bits) of all columns of ``data``.

    Args:
        data: frame whose columns are treated as independent symbol streams.
        verbose: print the entropy of every column when True.

    Returns:
        Total entropy rounded to 2 decimals; 0.0 for a frame without columns or
        rows.
    """
    entropy = 0.0
    for col in data.columns:
        freq_per_data = Counter(data[col])  # Get frequency of each unique value
        total_count = sum(freq_per_data.values())
        col_entropy = 0
        for count in freq_per_data.values():
            p_i = count / total_count  # probability of each unique value
            col_entropy += -p_i * np.log2(p_i)  # Entropy formula
        if verbose: print(f"Entropy of column {col}: {col_entropy} bits")
        entropy += col_entropy
    # Wrap in np.float64 so that rounding also works when nothing was accumulated
    # (a plain Python number has no .round method).
    return np.float64(entropy).round(2)

### Reconstruction

Reconstruct the dataset (without the CSI components) and save it as a CSV file.

In [45]:
def reconstruct_data(df, pca, scaler, columns):
    """Undo PCA and scaling and rebuild the original feature frame.

    Args:
        df: frame in the principal-component space.
        pca: fitted PCA used for the compression.
        scaler: fitted scaler that was applied before the PCA.
        columns: column names of the data before compression.

    Returns:
        DataFrame with ``columns``. For the ``XY`` method the X/Y columns are
        replaced by ``CSI{j}`` (complex), ``Ampl{j}`` and ``Phase{j}`` columns for
        every subcarrier in ``interestedIndexes``.
    """
    df_reconstructed = pca.inverse_transform(df.values)
    df_reconstructed = scaler.inverse_transform(df_reconstructed)
    df_reconstructed = pd.DataFrame(df_reconstructed, columns=columns)

    if method == Compression_Method.XY:
        for j in interestedIndexes:
            # Whole-column arithmetic instead of a Python call per row.
            csi = df_reconstructed[f'X{j}'].to_numpy() + 1j * df_reconstructed[f'Y{j}'].to_numpy()
            df_reconstructed[f'CSI{j}'] = csi

            #compute back ampl and phases
            df_reconstructed[f'Ampl{j}'] = np.abs(csi)
            df_reconstructed[f'Phase{j}'] = np.angle(csi)  # phase in radians

        df_reconstructed.drop(columns=[f'X{j}' for j in interestedIndexes], inplace=True)
        df_reconstructed.drop(columns=[f'Y{j}' for j in interestedIndexes], inplace=True)

    return df_reconstructed


In [46]:
def plot_MuStdAmplPaper(data, reconstructed_data, directory, level):
    """Plot the original and reconstructed ``MuStdAmplPaper`` feature with the ground truth.

    Args:
        data: frame with "Time" and "MuStdAmplPaper" of the original data.
        reconstructed_data: frame with "Time", "MuStdAmplPaper" and "Label" of
            the reconstructed data.
        directory: parent directory; the figure goes to its
            'MuStdAmplPaper_Comparison' sub-directory.
        level: used as the file name (``{level}.png``) of the saved figure.
    """
    print("Plotting MuStdAmplPaper")
    sub_directory = os.path.join(directory, 'MuStdAmplPaper_Comparison')
    os.makedirs(sub_directory, exist_ok=True)
    plt.figure(figsize=(20, 6))

    plt.plot(data['Time'], data['MuStdAmplPaper'], label=f'Original MuStdAmplPaper', color="blue" )
    plt.plot(reconstructed_data['Time'], reconstructed_data['MuStdAmplPaper'], label=f'Reconstructed MuStdAmplPaper', color="green")

    # Ground truth drawn as a square wave between the extremes of the original
    # feature. The extremes are computed once rather than once per label.
    labels = reconstructed_data["Label"]
    if len(labels):
        absent_level = min(data['MuStdAmplPaper'])
        present_level = max(data['MuStdAmplPaper'])
        gt = [absent_level if l == 0 else present_level for l in labels]
    else:
        gt = []
    # NOTE(review): gt is built from reconstructed_data but plotted against
    # data['Time']; both frames must have the same number of rows.
    plt.plot(data['Time'],gt,label="Ground-truth",color="r",ls="--", linewidth=0.5) # per window GT

    # Add plot details
    plt.title('Amplitude Comparison')
    plt.xlabel('Time')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.grid()
    if (saveCSV): plt.savefig(os.path.join(sub_directory, f'{level}.png'))
    plt.show()

### Classification (partially from another notebook):

In [47]:
def load_comparison():
    """Classify presence on the pre-computed filtered features (baseline for comparison).

    Returns:
        Tuple ``(AUC, f1, thr, precision, recall)`` from ``classify_presence``.
    """
    baseline = pd.read_csv("../../datasets/filteredFeaturesPresence3s.csv")
    #apply labeling based on GT
    baseline["Label"] = baseline["Time"].apply(lambda moment: getGT(moment,lower_bounds,upper_bounds))
    metrics = classify_presence(baseline, plot_roc=False)
    AUC, f1, thr, precision, recall = metrics
    return AUC, f1, thr, precision, recall

In [48]:
def apply_filtering(df_reconstructed):
    """Remove outliers from the reconstructed data (unless already filtered) and drop CSI columns.

    For ``AmpPhaseFiltered`` the input was filtered before compression, so it is
    used as is (and its CSI columns are dropped in place); otherwise
    ``filterData`` returns a filtered copy.
    """
    already_filtered = method == Compression_Method.AmpPhaseFiltered
    if already_filtered:
        reconstructed_filtered = df_reconstructed
    else:
        # NOTE(review): filterData runs with its defaults (w1=3, lambda1=3), not
        # with the notebook-level w1/lambda1 - confirm this is intended.
        reconstructed_filtered = filterData(df_reconstructed) #removes outliers

    csi_columns = [col for col in reconstructed_filtered.columns if col.startswith('CSI')]
    reconstructed_filtered.drop(columns=csi_columns, inplace=True) #Removes CSI columns

    return reconstructed_filtered

In [49]:
def apply_classification(reconstructed_filtered):
    """Extract windowed features, label them and run the presence classifier.

    Returns
    -------
    (reconstructed_featured, auc_value)
        The labelled feature frame and whatever ``classify_presence`` returned
        (also printed).
    """
    # Windowed features over every column of the input frame.
    reconstructed_featured = extractWindowedFeatures(
        reconstructed_filtered,
        column_indexes=reconstructed_filtered.columns,
        w2=w2,
    )

    # Ground-truth label derived from each row's time.
    def _label(timestamp):
        return getGT(timestamp, lower_bounds, upper_bounds)

    reconstructed_featured["Label"] = reconstructed_featured["Time"].apply(_label)

    auc_value = classify_presence(reconstructed_featured, plot_roc=False)
    print(auc_value)
    return reconstructed_featured, auc_value

# Runs

In [None]:
# Baseline: bits needed to transmit the uncompressed test split.
time_window = 3 #seconds
window_size = 64 # for 3 second
step_size = window_size // 2

data = pd.read_csv("../../datasets/filteredPresence.csv")
data = data_preprocessing(data, method)
csi_subcarriers = data.columns[1:]  # all columns but the first are used as CSI features
train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)
time_windows = pd.DataFrame(test_data.time_windows.numpy().flatten(), columns=["TimeWindow"])

ORI_feature, ORI_frame, ORI_window_feature, ORI_window, ORI_total = bits_needed(df_csi_test, time_windows, verbose=True)

bit_results = pd.DataFrame([{
    'training_length': len(df_csi_train),
    'testing_length': len(df_csi_test),
    'ORI_feature_test': ORI_feature,
    'ORI_frame_test': ORI_frame,
    'ORI_window_test': ORI_window,
    'ORI_total_test': ORI_total,
}])

# The output folder is not created anywhere before this cell, so create it here;
# otherwise to_csv fails on a fresh checkout.
os.makedirs('./results_csv', exist_ok=True)
bit_results.to_csv('./results_csv/original_bits.csv', index=False)

Number of features: 57
Average bits needed per feature: 5.96 bits
Average bits needed per frame per window: 333.97 bits
Average bits needed per feature per window: 414.44 bits
Average bits needed per window: 23208.82 bits
Total bits needed for the dataset: 4989895.68 bits


## Scalar Quantization

### Comprehensive Run

In [None]:
# Sweep grid: PCA sizes and quantizer level counts.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]
levels = [2 ** exponent for exponent in range(1, 9)]

# Windowing parameters.
time_window = 3  # seconds
window_size = 64  # for 3 second
step_size = window_size // 2

# Load and preprocess the dataset; all columns but the first are used as CSI features.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)

In [None]:
# PCA + scalar quantization accuracy sweep: for every (num_components, num_levels)
# pair, compress the test split, reconstruct it, and score presence detection.
results_directory = './results_csv/SQ'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)

results = []
# load_comparison() returns (AUC, f1, thr, precision, recall); keeping the whole
# tuple in `og_f1` would store a tuple in the 'original_f1' column.
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

for num_components in components:
    print(f"-------------- {num_components} components --------------")
    df_train = df_csi_train.copy()
    df_test = df_csi_test.copy()
    dumps_directory = f'./results/SQ/{num_components}_components'
    os.makedirs(dumps_directory, exist_ok=True)

    # Scaler is fitted on the training split only.
    scaled_train = scaler.fit_transform(df_train)
    scaled_test = scaler.transform(df_test)

    #Apply PCA
    scaled_train, pca = analyze_PCA(scaled_train, num_components, directory=dumps_directory, saveGraph=True, plotGraph=True)
    test_reduced = pca.transform(scaled_test)
    scaled_test = pd.DataFrame(test_reduced, columns=[f'PC{i}' for i in range(num_components)])

    for num_levels in levels:
        print(f"-------------- {num_components} components w/ {num_levels} lvls --------------")
        sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
        os.makedirs(sub_directory, exist_ok=True)

        #Quantize the data
        df_train_quantized, centroids, train_indices = apply_quantization(scaled_train, num_levels) #LLoyd-Max quantization
        df_test_quantized, test_indices = apply_existing_quantization(scaled_test, centroids)

        # Encode-Decode (the decoded frame is not used further in this cell)
        encoded_df, huffman_codes = apply_encoding(df_test_quantized)
        df = apply_decoding(encoded_df, huffman_codes)

        #Reconstruct train and test data
        df_train_reconstructed = reconstruct_data(df_train_quantized, pca, scaler, csi_subcarriers)
        df_test_reconstructed = reconstruct_data(df_test_quantized, pca, scaler, csi_subcarriers)
        train_reconstructed = tf.convert_to_tensor(df_train_reconstructed.to_numpy(), dtype=tf.float32)
        test_reconstructed = tf.convert_to_tensor(df_test_reconstructed.to_numpy(), dtype=tf.float32)
        train_data.csi = train_reconstructed
        test_data.csi = test_reconstructed

        #Feature Extraction
        df_train_reconstructed["TimeWindow"] = train_data.time_windows.numpy().flatten()
        df_test_reconstructed["TimeWindow"] = test_data.time_windows.numpy().flatten()
        train_featured = extractWindowedFeatures(df_train_reconstructed, column_indexes=csi_subcarriers)
        test_featured = extractWindowedFeatures(df_test_reconstructed, column_indexes=csi_subcarriers)

        #Find the best threshold
        # classify_presence returns five values (see load_comparison); unpack all of them.
        train_featured["Label"] = train_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
        train_AUC, train_f1, threshold, train_precision, train_recall = classify_presence(train_featured, plot_roc=False)

        #Classify test set with the threshold found on the training split
        test_featured["Label"] = test_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
        test_AUC, test_f1, _, test_precision, test_recall = classify_presence(test_featured, plot_roc=False, thr=threshold)

        results.append({
            'num_components': num_components,
            'num_levels': num_levels,
            'threshold': threshold,
            'f1_train': train_f1,
            'f1_test': test_f1,
            'original_f1': og_f1
        })

        if saveCSV:
            encoded_df.to_csv(os.path.join(sub_directory, 'encodedQuantizedPCAPresence_test.csv'), index=False)
            df_test_reconstructed.to_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'), index=False)
            test_featured.to_csv(os.path.join(sub_directory, 'filteredFeaturesLabeledPresence_reconstructed_test.csv'), index=False)

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'accuracy.csv')
results_df.to_csv(results_file, index=False)

print("Done")

#### Bits

In [None]:
# Sweep grid: PCA sizes and quantizer level counts.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]
levels = [2 ** exponent for exponent in range(1, 9)]

# Windowing parameters.
time_window = 3  # seconds
window_size = 64  # for 3 second
step_size = window_size // 2

# Reload and preprocess the dataset; all columns but the first are used as CSI features.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)

In [None]:
# Bit accounting for the PCA + scalar quantization sweep. The reconstructed frames are
# read back from the CSV files written by the accuracy run (only present when that run
# had saveCSV enabled).
results_directory = './results_csv/SQ'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)
time_windows = pd.DataFrame(test_data.time_windows.numpy().flatten(), columns=["TimeWindow"])

results = []
og_f1 = load_comparison()  # NOTE(review): result is not used in this cell

for num_components in components:
    print(f"-------------- {num_components} components --------------")
    df_train = df_csi_train.copy()
    df_test = df_csi_test.copy()
    dumps_directory = f'./results/SQ/{num_components}_components'
    os.makedirs(dumps_directory, exist_ok=True)

    scaled_train = scaler.fit_transform(df_train)
    scaled_test = scaler.transform(df_test)

    #Apply PCA
    scaled_train, pca = analyze_PCA(scaled_train, num_components, directory=dumps_directory, saveGraph=True, plotGraph=True)
    test_reduced = pca.transform(scaled_test)
    scaled_test = pd.DataFrame(test_reduced, columns=[f'PC{i}' for i in range(num_components)])

    PCA_feature, PCA_frame, PCA_window_feature, PCA_window, PCA_total = bits_needed(scaled_test, time_windows, verbose=True)

    for num_levels in levels:
        sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
        os.makedirs(sub_directory, exist_ok=True)

        # Lloyd-Max Quantization
        df_train_quantized, centroids, train_indices = apply_quantization(scaled_train, num_levels)
        df_test_quantized, test_indices = apply_existing_quantization(scaled_test, centroids)

        print("DF_QUANTIZED")
        df_indices = pd.DataFrame(test_indices, columns=[f'PC{i}' for i in range(num_components)])
        QT_feature, QT_frame, QT_window_feature, QT_window, QT_total = bits_needed(df_indices, time_windows, num_levels, verbose=True)

        # Encode-Decode
        df_encoded_test, huffman_codes_test = apply_encoding(df_test_quantized)
        # Use the frame encoded in this iteration; `encoded_df` was a leftover from
        # another cell (undefined on a fresh kernel, stale otherwise).
        entropyENC = compute_entropy(df_encoded_test, verbose=False)
        print("DF_ENCODED")
        ENC_feature, ENC_frame, ENC_window_feature, ENC_window, ENC_total = bits_needed(df_encoded_test, time_windows, verbose=True)

        # Reconstruction
        reconstructed_df = pd.read_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'))
        entropyREC = compute_entropy(reconstructed_df, verbose=False)
        print("DF_RECONSTRUCTED")
        # The last column is dropped before counting bits (the accuracy run appends
        # "TimeWindow" before saving this file).
        REC_feature, REC_frame, REC_window_feature, REC_window, REC_total = bits_needed(reconstructed_df.iloc[:, :-1], time_windows, verbose=True)

        results.append({
            'num_components': num_components,
            'num_levels': num_levels,
            'PCA_feature': PCA_feature,
            'QT_feature': QT_feature,
            'ENC_feature': ENC_feature,
            'REC_feature': REC_feature,
            'PCA_frame': PCA_frame,
            'QT_frame': QT_frame,
            'ENC_frame': ENC_frame,
            'REC_frame': REC_frame,
            'QT_window': QT_window,
            'PCA_window': PCA_window,
            'ENC_window': ENC_window,
            'REC_window': REC_window,
            'PCA_total': PCA_total,
            'QT_total': QT_total,
            'ENC_total': ENC_total,
            'REC_total': REC_total
        })

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'bits.csv')
results_df.to_csv(results_file, index=False)

#### Graphs

In [None]:
# Grid used by the plotting cells below, plus input/output folders.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]
levels = [2 ** exponent for exponent in range(1, 9)]

results_directory, graphs_directory = './results_csv/SQ', './results_graphs/SQ'
os.makedirs(graphs_directory, exist_ok=True)

In [None]:
# Join accuracy and bit statistics per (num_components, num_levels), add the F1 loss
# via f1_loss(), and persist the combined table.
accuracy_csv = os.path.join(results_directory, 'accuracy.csv')
bits_csv = os.path.join(results_directory, 'bits.csv')
og_f1 = pd.read_csv(accuracy_csv)
bit = pd.read_csv(bits_csv)

f1_bit = pd.merge(og_f1, bit, on=['num_components', 'num_levels'])
max_accuracy = f1_bit['original_f1'].max()
f1_bit = f1_loss(f1_bit, max_accuracy)

results_csv = os.path.join(results_directory, 'results.csv')
f1_bit.to_csv(results_csv, index=False)

In [None]:
# F1 loss vs. quantized bits per frame, one curve per PCA size listed in `components`.
results_directory = './results_csv/SQ'
results_csv = os.path.join(results_directory, 'results.csv')
f1_bit = pd.read_csv(results_csv)

plt.figure(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    if num_components in components:
        curve = f1_bit.loc[f1_bit['num_components'] == num_components]
        plt.plot(curve['QT_frame'], curve['f1_loss'],
                 marker='o', linestyle='--', label=f'{num_components} components')

plt.title('Relative F1-Score loss with quantized data and PCA applied')
plt.xlabel('Bits per frame')
plt.ylabel('Relative F1-Score Loss (%)')
plt.xticks(np.arange(0, 160, 10))
plt.yticks(np.arange(0, 25, 5))
plt.grid()
plt.legend()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit[BxF].png'))
plt.show()

In [None]:
# F1 loss vs. quantized bits per window, one curve per PCA size listed in `components`.
results_directory = './results_csv/SQ'
results_csv = os.path.join(results_directory, 'results.csv')
f1_bit = pd.read_csv(results_csv)

plt.figure(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    if num_components in components:
        curve = f1_bit.loc[f1_bit['num_components'] == num_components]
        plt.plot(curve['QT_window'], curve['f1_loss'],
                 marker='o', linestyle='--', label=f'{num_components} components')

plt.title('Relative F1-Score loss with quantized data and PCA applied')
plt.xlabel('Bits per window')
plt.ylabel('Relative F1-Score Loss (%)')
plt.xticks(np.arange(0, 11000, 500))
plt.yticks(np.arange(0, 25, 5))
plt.grid()
plt.legend()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit[BxW].png'))
plt.show()

In [None]:
# F1 loss vs. encoded bits per frame, one curve per PCA size listed in `components`.
results_directory = './results_csv/SQ'
results_csv = os.path.join(results_directory, 'results.csv')
f1_bit = pd.read_csv(results_csv)

plt.figure(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    if num_components in components:
        curve = f1_bit.loc[f1_bit['num_components'] == num_components]
        plt.plot(curve['ENC_frame'], curve['f1_loss'],
                 marker='o', linestyle='--', label=f'{num_components} components')

plt.title('Relative F1-Score loss with quantized and encoded data and PCA applied')
plt.xlabel('Average bits per frame')
plt.ylabel('Relative F1-Score Loss (%)')
plt.xticks(np.arange(0, 62, 2))
plt.yticks(np.arange(0, 25, 5))
plt.grid()
plt.legend()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded[BxF].png'))
plt.show()

In [None]:
# F1 loss vs. encoded bits per window, one curve per PCA size listed in `components`.
results_directory = './results_csv/SQ'
results_csv = os.path.join(results_directory, 'results.csv')
f1_bit = pd.read_csv(results_csv)

plt.figure(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    if num_components in components:
        curve = f1_bit.loc[f1_bit['num_components'] == num_components]
        plt.plot(curve['ENC_window'], curve['f1_loss'],
                 marker='o', linestyle='--', label=f'{num_components} components')

plt.title('Relative F1-score loss with quantized and encoded data and PCA applied')
plt.xlabel('Average bits per window')
plt.ylabel('Relative F1-Score Loss (%)')
plt.xticks(np.arange(0, 4500, 250))
plt.yticks(np.arange(0, 25, 5))
plt.grid()
plt.legend()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded[BxW].png'))
plt.show()

### PCA ONLY

In [None]:
# PCA sizes to evaluate.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]

# Windowing parameters.
time_window = 3  # seconds
window_size = 64  # for 3 second
step_size = window_size // 2

# Load and preprocess the dataset; all columns but the first are used as CSI features.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)

In [None]:
# PCA-only accuracy sweep: project onto `num_components` PCs, reconstruct, and score
# presence detection on the reconstructed signal (no quantization or encoding).
results_directory = './results_csv/SQ/PCA_Only'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)

results = []
# load_comparison() returns (AUC, f1, thr, precision, recall); keeping the whole
# tuple in `og_f1` would store a tuple in the 'original_f1' column.
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

for num_components in components:
    print(f"-------------- {num_components} components --------------")
    df_train = df_csi_train.copy()
    df_test = df_csi_test.copy()
    dumps_directory = f'./results/SQ/PCA_Only/{num_components}_components'
    os.makedirs(dumps_directory, exist_ok=True)

    # Scaler is fitted on the training split only.
    scaled_train = scaler.fit_transform(df_train)
    scaled_test = scaler.transform(df_test)

    #Apply PCA
    scaled_train, pca = analyze_PCA(scaled_train, num_components, directory=dumps_directory, saveGraph=True, plotGraph=True)
    test_reduced = pca.transform(scaled_test)
    scaled_test = pd.DataFrame(test_reduced, columns=[f'PC{i}' for i in range(num_components)])

    #reconstruct
    df_train_reconstructed = reconstruct_data(scaled_train, pca, scaler, csi_subcarriers)
    df_test_reconstructed = reconstruct_data(scaled_test, pca, scaler, csi_subcarriers)
    train_reconstructed = tf.convert_to_tensor(df_train_reconstructed.to_numpy(), dtype=tf.float32)
    test_reconstructed = tf.convert_to_tensor(df_test_reconstructed.to_numpy(), dtype=tf.float32)
    train_data.csi = train_reconstructed
    test_data.csi = test_reconstructed

    #Feature Extraction
    df_train_reconstructed["TimeWindow"] = train_data.time_windows.numpy().flatten()
    df_test_reconstructed["TimeWindow"] = test_data.time_windows.numpy().flatten()
    train_featured = extractWindowedFeatures(df_train_reconstructed, column_indexes=csi_subcarriers)
    test_featured = extractWindowedFeatures(df_test_reconstructed, column_indexes=csi_subcarriers)

    #Find the best threshold
    # classify_presence returns five values (see load_comparison); unpack all of them.
    train_featured["Label"] = train_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
    train_AUC, train_f1, threshold, train_precision, train_recall = classify_presence(train_featured, plot_roc=False)

    #Classify test set with the threshold found on the training split
    test_featured["Label"] = test_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
    test_AUC, test_f1, _, test_precision, test_recall = classify_presence(test_featured, plot_roc=False, thr=threshold)

    results.append({
        'num_components': num_components,
        'threshold': threshold,
        'f1_train': train_f1,
        'f1_test': test_f1,
        'original_f1': og_f1
    })

    if saveCSV:
        df_test_reconstructed.to_csv(os.path.join(dumps_directory, 'presence_reconstructed_test.csv'), index=False)
        test_featured.to_csv(os.path.join(dumps_directory, 'filteredFeaturesLabeledPresence_reconstructed_test.csv'), index=False)

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file (results_directory was created above)
results_file = os.path.join(results_directory, 'accuracy.csv')
results_df.to_csv(results_file, index=False)

#### Bits

In [None]:
# PCA sizes to evaluate.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]

# Windowing parameters.
time_window = 3  # seconds
window_size = 64  # for 3 second
step_size = window_size // 2

# Reload and preprocess the dataset; all columns but the first are used as CSI features.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)

In [None]:
# Bit accounting for the PCA-only sweep. The reconstructed frames are read back from
# the CSV files written by the PCA-only accuracy run (only present when that run had
# saveCSV enabled).
results_directory = './results_csv/SQ/PCA_Only'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)
time_windows = pd.DataFrame(test_data.time_windows.numpy().flatten(), columns=["TimeWindow"])

results = []
og_f1 = load_comparison()  # NOTE(review): result is not used in this cell

for num_components in components:
    print(f"-------------- {num_components} components --------------")
    df_train = df_csi_train.copy()
    df_test = df_csi_test.copy()
    dumps_directory = f'./results/SQ/PCA_Only/{num_components}_components'
    os.makedirs(dumps_directory, exist_ok=True)

    scaled_train = scaler.fit_transform(df_train)
    scaled_test = scaler.transform(df_test)

    #Apply PCA
    scaled_train, pca = analyze_PCA(scaled_train, num_components, directory=dumps_directory, saveGraph=False, plotGraph=False)
    test_reduced = pca.transform(scaled_test)
    scaled_test = pd.DataFrame(test_reduced, columns=[f'PC{i}' for i in range(num_components)])

    # Scores are rounded to 5 decimals before counting bits.
    PCA_feature, PCA_frame, PCA_window_feature, PCA_window, PCA_total = bits_needed(scaled_test.round(5), time_windows, verbose=True)

    # Reconstruction
    reconstructed_df = pd.read_csv(os.path.join(dumps_directory, 'presence_reconstructed_test.csv'))
    # The saved file carries the "TimeWindow" column appended by the accuracy run; it is
    # not a signal feature, so exclude it from the bit count (the other bits cells drop
    # it as well).
    reconstructed_features = reconstructed_df.drop(columns=["TimeWindow"], errors="ignore")

    REC_feature, REC_frame, REC_window_feature, REC_window, REC_total = bits_needed(reconstructed_features, time_windows, verbose=True)

    results.append({
        'num_components': num_components,
        'PCA_feature': PCA_feature,
        'REC_feature': REC_feature,
        'PCA_frame': PCA_frame,
        'REC_frame': REC_frame,
        'PCA_window': PCA_window,
        'REC_window': REC_window,
        'PCA_total': PCA_total,
        'REC_total': REC_total
    })

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file (results_directory was created above)
results_file = os.path.join(results_directory, 'bits.csv')
results_df.to_csv(results_file, index=False)

#### Graphs

In [164]:
# PCA sizes plus input/output folders for the PCA-only plots.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]

results_directory, graphs_directory = './results_csv/SQ/PCA_Only', './results_graphs/SQ'
os.makedirs(graphs_directory, exist_ok=True)

In [None]:
# Join accuracy and bit statistics per num_components, add the F1 loss via f1_loss(),
# and persist the combined table.
accuracy_csv = os.path.join(results_directory, 'accuracy.csv')
bits_csv = os.path.join(results_directory, 'bits.csv')
og_f1 = pd.read_csv(accuracy_csv)
bit = pd.read_csv(bits_csv)

f1_bit = pd.merge(og_f1, bit, on=['num_components'])
max_accuracy = f1_bit['original_f1'].max()
f1_bit = f1_loss(f1_bit, max_accuracy)

results_csv = os.path.join(results_directory, 'results.csv')
f1_bit.to_csv(results_csv, index=False)

In [None]:
# F1 loss vs. bits per frame of the PCA scores (PCA-only pipeline).
results_csv = os.path.join(results_directory, 'results.csv')
f1_bit = pd.read_csv(results_csv)

bits_per_frame, loss = f1_bit['PCA_frame'], f1_bit['f1_loss']

plt.figure(figsize=(20, 10))
plt.plot(bits_per_frame, loss, marker='o', linestyle='--')
plt.title('Relative F1-Score loss with PCA applied')
plt.ylabel('Relative F1-Score Loss (%)')
plt.xlabel('Average bits per frame')
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_PCA-Only[BxF].png'))
plt.show()

In [None]:
# F1 loss vs. bits per window of the PCA scores (PCA-only pipeline).
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
plt.plot(f1_bit['PCA_window'], f1_bit['f1_loss'], marker='o', linestyle='--')
plt.title('Relative F1-Score loss with PCA applied')
# The x axis is the per-window figure (PCA_window), not the per-frame one.
plt.xlabel('Average bits per window')
plt.ylabel('Relative F1-Score Loss (%)')
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_PCA-Only[BxW].png'))
plt.show()

In [None]:
# F1 loss vs. encoded bits per frame.
# NOTE(review): the PCA-only bits run records only PCA_* and REC_* columns, so with
# results_directory pointing at PCA_Only this cell used to fail with KeyError. It looks
# copied from the "Quantization Only" graphs and writes to the same file name - confirm
# whether it should be removed or plot a PCA-only metric instead.
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

if 'ENC_frame' in f1_bit.columns:
    plt.figure(figsize=(20, 10))
    plt.plot(f1_bit['ENC_frame'], f1_bit['f1_loss'], marker='o', linestyle='--')
    plt.title('Relative F1-Score loss with quantized and encoded data')
    plt.xlabel('Average bits per frame')
    plt.xticks(np.arange(0, 230, 10))
    plt.ylabel('Relative F1-Score Loss (%)')
    plt.yticks(np.arange(0, 25, 5))
    # No legend: the single curve carries no label.
    plt.grid()
    plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded_QNT-Only[BxF].png'))
    plt.show()
else:
    print(f"Skipping encoded bits-per-frame plot: no 'ENC_frame' column in {os.path.join(results_directory, 'results.csv')}")

In [None]:
# F1 loss vs. encoded bits per window.
# NOTE(review): the PCA-only bits run records only PCA_* and REC_* columns, so with
# results_directory pointing at PCA_Only this cell used to fail with KeyError. It looks
# copied from the "Quantization Only" graphs and writes to the same file name - confirm
# whether it should be removed or plot a PCA-only metric instead.
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

if 'ENC_window' in f1_bit.columns:
    plt.figure(figsize=(20, 10))
    plt.plot(f1_bit['ENC_window'], f1_bit['f1_loss'], marker='o', linestyle='--')
    plt.title('Relative F1-score loss with quantized and encoded data')
    plt.xlabel('Average bits per window')
    plt.xticks(np.arange(0, 15500, 500))
    plt.ylabel('Relative F1-Score Loss (%)')
    plt.yticks(np.arange(0, 25, 5))
    # No legend: the single curve carries no label.
    plt.grid()
    plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded_QNT-Only[BxW].png'))
    plt.show()
else:
    print(f"Skipping encoded bits-per-window plot: no 'ENC_window' column in {os.path.join(results_directory, 'results.csv')}")

### Quantization Only

In [None]:
# Quantizer level counts to evaluate.
levels = [2 ** exponent for exponent in range(1, 9)]

# Windowing parameters.
time_window = 3  # seconds
window_size = 64  # for 3 second
step_size = window_size // 2

# Load and preprocess the dataset; all columns but the first are used as CSI features.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)

In [None]:
# Quantization-only accuracy sweep: quantize the scaled signal directly (no PCA),
# reconstruct it with the scaler, and score presence detection for every level count.
results_directory = './results_csv/SQ/QNT_Only'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)

results = []
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

dumps_directory = f'./results/SQ/QNT_Only'
os.makedirs(dumps_directory, exist_ok=True)
df_train = df_csi_train.copy()
df_test = df_csi_test.copy()

# Scaler is fitted on the training split only.
scaled_train = scaler.fit_transform(df_train)
scaled_test = scaler.transform(df_test)
df_scaled_train = pd.DataFrame(scaled_train, columns=csi_subcarriers)
df_scaled_test = pd.DataFrame(scaled_test, columns=csi_subcarriers)

for num_levels in levels:
    print(f"-------------- {num_levels} lvls --------------")
    filename = f'{num_levels}_lvls'
    sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
    os.makedirs(sub_directory, exist_ok=True)

    # Quantize: codebook comes from the training split and is reused for the test split.
    df_train_quantized, centroids, train_indices = apply_quantization(df_scaled_train, num_levels)
    df_test_quantized, test_indices = apply_existing_quantization(df_scaled_test, centroids)

    # Encode / decode round trip of the quantized test split.
    encoded_df, huffman_codes = apply_encoding(df_test_quantized)
    df = apply_decoding(encoded_df, huffman_codes)

    # Undo the scaling to get back to the signal domain.
    df_train_reconstructed = pd.DataFrame(scaler.inverse_transform(df_train_quantized), columns=csi_subcarriers)
    df_test_reconstructed = pd.DataFrame(scaler.inverse_transform(df_test_quantized), columns=csi_subcarriers)
    train_reconstructed = tf.convert_to_tensor(df_train_reconstructed.to_numpy(), dtype=tf.float32)
    test_reconstructed = tf.convert_to_tensor(df_test_reconstructed.to_numpy(), dtype=tf.float32)
    train_data.csi = train_reconstructed
    test_data.csi = test_reconstructed

    # Windowed features of the reconstructed signal.
    df_train_reconstructed["TimeWindow"] = train_data.time_windows.numpy().flatten()
    train_featured = extractWindowedFeatures(df_train_reconstructed, column_indexes=csi_subcarriers)
    df_test_reconstructed["TimeWindow"] = test_data.time_windows.numpy().flatten()
    test_featured = extractWindowedFeatures(df_test_reconstructed, column_indexes=csi_subcarriers)

    # Threshold is selected on the training split ...
    train_featured["Label"] = train_featured["Time"].apply(lambda t: getGT(t, lower_bounds, upper_bounds))
    train_AUC, train_f1, train_thr, train_precision, train_recall = classify_presence(train_featured, plot_roc=False)

    # ... and applied to the test split.
    test_featured["Label"] = test_featured["Time"].apply(lambda t: getGT(t, lower_bounds, upper_bounds))
    test_AUC, test_f1, _, test_precision, test_recall = classify_presence(test_featured, plot_roc=False, thr=train_thr)

    results.append(dict(
        num_levels=num_levels,
        threshold=train_thr,
        original_f1=og_f1,
        train_f1=train_f1,
        test_f1=test_f1,
        original_AUC=og_AUC,
        train_AUC=train_AUC,
        test_AUC=test_AUC,
        original_precision=og_precision,
        train_precision=train_precision,
        test_precision=test_precision,
        original_recall=og_recall,
        train_recall=train_recall,
        test_recall=test_recall,
    ))

    if saveCSV:
        for frame, csv_name in (
            (encoded_df, 'encodedQuantizedPCAPresence_test.csv'),
            (df_test_reconstructed, 'presence_reconstructed_test.csv'),
            (test_featured, 'filteredFeaturesLabeledPresence_reconstructed_test.csv'),
        ):
            frame.to_csv(os.path.join(sub_directory, csv_name), index=False)

# Persist the per-level metrics.
results_df = pd.DataFrame(results)
results_file = os.path.join(results_directory, 'accuracy.csv')
results_df.to_csv(results_file, index=False)

print("Done")

#### Bits

In [None]:
# Quantizer level counts to evaluate.
levels = [2 ** exponent for exponent in range(1, 9)]

# Windowing parameters.
time_window = 3  # seconds
window_size = 64  # for 3 second
step_size = window_size // 2

# Reload and preprocess the dataset; all columns but the first are used as CSI features.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(data, time_window, window_size, step_size, verbose=False)

In [None]:
# Bit accounting for the quantization-only sweep. The reconstructed frames are read back
# from the CSV files written by the quantization-only accuracy run (only present when
# that run had saveCSV enabled).
results_directory = './results_csv/SQ/QNT_Only'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)
time_windows = pd.DataFrame(test_data.time_windows.numpy().flatten(), columns=["TimeWindow"])

results = []
# NOTE(review): the baseline metrics are not used in this cell.
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

df_train = df_csi_train.copy()
df_test = df_csi_test.copy()
dumps_directory = './results/SQ/QNT_Only'
os.makedirs(dumps_directory, exist_ok=True)

scaled_train = scaler.fit_transform(df_train)
scaled_test = scaler.transform(df_test)
df_scaled_train = pd.DataFrame(scaled_train, columns=csi_subcarriers)
df_scaled_test = pd.DataFrame(scaled_test, columns=csi_subcarriers)

for num_levels in levels:
    print(f"-------------- {num_levels} lvls --------------")
    sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
    os.makedirs(sub_directory, exist_ok=True)

    # Lloyd-Max Quantization
    df_train_quantized, centroids, train_indices = apply_quantization(df_scaled_train, num_levels)
    df_test_quantized, test_indices = apply_existing_quantization(df_scaled_test, centroids)

    print("DF_QUANTIZED")
    df_indices = pd.DataFrame(test_indices, columns=csi_subcarriers)
    QT_feature, QT_frame, QT_window_feature, QT_window, QT_total = bits_needed(df_indices, time_windows, num_levels, verbose=True)

    # Encode-Decode
    df_encoded_test, huffman_codes_test = apply_encoding(df_test_quantized)
    # Use the frame encoded in this iteration; `encoded_df` was a leftover from
    # another cell (undefined on a fresh kernel, stale otherwise).
    entropyENC = compute_entropy(df_encoded_test, verbose=False)
    print("DF_ENCODED")
    ENC_feature, ENC_frame, ENC_window_feature, ENC_window, ENC_total = bits_needed(df_encoded_test, time_windows, verbose=True)

    # Reconstruction
    reconstructed_df = pd.read_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'))
    entropyREC = compute_entropy(reconstructed_df, verbose=False)
    print("DF_RECONSTRUCTED")
    # The last column is dropped before counting bits (the accuracy run appends
    # "TimeWindow" before saving this file).
    REC_feature, REC_frame, REC_window_feature, REC_window, REC_total = bits_needed(reconstructed_df.iloc[:, :-1], time_windows, verbose=True)

    results.append({
        'num_levels': num_levels,
        'QT_feature': QT_feature,
        'ENC_feature': ENC_feature,
        'REC_feature': REC_feature,
        'QT_frame': QT_frame,
        'ENC_frame': ENC_frame,
        'REC_frame': REC_frame,
        'QT_window': QT_window,
        'ENC_window': ENC_window,
        'REC_window': REC_window,
        'QT_total': QT_total,
        'ENC_total': ENC_total,
        'REC_total': REC_total
    })

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'bits.csv')
results_df.to_csv(results_file, index=False)

#### Graphs

In [134]:
# Quantizer level counts plus input/output folders for the quantization-only plots.
levels = [2 ** exponent for exponent in range(1, 9)]

results_directory, graphs_directory = './results_csv/SQ/QNT_Only', './results_graphs/SQ/'
os.makedirs(graphs_directory, exist_ok=True)

In [131]:
# Join accuracy and bit statistics per num_levels, add the F1 loss via f1_loss(),
# and persist the combined table.
accuracy_csv = os.path.join(results_directory, 'accuracy.csv')
bits_csv = os.path.join(results_directory, 'bits.csv')
og_f1 = pd.read_csv(accuracy_csv)
bit = pd.read_csv(bits_csv)

f1_bit = pd.merge(og_f1, bit, on=['num_levels'])
max_accuracy = f1_bit['original_f1'].max()
f1_bit = f1_loss(f1_bit, max_accuracy)

results_csv = os.path.join(results_directory, 'results.csv')
f1_bit.to_csv(results_csv, index=False)

In [None]:
# F1 loss vs. quantized bits per frame (quantization-only pipeline).
results_directory = './results_csv/SQ/QNT_Only'
results_csv = os.path.join(results_directory, 'results.csv')
f1_bit = pd.read_csv(results_csv)

bits_per_frame, loss = f1_bit['QT_frame'], f1_bit['f1_loss']

plt.figure(figsize=(20, 10))
plt.plot(bits_per_frame, loss, marker='o', linestyle='--')
plt.title('Relative F1-Score loss with quantized data')
plt.xlabel('Bits per frame')
plt.ylabel('Relative F1-Score Loss (%)')
plt.xticks(np.arange(0, 470, 10))
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_QNT-Only[BxF].png'))
plt.show()

In [None]:
# Plot the 'f1_loss' column against 'QT_window' (bits per window) for the
# SQ/QNT_Only results and save the figure into graphs_directory.
results_directory = './results_csv/SQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['QT_window'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized data')
# Title completed ("quantized data") to match the per-frame figure.
plt.title('Relative F1-Score loss with quantized data')
plt.xlabel('Bits per window')
plt.xticks(np.arange(0, 33000, 1000))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_QNT-Only[BxW].png'))
plt.show()

In [None]:
# Plot the 'f1_loss' column against 'ENC_frame' (average bits per frame after
# encoding) for the SQ/QNT_Only results and save the figure.
results_directory = './results_csv/SQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['ENC_frame'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized and encoded data')
plt.title('Relative F1-Score loss with quantized and encoded data')
plt.xlabel('Average bits per frame')
plt.xticks(np.arange(0, 230, 10))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded_QNT-Only[BxF].png'))
plt.show()

In [None]:
# Plot the 'f1_loss' column against 'ENC_window' (average bits per window after
# encoding) for the SQ/QNT_Only results and save the figure.
results_directory = './results_csv/SQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['ENC_window'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized and encoded data')
plt.title('Relative F1-score loss with quantized and encoded data')
plt.xlabel('Average bits per window')
plt.xticks(np.arange(0, 15500, 500))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded_QNT-Only[BxW].png'))
plt.show()

## Vector Quantization

### Comprehensive Run

In [None]:
# PCA sizes to sweep: 1..9, the even values 10..18, then 20, 25, 30 and 40.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]
# Quantization levels to sweep: powers of two from 2 to 256.
levels = [1 << exponent for exponent in range(1, 9)]

# Windowing parameters; the step is half a window.
time_window = 3   # seconds
window_size = 64  # for a 3-second window
step_size = window_size // 2

# Load the dataset and preprocess it with the configured compression method.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]  # every column except the first
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(
    data, time_window, window_size, step_size, verbose=False
)

In [None]:
# Accuracy sweep for PCA + vector quantization.
# For every (num_components, num_levels) pair: scale, reduce with PCA, quantize,
# encode, reconstruct the CSI, extract windowed features and classify presence.
# One row of train/test metrics per pair is collected and saved to accuracy.csv.
results_directory = './results_csv/VQ'
os.makedirs(results_directory, exist_ok=True)
# Snapshot the CSI tensors before the loop: train_data.csi / test_data.csi are
# overwritten inside the loop, so every iteration must start from these copies.
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)

results = []
# Reference metrics; stored unchanged in every result row as original_*.
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

for num_components in components:
    print(f"-------------- {num_components} components --------------")
    df_train = df_csi_train.copy()
    df_test = df_csi_test.copy()
    dumps_directory = f'./results/VQ/{num_components}_components'
    os.makedirs(dumps_directory, exist_ok=True)

    # Fit the scaler on the training split only, then apply it to the test split.
    scaled_train = scaler.fit_transform(df_train)
    scaled_test = scaler.transform(df_test)

    # Apply PCA: analyze_PCA returns the reduced training data and the fitted
    # PCA object, which is then used to project the test split.
    scaled_train, pca = analyze_PCA(scaled_train, num_components, directory=dumps_directory, saveGraph=True, plotGraph=True)
    test_reduced = pca.transform(scaled_test)
    scaled_test = pd.DataFrame(test_reduced, columns=[f'PC{i}' for i in range(num_components)])

    for num_levels in levels:
        print(f"-------------- {num_components} components w/ {num_levels} lvls --------------")
        sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
        os.makedirs(sub_directory, exist_ok=True)
        filename = f'{num_levels}_lvls'  # NOTE(review): not used anywhere in this cell

        # Quantize the data: the codebook is built from the training split and
        # reused as-is for the test split.
        train_quantized, codebook = apply_vector_quantization(scaled_train, num_levels) # vector quantization
        test_quantized = apply_existing_vector_quantization(scaled_test, codebook)

        # presumably replaces each centroid assignment with its codebook vector
        # (the result has one column per principal component) - TODO confirm
        train_quantized = get_quantized_data(train_quantized, codebook)
        test_quantized = get_quantized_data(test_quantized, codebook)

        df_train_quantized = pd.DataFrame(train_quantized, columns=[f'PC{i}' for i in range(num_components)])
        df_test_quantized = pd.DataFrame(test_quantized, columns=[f'PC{i}' for i in range(num_components)])

        # Encode-Decode
        # NOTE(review): the decoded frame `df` is not used afterwards in this cell.
        encoded_df, huffman_codes = apply_encoding(df_test_quantized)
        df = apply_decoding(encoded_df, huffman_codes)

        # Reconstruct both splits back into the original subcarrier columns
        df_train_reconstructed = reconstruct_data(df_train_quantized, pca, scaler, csi_subcarriers)
        df_test_reconstructed = reconstruct_data(df_test_quantized, pca, scaler, csi_subcarriers)
        train_reconstructed = tf.convert_to_tensor(df_train_reconstructed.to_numpy(), dtype=tf.float32)
        test_reconstructed = tf.convert_to_tensor(df_test_reconstructed.to_numpy(), dtype=tf.float32)
        # NOTE(review): this mutates train_data/test_data. Re-running this cell
        # without re-running the windowing cell would snapshot the last
        # reconstruction instead of the original CSI.
        train_data.csi = train_reconstructed
        test_data.csi = test_reconstructed

        # Feature Extraction
        df_train_reconstructed["TimeWindow"] = train_data.time_windows.numpy().flatten()
        df_test_reconstructed["TimeWindow"] = test_data.time_windows.numpy().flatten()
        train_featured = extractWindowedFeatures(df_train_reconstructed, column_indexes=csi_subcarriers)     
        test_featured = extractWindowedFeatures(df_test_reconstructed, column_indexes=csi_subcarriers)

        # Find the best threshold on the training split
        train_featured["Label"] = train_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
        print("Train")
        train_AUC, train_f1, train_thr, train_precision, train_recall = classify_presence(train_featured, plot_roc=False)

        # Classify the test set with the threshold obtained on the training split
        test_featured["Label"] = test_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
        print("Test")
        test_AUC, test_f1, _, test_precision, test_recall = classify_presence(test_featured, plot_roc=False, thr=train_thr)

        results.append({
            'num_components': num_components,
            'num_levels': num_levels,
            'threshold': train_thr,
            'original_f1': og_f1,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'original_AUC': og_AUC,
            'train_AUC': train_AUC,
            'test_AUC': test_AUC,
            'original_precision': og_precision,
            'train_precision': train_precision,
            'test_precision': test_precision,
            'original_recall': og_recall,
            'train_recall': train_recall,
            'test_recall': test_recall
        })

        # Per-configuration dumps; presence_reconstructed_test.csv is read back
        # by the "Bits" cell, which therefore requires saveCSV to be True.
        if (saveCSV):
            encoded_df.to_csv(os.path.join(sub_directory, 'encodedQuantizedPCAPresence_test.csv'), index=False)
            df_test_reconstructed.to_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'), index=False)
            test_featured.to_csv(os.path.join(sub_directory, 'filteredFeaturesLabeledPresence_reconstructed_test.csv'), index=False)

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'accuracy.csv')
results_df.to_csv(results_file, index=False)

print("Done")

#### Bits

In [None]:
# PCA sizes to sweep: 1..9, the even values 10..18, then 20, 25, 30 and 40.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]
# Quantization levels to sweep: powers of two from 2 to 256.
levels = [1 << exponent for exponent in range(1, 9)]

# Windowing parameters; the step is half a window.
time_window = 3   # seconds
window_size = 64  # for a 3-second window
step_size = window_size // 2

# Reload and preprocess the dataset so this section starts from fresh CSI data.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]  # every column except the first
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(
    data, time_window, window_size, step_size, verbose=False
)

In [None]:
# Bit-cost sweep for PCA + vector quantization.
# For every (num_components, num_levels) pair, bits_needed() is evaluated on the
# PCA-reduced, quantized, encoded and reconstructed test data, and one row per
# pair is saved to bits.csv.
# Requires the per-configuration 'presence_reconstructed_test.csv' files written
# by the accuracy sweep (only produced when saveCSV is True).
results_directory = './results_csv/VQ'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)
# Window id of every test frame, passed to bits_needed() alongside the data.
time_windows = pd.DataFrame(test_data.time_windows.numpy().flatten(), columns=["TimeWindow"])

results = []
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

for num_components in components:
    print(f"-------------- {num_components} components --------------")
    df_train = df_csi_train.copy()
    df_test = df_csi_test.copy()
    dumps_directory = f'./results/VQ/{num_components}_components'
    os.makedirs(dumps_directory, exist_ok=True)

    # Fit the scaler on the training split only, then apply it to the test split.
    scaled_train = scaler.fit_transform(df_train)
    scaled_test = scaler.transform(df_test)

    # Apply PCA: fit on the training split, project the test split.
    scaled_train, pca = analyze_PCA(scaled_train, num_components, directory=dumps_directory, saveGraph=True, plotGraph=True)
    test_reduced = pca.transform(scaled_test)
    scaled_test = pd.DataFrame(test_reduced, columns=[f'PC{i}' for i in range(num_components)])

    # Bit cost of the PCA-reduced (not yet quantized) test data.
    PCA_feature, PCA_frame, PCA_window_feature, PCA_window, PCA_total = bits_needed(scaled_test, time_windows, verbose=True)

    for num_levels in levels:
        sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
        os.makedirs(sub_directory, exist_ok=True)

        # Vector quantization: codebook built on the training split, reused for the test split.
        train_quantized, codebook = apply_vector_quantization(scaled_train, num_levels)
        test_quantized = apply_existing_vector_quantization(scaled_test, codebook)

        print("DF_QUANTIZED")
        df_test_quantized = pd.DataFrame(test_quantized, columns=['centroid_num'])
        QT_feature, QT_frame, QT_window_feature, QT_window, QT_total = bits_needed(df_test_quantized, time_windows, num_levels, verbose=True)

        # Encode
        df_encoded_test, huffman_codes_test = apply_encoding(df_test_quantized)
        # Use the frame encoded in this iteration. The previous `encoded_df` was a
        # leftover global from the accuracy sweep (NameError on a fresh kernel,
        # stale data otherwise).
        entropyENC = compute_entropy(df_encoded_test, verbose=False)
        print("DF_ENCODED")
        ENC_feature, ENC_frame, ENC_window_feature, ENC_window, ENC_total = bits_needed(df_encoded_test, time_windows, verbose=True)

        # Reconstruction: read back the test reconstruction saved by the accuracy sweep.
        reconstructed_df = pd.read_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'))
        entropyREC = compute_entropy(reconstructed_df, verbose=False)
        print("DF_RECONSTRUCTED")
        # The last column (TimeWindow, appended before saving) is excluded from the bit count.
        REC_feature, REC_frame, REC_window_feature, REC_window, REC_total = bits_needed(reconstructed_df.iloc[:, :-1], time_windows, verbose=True)

        results.append({
            'num_components': num_components,
            'num_levels': num_levels,
            'PCA_feature': PCA_feature,
            'QT_feature': QT_feature,
            'ENC_feature': ENC_feature,
            'REC_feature': REC_feature,
            'PCA_frame': PCA_frame,
            'QT_frame': QT_frame,
            'ENC_frame': ENC_frame,
            'REC_frame': REC_frame,
            'QT_window': QT_window,
            'PCA_window': PCA_window,
            'ENC_window': ENC_window,
            'REC_window': REC_window,
            'PCA_total': PCA_total,
            'QT_total': QT_total,
            'ENC_total': ENC_total,
            'REC_total': REC_total
        })

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'bits.csv')
results_df.to_csv(results_file, index=False)

#### Graphs

In [None]:
# PCA sizes whose curves are drawn: 1..9, the even values 10..18, then 20, 25, 30 and 40.
components = [*range(1, 10), *range(10, 20, 2), *range(20, 35, 5), 40]
# Quantization levels: powers of two from 2 to 256.
levels = [1 << exponent for exponent in range(1, 9)]

# Where the VQ result CSVs are read from and where figures are saved.
graphs_directory = './results_graphs/VQ'
results_directory = './results_csv/VQ'
os.makedirs(graphs_directory, exist_ok=True)

In [None]:
# Join the accuracy and bit-cost tables on (num_components, num_levels), attach
# the F1 loss computed by f1_loss() against the best original F1, and persist it.
og_f1, bit = (
    pd.read_csv(os.path.join(results_directory, csv_name))
    for csv_name in ('accuracy.csv', 'bits.csv')
)
f1_bit = pd.merge(og_f1, bit, on=['num_components', 'num_levels'])
max_accuracy = f1_bit['original_f1'].max()
f1_bit = f1_loss(f1_bit, max_accuracy)

results_path = os.path.join(results_directory, 'results.csv')
f1_bit.to_csv(results_path, index=False)

In [None]:
# 'f1_loss' against 'QT_frame', one curve per PCA size listed in `components`.
results_directory = './results_csv/VQ'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

fig, ax = plt.subplots(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    # Component counts that are not in `components` are left out of the figure.
    if num_components in components:
        target_data = f1_bit.loc[f1_bit['num_components'] == num_components]
        ax.plot(
            target_data['QT_frame'],
            target_data['f1_loss'],
            marker='o',
            linestyle='--',
            label=f'{num_components} components',
        )

ax.set_title('Relative F1-Score loss with quantized data and PCA applied')
ax.set_xlabel('Bits per frame')
ax.set_ylabel('Relative F1-Score Loss (%)')
ax.set_yticks(np.arange(0, 25, 5))
ax.legend()
ax.grid()
fig.savefig(os.path.join(graphs_directory, 'accuracy-bit[BxF].png'))
plt.show()

In [None]:
# 'f1_loss' against 'QT_window', one curve per PCA size listed in `components`.
results_directory = './results_csv/VQ'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

fig, ax = plt.subplots(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    # Component counts that are not in `components` are left out of the figure.
    if num_components in components:
        target_data = f1_bit.loc[f1_bit['num_components'] == num_components]
        ax.plot(
            target_data['QT_window'],
            target_data['f1_loss'],
            marker='o',
            linestyle='--',
            label=f'{num_components} components',
        )

ax.set_title('Relative F1-Score loss with quantized data and PCA applied')
ax.set_xlabel('Bits per window')
ax.set_ylabel('Relative F1-Score Loss (%)')
ax.set_yticks(np.arange(0, 25, 5))
ax.legend()
ax.grid()
fig.savefig(os.path.join(graphs_directory, 'accuracy-bit[BxW].png'))
plt.show()

In [None]:
# 'f1_loss' against 'ENC_frame', one curve per PCA size listed in `components`.
results_directory = './results_csv/VQ'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

fig, ax = plt.subplots(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    # Component counts that are not in `components` are left out of the figure.
    if num_components in components:
        target_data = f1_bit.loc[f1_bit['num_components'] == num_components]
        ax.plot(
            target_data['ENC_frame'],
            target_data['f1_loss'],
            marker='o',
            linestyle='--',
            label=f'{num_components} components',
        )

ax.set_title('Relative F1-Score loss with quantized and encoded data and PCA applied')
ax.set_xlabel('Average bits per frame')
ax.set_ylabel('Relative F1-Score Loss (%)')
ax.set_yticks(np.arange(0, 25, 5))
ax.legend()
ax.grid()
fig.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded[BxF].png'))
plt.show()

In [None]:
# 'f1_loss' against 'ENC_window', one curve per PCA size listed in `components`.
results_directory = './results_csv/VQ'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

fig, ax = plt.subplots(figsize=(20, 10))
for num_components in f1_bit['num_components'].unique():
    # Component counts that are not in `components` are left out of the figure.
    if num_components in components:
        target_data = f1_bit.loc[f1_bit['num_components'] == num_components]
        ax.plot(
            target_data['ENC_window'],
            target_data['f1_loss'],
            marker='o',
            linestyle='--',
            label=f'{num_components} components',
        )

ax.set_title('Relative F1-score loss with quantized and encoded data and PCA applied')
ax.set_xlabel('Average bits per window')
ax.set_ylabel('Relative F1-Score Loss (%)')
ax.set_yticks(np.arange(0, 25, 5))
ax.legend()
ax.grid()
fig.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded[BxW].png'))
plt.show()

### Quantization Only

In [None]:
# Quantization levels to sweep: powers of two from 2 to 256.
levels = [1 << exponent for exponent in range(1, 9)]

# Windowing parameters; the step is half a window.
time_window = 3   # seconds
window_size = 64  # for a 3-second window
step_size = window_size // 2

# Load the dataset and preprocess it with the configured compression method.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]  # every column except the first
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(
    data, time_window, window_size, step_size, verbose=False
)

In [None]:
# Accuracy sweep for vector quantization without PCA.
# For every num_levels: quantize the scaled CSI, encode it, undo the scaling,
# extract windowed features and classify presence. One row of train/test
# metrics per level is collected and saved to accuracy.csv.
results_directory = './results_csv/VQ/QNT_Only'
os.makedirs(results_directory, exist_ok=True)
# Snapshot the CSI tensors before the loop: train_data.csi / test_data.csi are
# overwritten inside the loop.
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)

results = []
# Reference metrics; stored unchanged in every result row as original_*.
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

df_train = df_csi_train.copy()
df_test = df_csi_test.copy()
dumps_directory = f'./results/VQ/QNT_Only'
os.makedirs(dumps_directory, exist_ok=True)

# Fit the scaler on the training split only, then apply it to the test split.
scaled_train = scaler.fit_transform(df_train)
scaled_test = scaler.transform(df_test)
# NOTE(review): these two frames are not used further down in this cell.
df_scaled_train = pd.DataFrame(scaled_train, columns=csi_subcarriers)
df_scaled_test = pd.DataFrame(scaled_test, columns=csi_subcarriers)

for num_levels in levels:
    print(f"-------------- {num_levels} lvls --------------")
    sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
    os.makedirs(sub_directory, exist_ok=True)
    filename = f'{num_levels}_lvls'  # NOTE(review): not used anywhere in this cell

    # Quantize the data: the codebook is built from the training split and
    # reused as-is for the test split.
    train_quantized, codebook = apply_vector_quantization(scaled_train, num_levels) # vector quantization
    test_quantized = apply_existing_vector_quantization(scaled_test, codebook)

    # presumably replaces each centroid assignment with its codebook vector
    # (the result has one column per subcarrier) - TODO confirm
    train_quantized = get_quantized_data(train_quantized, codebook)
    test_quantized = get_quantized_data(test_quantized, codebook)

    df_train_quantized = pd.DataFrame(train_quantized, columns=csi_subcarriers)
    df_test_quantized = pd.DataFrame(test_quantized, columns=csi_subcarriers)

    # Encode-Decode
    # NOTE(review): the decoded frame `df` is not used afterwards in this cell.
    encoded_df, huffman_codes = apply_encoding(df_test_quantized)
    df = apply_decoding(encoded_df, huffman_codes)

    # Reconstruct both splits by undoing the standardisation
    df_train_reconstructed = scaler.inverse_transform(df_train_quantized)
    df_test_reconstructed = scaler.inverse_transform(df_test_quantized)
    df_train_reconstructed = pd.DataFrame(df_train_reconstructed, columns=csi_subcarriers)
    df_test_reconstructed = pd.DataFrame(df_test_reconstructed, columns=csi_subcarriers)
    train_reconstructed = tf.convert_to_tensor(df_train_reconstructed.to_numpy(), dtype=tf.float32)
    test_reconstructed = tf.convert_to_tensor(df_test_reconstructed.to_numpy(), dtype=tf.float32)
    # NOTE(review): this mutates train_data/test_data. Re-running this cell
    # without re-running the windowing cell would snapshot the last
    # reconstruction instead of the original CSI.
    train_data.csi = train_reconstructed
    test_data.csi = test_reconstructed

    # Feature Extraction
    df_train_reconstructed["TimeWindow"] = train_data.time_windows.numpy().flatten()
    df_test_reconstructed["TimeWindow"] = test_data.time_windows.numpy().flatten()
    train_featured = extractWindowedFeatures(df_train_reconstructed, column_indexes=csi_subcarriers)     
    test_featured = extractWindowedFeatures(df_test_reconstructed, column_indexes=csi_subcarriers)

    # Find the best threshold on the training split
    train_featured["Label"] = train_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
    train_AUC, train_f1, train_thr, train_precision, train_recall = classify_presence(train_featured, plot_roc=False)   

    # Classify the test set with the threshold obtained on the training split
    test_featured["Label"] = test_featured["Time"].apply(lambda x: getGT(x,lower_bounds,upper_bounds))
    test_AUC, test_f1, _, test_precision, test_recall = classify_presence(test_featured, plot_roc=False, thr=train_thr)

    results.append({
        'num_levels': num_levels,
        'threshold': train_thr,
        'original_f1': og_f1,
        'train_f1': train_f1,
        'test_f1': test_f1,
        'original_AUC': og_AUC,
        'train_AUC': train_AUC,
        'test_AUC': test_AUC,
        'original_precision': og_precision,
        'train_precision': train_precision,
        'test_precision': test_precision,
        'original_recall': og_recall,
        'train_recall': train_recall,
        'test_recall': test_recall
    })

    # Per-level dumps; presence_reconstructed_test.csv is read back by the
    # "Bits" cell, which therefore requires saveCSV to be True.
    if (saveCSV):
        encoded_df.to_csv(os.path.join(sub_directory, 'encodedQuantizedPCAPresence_test.csv'), index=False)
        df_test_reconstructed.to_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'), index=False)
        test_featured.to_csv(os.path.join(sub_directory, 'filteredFeaturesLabeledPresence_reconstructed_test.csv'), index=False)

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'accuracy.csv')
results_df.to_csv(results_file, index=False)

print("Done")

#### Bits

In [None]:
# Quantization levels to sweep: powers of two from 2 to 256.
levels = [1 << exponent for exponent in range(1, 9)]

# Windowing parameters; the step is half a window.
time_window = 3   # seconds
window_size = 64  # for a 3-second window
step_size = window_size // 2

# Reload and preprocess the dataset so this section starts from fresh CSI data.
data = data_preprocessing(pd.read_csv("../../datasets/filteredPresence.csv"), method)
csi_subcarriers = data.columns[1:]  # every column except the first
num_features = len(csi_subcarriers)

train_data, test_data = data_windowing(
    data, time_window, window_size, step_size, verbose=False
)

In [None]:
# Bit-cost sweep for vector quantization without PCA.
# For every num_levels, bits_needed() is evaluated on the quantized, encoded and
# reconstructed test data, and one row per level is saved to bits.csv.
# Requires the per-level 'presence_reconstructed_test.csv' files written by the
# accuracy sweep (only produced when saveCSV is True).
results_directory = './results_csv/VQ/QNT_Only'
os.makedirs(results_directory, exist_ok=True)
df_csi_train = pd.DataFrame(train_data.csi.numpy(), columns=csi_subcarriers)
df_csi_test = pd.DataFrame(test_data.csi.numpy(), columns=csi_subcarriers)
# Window id of every test frame, passed to bits_needed() alongside the data.
time_windows = pd.DataFrame(test_data.time_windows.numpy().flatten(), columns=["TimeWindow"])

results = []
og_AUC, og_f1, _, og_precision, og_recall = load_comparison()

df_train = df_csi_train.copy()
df_test = df_csi_test.copy()
dumps_directory = f'./results/VQ/QNT_Only'
os.makedirs(dumps_directory, exist_ok=True)

# Fit the scaler on the training split only, then apply it to the test split.
scaled_train = scaler.fit_transform(df_train)
scaled_test = scaler.transform(df_test)
df_scaled_train = pd.DataFrame(scaled_train, columns=csi_subcarriers)
df_scaled_test = pd.DataFrame(scaled_test, columns=csi_subcarriers)

for num_levels in levels:
    print(f"-------------- {num_levels} lvls --------------")
    sub_directory = os.path.join(dumps_directory, f'lvls_{num_levels}')
    os.makedirs(sub_directory, exist_ok=True)

    # Vector quantization: codebook built on the training split, reused for the test split.
    train_quantized, codebook = apply_vector_quantization(scaled_train, num_levels)
    test_quantized = apply_existing_vector_quantization(scaled_test, codebook)

    print("DF_QUANTIZED")
    df_test_quantized = pd.DataFrame(test_quantized, columns=['centroid_num'])
    QT_feature, QT_frame, QT_window_feature, QT_window, QT_total = bits_needed(df_test_quantized, time_windows, num_levels, verbose=True)

    # Encode
    df_encoded_test, huffman_codes_test = apply_encoding(df_test_quantized)
    # Use the frame encoded in this iteration. The previous `encoded_df` was a
    # leftover global from the accuracy sweep (NameError on a fresh kernel,
    # stale data otherwise).
    entropyENC = compute_entropy(df_encoded_test, verbose=False)
    print("DF_ENCODED")
    ENC_feature, ENC_frame, ENC_window_feature, ENC_window, ENC_total = bits_needed(df_encoded_test, time_windows, verbose=True)

    # Reconstruction: read back the test reconstruction saved by the accuracy sweep.
    reconstructed_df = pd.read_csv(os.path.join(sub_directory, 'presence_reconstructed_test.csv'))
    entropyREC = compute_entropy(reconstructed_df, verbose=False)
    print("DF_RECONSTRUCTED")
    # The last column (TimeWindow, appended before saving) is excluded from the bit count.
    REC_feature, REC_frame, REC_window_feature, REC_window, REC_total = bits_needed(reconstructed_df.iloc[:, :-1], time_windows, verbose=True)

    results.append({
        'num_levels': num_levels,
        'QT_feature': QT_feature,
        'ENC_feature': ENC_feature,
        'REC_feature': REC_feature,
        'QT_frame': QT_frame,
        'ENC_frame': ENC_frame,
        'REC_frame': REC_frame,
        'QT_window': QT_window,
        'ENC_window': ENC_window,
        'REC_window': REC_window,
        'QT_total': QT_total,
        'ENC_total': ENC_total,
        'REC_total': REC_total
    })

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(results_directory, 'bits.csv')
results_df.to_csv(results_file, index=False)

#### Graphs

In [195]:
# Quantization levels swept by the experiment: powers of two from 2 to 256.
levels = [1 << exponent for exponent in range(1, 9)]

# Where the VQ/QNT_Only result CSVs are read from and where figures are saved.
graphs_directory = './results_graphs/VQ/'
results_directory = './results_csv/VQ/QNT_Only'
os.makedirs(graphs_directory, exist_ok=True)

In [196]:
# Join the accuracy and bit-cost tables on the quantization level, attach the
# F1 loss computed by f1_loss() against the best original F1, and persist it.
og_f1, bit = (
    pd.read_csv(os.path.join(results_directory, csv_name))
    for csv_name in ('accuracy.csv', 'bits.csv')
)
f1_bit = pd.merge(og_f1, bit, on=['num_levels'])
max_accuracy = f1_bit['original_f1'].max()
f1_bit = f1_loss(f1_bit, max_accuracy)

results_path = os.path.join(results_directory, 'results.csv')
f1_bit.to_csv(results_path, index=False)

In [None]:
# Plot the 'f1_loss' column against 'QT_frame' (bits per frame) for the
# VQ/QNT_Only results and save the figure into graphs_directory.
results_directory = './results_csv/VQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['QT_frame'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized data')
plt.title('Relative F1-Score loss with quantized data')
plt.xlabel('Bits per frame')
plt.xticks(np.arange(0, 10, 1))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_QNT-Only[BxF].png'))
plt.show()

In [None]:
# Plot the 'f1_loss' column against 'QT_window' (bits per window) for the
# VQ/QNT_Only results and save the figure into graphs_directory.
results_directory = './results_csv/VQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['QT_window'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized data')
# Title completed ("quantized data") to match the per-frame figure.
plt.title('Relative F1-Score loss with quantized data')
plt.xlabel('Bits per window')
plt.xticks(np.arange(0, 600, 50))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_QNT-Only[BxW].png'))
plt.show()

In [None]:
# Plot the 'f1_loss' column against 'ENC_frame' (average bits per frame after
# encoding) for the VQ/QNT_Only results and save the figure.
results_directory = './results_csv/VQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['ENC_frame'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized and encoded data')
plt.title('Relative F1-Score loss with quantized and encoded data')
plt.xlabel('Average bits per frame')
plt.xticks(np.arange(0, 10, 1))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded_QNT-Only[BxF].png'))
plt.show()

In [None]:
# Plot the 'f1_loss' column against 'ENC_window' (average bits per window after
# encoding) for the VQ/QNT_Only results and save the figure.
results_directory = './results_csv/VQ/QNT_Only'
f1_bit = pd.read_csv(os.path.join(results_directory, 'results.csv'))

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() with no labelled artist has nothing to show.
plt.plot(f1_bit['ENC_window'], f1_bit['f1_loss'], marker='o', linestyle='--', label='Quantized and encoded data')
plt.title('Relative F1-score loss with quantized and encoded data')
plt.xlabel('Average bits per window')
plt.xticks(np.arange(0, 400, 50))
plt.ylabel('Relative F1-Score Loss (%)')
plt.yticks(np.arange(0, 25, 5))
plt.legend()
plt.grid()
plt.savefig(os.path.join(graphs_directory, 'accuracy-bit_Encoded_QNT-Only[BxW].png'))
plt.show()