# VAE training and pre-processing

In [43]:
import os
import math
import string
import pickle
import scipy.io as sio
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import heapq
import cmath
import warnings
import tensorflow as tf
import tf_keras
import keras
import tensorflow.keras.backend as K

from enum import Enum
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from collections import Counter

from scipy.stats import dirichlet
from scipy.cluster import vq


# NOTE(review): tensorflow, tf_keras and keras are already imported above when
# this flag is set; presumably it has to be exported before the first
# TensorFlow import to have any effect -- confirm against the TF documentation.
os.environ["TF_USE_LEGACY_KERAS"]= '1' # Use legacy keras for compatibility
warnings.filterwarnings("ignore")  # silences every warning category for the whole process

# Seed all random number generators used below with the same value.
random_state = 42
random.seed(random_state)
np.random.seed(random_state) # predictable random numbers, for demonstration only
tf.random.set_seed(random_state) # reproducibility
os.environ['TF_DETERMINISTIC_OPS'] = '1' # make operations deterministic
# NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
# start-up, so setting it here would affect child processes only -- confirm.
os.environ['PYTHONHASHSEED'] = str(random_state) # reproducibility

In [44]:
class Compression_Method(Enum):
    """Representation of the CSI samples that is compressed with PCA."""

    # PCA on the X and Y components, filtering afterwards (1)
    XY = 1
    # PCA on amplitude and phase, filtering afterwards (2)
    AmpPhase = 2
    # PCA on amplitude and phase of data that was filtered beforehand (3)
    AmpPhaseFiltered = 3

# Global experiment configuration, read by the functions defined below.
#Modify this to change the approach used: XY, AmpPhase, AmpPhaseFiltered
method = Compression_Method.AmpPhase
scaler = StandardScaler()  # shared scaler; fitted inside find_n_components
ignorePhases = True        # when True, data_preprocessing drops every 'Phase*' column
saveCSV = True             # in the plotting helpers below this flag gates plt.savefig

latent_dim = 1 
num_activities = 2 #present or absent

# Output locations; both directories are created eagerly at import time.
folder_name = f'datasets/vae_weights'
base_directory = 'results'  # re-assigned to './results/VAE' in the MLP section below
os.makedirs(folder_name, exist_ok=True)
os.makedirs(base_directory, exist_ok=True)

## Variational Auto-Encoder

In [45]:
class Sampling(tf_keras.layers.Layer):
    """Reparameterisation layer: draws ``z = z_mean + exp(0.5 * z_log_var) * eps``.

    The layer derives from ``tf_keras.layers.Layer`` so that it belongs to the
    same Keras implementation as the ``tf_keras`` layers and models it is
    combined with in ``create_csi_encoder``.
    """

    def call(self, inputs):
        """Sample one latent vector per batch element.

        Args:
            inputs: pair ``(z_mean, z_log_var)`` of rank-2 tensors; the first
                two dimensions are read to size the noise tensor.

        Returns:
            Tensor with the same leading two dimensions as ``z_mean``.
        """
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        # eps ~ N(0, 1); scaling by exp(0.5 * log_var) turns it into the std-dev term.
        epsilon = tf_keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon
    
def create_csi_encoder(input_shape, latent_dim):
    """Build the convolutional encoder of the VAE.

    Args:
        input_shape: shape of one input sample (without the batch axis).
        latent_dim: size of the latent space.

    Returns:
        A ``tf_keras.Model`` named ``'encoder'`` whose outputs are
        ``[z_mean, z_log_var, z]``.
    """
    layers = tf_keras.layers
    # (filters, strides) of the three 3x3 convolutions, in application order.
    conv_specs = ((32, (3, 2)), (64, (2, 2)), (128, (1, 2)))

    encoder_inputs = tf_keras.Input(shape=input_shape)
    features = encoder_inputs
    for filters, strides in conv_specs:
        features = layers.Conv2D(
            filters, (3, 3), activation='relu', strides=strides, padding='same'
        )(features)
    features = layers.Flatten()(features)
    features = layers.Dense(32, activation='relu')(features)

    # Two parallel heads parameterise the approximate posterior.
    z_mean = layers.Dense(latent_dim, name='z_mean')(features)
    z_log_var = layers.Dense(latent_dim, name='z_log_var')(features)
    z = Sampling()([z_mean, z_log_var])

    return tf_keras.Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')


def create_csi_decoder(input_shape, latent_dim, out_filter):
    """Build the transposed-convolution decoder of the VAE.

    Args:
        input_shape: shape the latent vector is projected and reshaped to
            before the transposed convolutions.
        latent_dim: size of the latent space.
        out_filter: number of channels of the reconstructed output.

    Returns:
        A ``tf_keras.Model`` named ``'decoder'``.
    """
    layers = tf_keras.layers
    # (filters, strides) of the three up-sampling stages, in application order.
    upsampling_specs = ((128, (3, 2)), (64, (2, 2)), (32, (1, 2)))

    decoder_inputs = tf_keras.Input(shape=(latent_dim,))
    features = layers.Dense(math.prod(input_shape), activation='relu')(decoder_inputs)
    features = layers.Reshape(input_shape)(features)
    for filters, strides in upsampling_specs:
        features = layers.Conv2DTranspose(
            filters, (3, 3), activation='relu', strides=strides, padding='same'
        )(features)
    # Final layer keeps the spatial size and squashes the values with a sigmoid.
    decoder_outputs = layers.Conv2DTranspose(
        out_filter, (3, 3), activation='sigmoid', padding='same'
    )(features)

    return tf_keras.Model(decoder_inputs, decoder_outputs, name='decoder')

In [46]:
class VAE(tf_keras.Model):
    """Variational auto-encoder made of ``create_csi_encoder`` and ``create_csi_decoder``.

    Training goes through the custom ``train_step`` below, which minimises the
    sum of a binary cross-entropy reconstruction term and the KL divergence
    of the approximate posterior.
    """

    def __init__(self, enc_input_shape=(6, 56, 1), dec_input_shape=(1, 7, 128), latent_dim=1, **kwargs):
        """Create encoder, decoder and the three running-mean loss trackers.

        Args:
            enc_input_shape: shape of one encoder input sample.
            dec_input_shape: shape the decoder reshapes the latent projection to.
            latent_dim: size of the latent space.
            **kwargs: forwarded to ``tf_keras.Model``.
        """
        super().__init__(**kwargs)
        self.encoder = create_csi_encoder(enc_input_shape, latent_dim)
        # The decoder emits as many channels as the encoder input has.
        self.decoder = create_csi_decoder(dec_input_shape, latent_dim, enc_input_shape[-1])
        self.total_loss_tracker = tf_keras.metrics.Mean(name='total_loss')
        self.reconstruction_loss_tracker = tf_keras.metrics.Mean(name='reconstruction_loss')
        self.kl_loss_tracker = tf_keras.metrics.Mean(name='kl_loss')

        # Print both architectures as soon as the model is instantiated.
        self.encoder.summary()
        self.decoder.summary()

    @property
    def metrics(self):
        """Trackers exposed to Keras as the model's metrics."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: batch whose first element (``data[0]``) holds the inputs;
                any further elements are ignored here.

        Returns:
            Dict with the running means of the total, reconstruction and KL losses.
        """
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data[0])
            reconstruction = self.decoder(z)

            # Cross-entropy is summed over axes 1 and 2, then averaged over the batch.
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    tf_keras.losses.binary_crossentropy(data[0], reconstruction), axis=(1, 2)
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            'loss': self.total_loss_tracker.result(),
            'reconstruction_loss': self.reconstruction_loss_tracker.result(),
            'kl_loss': self.kl_loss_tracker.result(),
        }

    def call(self, inputs, training=None, mask=None):
        """Encode ``inputs``, sample a latent vector and return its reconstruction.

        The previous body was a bare ``pass``, so calling the model returned
        ``None``. ``train_step`` does not go through this method, so training
        is unaffected. ``mask`` is accepted for interface compatibility and
        is not used.
        """
        _, _, z = self.encoder(inputs, training=training)
        return self.decoder(z, training=training)


In [47]:
def apply_vae_encoder(vae, source):
    """Encode every batch of ``source`` with the VAE encoder.

    Args:
        vae: object exposing ``encoder.predict(data, verbose=0)`` that returns
            ``(z_mean, z_log_var, z)``.
        source: iterable of ``(data, labels)`` batches.

    Returns:
        ``(z_data, z_labels)``: ``z_data`` stacks ``[z_mean | z_log_var]`` for
        all samples (``2 * latent_dim`` columns, float64) and ``z_labels`` is
        the flat float64 label vector. An empty ``source`` yields arrays of
        shape ``(0, 2)`` and ``(0,)``, as before.
    """
    z_chunks = []
    label_chunks = []

    for data, labels in source:
        z_mean, z_log_var, _ = vae.encoder.predict(data, verbose=0)
        z_chunks.append(np.concatenate([z_mean, z_log_var], axis=1))
        # ravel() flattens the labels whatever singleton axes the batch carries.
        label_chunks.append(np.asarray(labels).ravel())

    if not z_chunks:
        return np.zeros([0, 2]), np.zeros([0])

    # Concatenate once at the end instead of re-copying the result per batch;
    # float64 matches the dtype the zero-initialised accumulators produced.
    z_data = np.concatenate(z_chunks, axis=0).astype(np.float64, copy=False)
    z_labels = np.concatenate(label_chunks, axis=0).astype(np.float64, copy=False)
    return z_data, z_labels

In [48]:
# Training callbacks. The '{epoch:04d}' placeholder is left for Keras to fill
# in, so each epoch writes its own weights-only checkpoint file.
checkpoint_path = f'./{folder_name}/' + 'cp-{epoch:04d}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)
checkpoint_cb = tf_keras.callbacks.ModelCheckpoint(checkpoint_path, verbose=1, save_weights_only=True)
# Stops training once the monitored 'loss' has not improved for 3 epochs.
early_stopping_cb = tf_keras.callbacks.EarlyStopping(monitor='loss', patience=3)
# append=True: repeated runs keep adding rows to the same log file.
csv_logger_cb = tf_keras.callbacks.CSVLogger(f'./{folder_name}/model_history_log.csv', append=True)

## Variables and Functions definitions

In [49]:
# Index suffixes of the per-index columns ('Ampl<i>', 'Phase<i>', 'CSI<i>').
notInterestedIndexes = list(range(-32,-28)) + list(range(0,1)) + list(range(29,32)) #null columns in the dataset
interestedIndexes = list(range(-28,0)) + list(range(1,29)) #non null columns in the dataset

# NOTE(review): filterData/filterData2 default to w1=3 and the visible call
# sites do not pass w1, so this value looks unused by them -- confirm.
w1=5 #for filtering
w2=3 #for windows
#w2=1 #1 second per window
lambda1=3 #threshold

#build ground truth
# Each (lower, upper) pair is one interval labelled 1 by getGT; the second
# group of intervals is shifted by t2.
t2 = 1205
lb1 = [120,360,600,900]
ub1 = [240,480,720,1080]
lb2 = [t2+l for l in [180,540,990,1500]]
ub2 = [t2+u for u in [360,750,1170,1590]]

lower_bounds = lb1+lb2
upper_bounds = ub1+ub2

In [50]:
def getGT(timestamp,lower_bounds,upper_bounds):
    """Return 1 when ``timestamp`` lies in any interval (bounds inclusive), else 0.

    The i-th interval is ``[lower_bounds[i], upper_bounds[i]]``.
    """
    for position, lower in enumerate(lower_bounds):
        within = (timestamp >= lower) & (timestamp <= upper_bounds[position])
        if within:
            return 1
    return 0

def classify_presence(df,ycol="MuStdAmplPaper",gt="Label",plot_roc=False,num_iter=1000):
    """Compute the ROC AUC of a threshold classifier on one feature column.

    A sample is predicted as presence (1) when ``df[ycol] >= thr``. The
    threshold sweeps from the column minimum to its maximum in ``num_iter``
    equal steps, and the area under the resulting ROC curve is returned.

    Args:
        df: frame holding the feature column and the ground-truth column.
        ycol: name of the feature column.
        gt: name of the ground-truth column; the value 1 marks presence.
        plot_roc: when True, show the ROC curve with matplotlib.
        num_iter: number of threshold steps (must be >= 1).

    Returns:
        The AUC as a numpy float. A constant feature cannot rank the samples,
        so it yields the chance level 0.5 (no plot is drawn in that case).

    Raises:
        ValueError: if ``num_iter`` < 1, or if the ground truth does not
            contain both classes (the ROC curve is undefined then).
    """
    if num_iter < 1:
        raise ValueError("num_iter must be a positive integer")

    scores = df[ycol].to_numpy(dtype=float)
    positives = df[gt].to_numpy() == 1
    negatives = ~positives
    n_pos = np.count_nonzero(positives)
    n_neg = np.count_nonzero(negatives)
    if n_pos == 0 or n_neg == 0:
        raise ValueError("ROC is undefined: ground truth must contain both classes")

    lowest = df[ycol].min()
    highest = df[ycol].max()
    step = (highest - lowest) / num_iter
    if not step > 0:
        # Zero (or NaN) step: the sweep below would never terminate.
        return np.float64(0.5)

    tpr = []
    fpr = []
    thr = lowest
    while thr <= highest:
        # One vectorised comparison per threshold instead of a row-wise apply.
        predicted = scores >= thr
        tpr.append(np.count_nonzero(predicted & positives) / n_pos)
        fpr.append(np.count_nonzero(predicted & negatives) / n_neg)
        thr += step

    if plot_roc:
        plt.figure(figsize=(3,3),dpi=220)
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], color = 'green')
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.grid()
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC curve")
        plt.show()

    # Trapezoidal area; fpr decreases as the threshold grows, so the signed
    # area is non-positive and its absolute value is the AUC.
    fpr_arr = np.asarray(fpr)
    tpr_arr = np.asarray(tpr)
    signed_area = np.sum(np.diff(fpr_arr) * (tpr_arr[1:] + tpr_arr[:-1]) / 2.0)
    auc_score = np.abs(signed_area)

    return auc_score

def extractWindowedFeatures(data, column_indexes=None, w2=3):
    """Compute the windowed amplitude feature ``MuStdAmplPaper``.

    Rows are grouped into windows of ``w2`` timestamp units. For every window
    the standard deviation of each selected amplitude column is computed and
    then averaged across those columns.

    Side effect (kept from the original): a ``TimeWindow`` column is written
    into ``data`` itself.

    Args:
        data: frame with a ``Timestamp`` column and the amplitude columns.
        column_indexes: column names to consider; only names starting with
            ``'Ampl'`` are used. ``None`` or empty gives an all-NaN feature.
        w2: window length, in the unit of ``Timestamp``.

    Returns:
        DataFrame with columns ``Time`` (window start, ascending) and
        ``MuStdAmplPaper``.
    """
    data["TimeWindow"] = np.floor(data["Timestamp"] / w2)*w2

    selected = column_indexes if column_indexes is not None else []
    ampl_columns = [j for j in selected if j.startswith('Ampl')]

    grouped = data.groupby(by="TimeWindow")
    if ampl_columns:
        # vertical std per window, then horizontal mean over the amplitude columns
        mu_std = grouped[ampl_columns].std().mean(axis=1)
    else:
        mu_std = pd.Series(np.nan, index=grouped.size().index)

    # Both columns come from the same grouped result, so every value stays
    # attached to its own window even when the input rows are not time-sorted.
    return pd.DataFrame({
        "Time": mu_std.index.to_numpy(),
        "MuStdAmplPaper": mu_std.to_numpy(),
    })

#removes outliers from the data
def filterData(df,w1=3,lambda1=3):
    """Replace amplitude outliers using a trailing window over ``Timestamp``.

    For every row after the first, the mean and standard deviation of each
    'Ampl' column are taken over the rows whose timestamp lies in
    ``(t - w1, t]`` (the current row included). A value more than ``lambda1``
    standard deviations away from that mean is overwritten with the value
    held by ``prev_row``.

    Args:
        df: input frame; it is copied, not modified.
        w1: window length, in the unit of the ``Timestamp`` column.
        lambda1: outlier threshold, in standard deviations.

    Returns:
        The filtered copy of ``df``.
    """
    data = df.copy()
    col_list = [j for j in data.columns if "Ampl" in j]

    for index, row in data.iterrows():
        # NOTE(review): assumes the first row carries the index label 0
        # (default RangeIndex) -- confirm for re-indexed frames.
        if index == 0:
            prev_row = row
            continue
        if (index%10000 == 0): print(index)  # progress trace
        # The window is read from `data`, which this loop is also writing to.
        subDf = data.loc[(data["Timestamp"]<=row['Timestamp']) & (data["Timestamp"]> row['Timestamp'] - w1),col_list]
        means = subDf.mean(axis=0)
        stds = subDf.std(axis=0)

        for c in col_list: 
            if (abs(row[c] - means[c]) / stds[c]) > lambda1:
                data.at[index,c] = prev_row[c]
                #row[c] = prev_row[c]

        # NOTE(review): `row` is whatever iterrows() yielded, so prev_row
        # presumably holds the uncorrected values -- confirm this is intended.
        prev_row = row
    return data

def filterData2(df, w1=3, lambda1=3):
    """Replace amplitude outliers using rolling statistics over the previous rows.

    For every row after the first, each 'Ampl' value is compared with the mean
    and standard deviation of the ``w1`` rows that precede it (statistics are
    computed once, on the unfiltered data). A value more than ``lambda1``
    standard deviations away from that mean is replaced by the value of the
    previous row, which may itself have been corrected already.

    Rows are addressed by position, so the frame may carry any index.

    Args:
        df: input frame; it is copied, not modified.
        w1: rolling window size, in rows.
        lambda1: outlier threshold, in standard deviations.

    Returns:
        The filtered copy of ``df``.
    """
    data = df.copy()
    col_list = [j for j in data.columns if "Ampl" in j]
    if not col_list:
        return data

    # Rolling window statistics, shifted by one row to exclude the current row.
    rolling = data[col_list].rolling(window=w1, min_periods=1)
    ref_means = rolling.mean().shift(1).to_numpy(dtype=float)
    ref_stds = rolling.std().shift(1).to_numpy(dtype=float)
    values = data[col_list].to_numpy(dtype=float, copy=True)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Row 0 has no history; corrections propagate, so rows stay sequential
        # while all columns of a row are handled at once.
        for i in range(1, len(values)):
            if i % 10000 == 0:
                print(i)

            row_std = ref_stds[i]
            # Undefined or zero spread: no meaningful z-score, leave as is.
            usable = ~np.isnan(row_std) & (row_std != 0)
            z_score = np.abs(values[i] - ref_means[i]) / row_std
            outliers = usable & (z_score > lambda1)
            values[i, outliers] = values[i - 1, outliers]

    for pos, c in enumerate(col_list):
        # Restore each column with its original dtype.
        data[c] = pd.Series(values[:, pos], index=data.index).astype(data[c].dtype)

    return data

def complex_real(complex_value):
    """Return the real part of ``complex_value`` after converting it with ``complex()``."""
    parsed = complex(complex_value)
    return parsed.real

def complex_imag(complex_value):
    """Return the imaginary part of ``complex_value`` after converting it with ``complex()``."""
    parsed = complex(complex_value)
    return parsed.imag

def complex_rebuild(real,imag):
    """Combine a real and an imaginary component into one complex number."""
    imaginary_term = 1j*imag
    return real + imaginary_term

#Function to get top N features for each principal component
def get_top_n_features(loadings_df, n):
    """Map every column of ``loadings_df`` to its ``n`` strongest features.

    Strength is the absolute loading; the returned lists hold the index labels
    ordered from the largest absolute value downwards.
    """
    return {
        pc: loadings_df[pc].abs().sort_values(ascending=False).head(n).index.tolist()
        for pc in loadings_df.columns
    }

In [51]:
class Node:
    """Huffman tree node: leaves carry a ``value``, inner nodes carry children."""

    def __init__(self, value=None, frequency=0, left=None, right=None):
        self.value, self.frequency = value, frequency
        self.left, self.right = left, right

    def __lt__(self, other):
        # heapq orders nodes through "<"; the lighter node comes first.
        mine, theirs = self.frequency, other.frequency
        return mine < theirs

def build_tree(data):
    """Build a Huffman tree from a ``{symbol: frequency}`` mapping and return its root."""
    queue = []
    for symbol, count in data.items():
        queue.append(Node(symbol, count))
    heapq.heapify(queue)

    # Keep merging the two lightest nodes until a single root is left.
    while not len(queue) <= 1:
        lighter = heapq.heappop(queue)
        heavier = heapq.heappop(queue)
        parent = Node(
            frequency=lighter.frequency + heavier.frequency,
            left=lighter,
            right=heavier,
        )
        heapq.heappush(queue, parent)

    return queue[0]

def generate_codes(node, code="", huffman_codes=None):
    """Walk a Huffman tree and collect the bit string of every leaf.

    Args:
        node: tree node exposing ``value``, ``left`` and ``right``; a node
            whose ``value`` is not None is treated as a leaf.
        code: bit prefix accumulated on the way down to ``node``.
        huffman_codes: dict to fill; a new one is created when omitted.

    Returns:
        The ``{value: code}`` dict, at every recursion level. A tree made of a
        single leaf gets the code ``"0"`` so that the symbol stays decodable.
    """
    if huffman_codes is None:
        huffman_codes = {}

    if node.value is not None:  # leaf node case
        # An empty prefix means the root itself is a leaf (single symbol).
        huffman_codes[node.value] = code if code else "0"
        return huffman_codes

    generate_codes(node.left, code + "0", huffman_codes)
    generate_codes(node.right, code + "1", huffman_codes)
    return huffman_codes

def encode_huffman(data, huffman_codes):
    """Concatenate the Huffman code of every symbol in ``data`` into one bit string."""
    bit_strings = (huffman_codes[symbol] for symbol in data)
    return "".join(bit_strings)

def decode_huffman(encoded_data, huffman_codes):
    """Decode a bit string produced with ``huffman_codes``.

    Args:
        encoded_data: iterable of ``'0'``/``'1'`` characters.
        huffman_codes: ``{symbol: code}`` mapping used for encoding.

    Returns:
        List of decoded symbols. Trailing bits that do not complete a code are
        dropped, as before.
    """
    # Reverse lookup built once, so every bit costs one dict probe instead of
    # a scan over the whole code table. setdefault keeps the first symbol if
    # two symbols ever share a code, matching the previous scan order.
    symbol_for_code = {}
    for symbol, symbol_code in huffman_codes.items():
        symbol_for_code.setdefault(symbol_code, symbol)

    decoded_data = []
    code = ""
    for bit in encoded_data:
        code += bit
        if code in symbol_for_code:
            decoded_data.append(symbol_for_code[code])
            code = ""

    return decoded_data

def apply_huffman_encode_per_feature(data):
    """Huffman-encode every column of ``data`` with its own code table.

    Args:
        data: DataFrame whose columns are encoded independently.

    Returns:
        ``(encoded_df, huffman_codes)``: a frame of per-cell bit strings with
        the same columns and index as ``data``, and a dict
        ``{column: {symbol: code}}``.
    """
    encoded_columns = {}
    huffman_codes = {}

    for col in data.columns:
        freq_per_data = Counter(data[col])
        root = build_tree(freq_per_data)
        code = generate_codes(root)
        if not code:
            # Single-symbol column: the tree is one leaf and no usable table
            # came back, so give that symbol a one-bit code.
            code = {root.value: "0"}
        encoded_columns[col] = data[col].apply(lambda x: encode_huffman([x], code))
        huffman_codes[col] = code

    # Build the frame in one go rather than inserting column by column.
    return pd.DataFrame(encoded_columns), huffman_codes

def apply_huffman_decode_per_feature(encoded_data, huffman_codes):
    """Decode every column of ``encoded_data`` with its own code table.

    The bit strings of a column are joined into one stream before decoding.
    """
    decoded_df = pd.DataFrame()

    for col in encoded_data.columns:
        bit_stream = ''.join(encoded_data[col])
        symbols = decode_huffman(bit_stream, huffman_codes[col])
        decoded_df[col] = symbols
    return decoded_df

In [52]:
def bits_needed(source, df, num_lvls=-1, verbose=True):
    """Estimate the fixed-length bits needed to store ``source`` per time window.

    Within each window a column needs ``ceil(log2(k))`` bits, where ``k`` is
    ``num_lvls`` when positive, otherwise the number of distinct values the
    column takes in that window.

    Args:
        source: frame with one column per feature; it is not modified.
        df: frame providing the ``TimeWindow`` column (aligned on the index).
        num_lvls: fixed number of levels; any value <= 0 means "count the
            distinct symbols".
        verbose: print the three metrics.

    Returns:
        ``(average bits per column, average bits per window, total bits)``.
        Rows whose window is NaN are ignored.
    """
    data = source.copy()
    data["TimeWindow"] = df["TimeWindow"]

    avg_bits_per_window = []
    sum_bits_per_window = []
    total_bits_needed_dataset = 0

    # groupby visits each window once (in order of first appearance) instead
    # of filtering the whole frame per window; NaN windows are skipped.
    for _, window_rows in data.groupby("TimeWindow", sort=False):
        features = window_rows.drop("TimeWindow", axis=1)
        if num_lvls > 0:
            levels = np.full(features.shape[1], num_lvls)
        else:
            levels = np.array(
                [features[col].nunique(dropna=False) for col in features.columns]
            )
        # Number of bits to represent each symbol, one entry per column.
        bits = np.ceil(np.log2(levels)).astype(int)

        avg_bits_per_window.append(np.mean(bits).round(2))
        sum_bits_per_window.append(bits.sum())
        total_bits_needed_dataset += bits.sum()

    avg_bits = np.mean(avg_bits_per_window).round(2)
    bits_needed_window = np.mean(sum_bits_per_window).round(2)

    if verbose:
        print(f"\nGlobal metrics:")
        print(f"Average bits: {avg_bits:.2f} bits")
        print(f"Average bits per window: {bits_needed_window:.2f} bits")
        print(f"Bits for the whole dataset: {total_bits_needed_dataset:.2f} bits")

    return avg_bits, bits_needed_window, total_bits_needed_dataset

## Data Processing

### CSI Data Creation

In [53]:
class CsiData(tf_keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of sliding windows over CSI data.

    Each sample is a slice of ``window_size`` consecutive rows of ``csi``;
    the samples of a batch start at consecutive row positions.
    """

    def __init__(self, csi, labels, indices, batch_size=7, window_size=14):
        # csi is sliced along its first axis; labels are sliced by position.
        self.csi = csi
        self.labels = labels
        # NOTE(review): `indices` is only used by __len__ (its last dimension
        # sets the number of samples); __getitem__ never reads it -- confirm
        # whether batches were meant to be drawn from these indices.
        self.indices = indices
        self.batch_size = batch_size
        self.window_size = window_size
        self.antennas = 1  # fixed here; enables the expand_dims branch below

    def __len__(self):
        """Number of batches, rounding up to cover a final partial batch."""
        return int(np.ceil(self.indices.shape[-1] / float(self.batch_size)))

    def __getitem__(self, batch_idx):
        """Return ``(data_batch, labels_batch)`` for batch number ``batch_idx``."""
        first_idx = batch_idx * self.batch_size
        last_idx = (batch_idx + 1) * self.batch_size

        #print(f'first_idx: {first_idx}, last_idx: {last_idx}')

        # One window per start position in [first_idx, last_idx).
        # NOTE(review): near the end of `csi` the slices presumably get
        # shorter than window_size, which would make the batch ragged -- confirm.
        data_batch = [self.csi[x:x + self.window_size, ...] for x in range(first_idx, last_idx)]
        # Wrapping the slice in a list and transposing turns a 1-D label
        # slice into a column.
        labels_batch = np.transpose([self.labels[first_idx:last_idx]])

        data_batch = tf.convert_to_tensor(data_batch)
        labels_batch = tf.convert_to_tensor(labels_batch)

        if self.antennas == 1:
            # Add a trailing singleton axis to both tensors.
            data_batch = tf.expand_dims(data_batch, 3)
            labels_batch = tf.expand_dims(labels_batch, 2)

        return data_batch, labels_batch

### PCA Compression

In [54]:
def data_preprocessing(df, method):
    """Select and derive the feature columns for the chosen compression method.

    Side effect: ``df['Timestamp']`` is rounded to 4 decimals in place. All
    other changes are made on a copy.

    Args:
        df: raw frame with bookkeeping columns and the per-index
            ``Ampl``/``Phase``/``CSI`` columns.
        method: a ``Compression_Method`` member.

    Returns:
        The processed copy, without any ``CSI*`` column.
    """
    df['Timestamp'] = round(df['Timestamp'], 4)
    data = df.copy()

    # Bookkeeping columns plus every column of the null indexes.
    unused_columns = ['Frame_num', 'Source_address', 'TimeWindow']
    for prefix in ("Phase", "Ampl", "CSI"):
        unused_columns.extend(f"{prefix}{i}" for i in notInterestedIndexes)
    data.drop(columns=unused_columns, inplace=True)

    if ignorePhases:
        phase_columns = [col for col in data.columns if col.startswith('Phase')]
        data.drop(columns=phase_columns, inplace=True)

    if method == Compression_Method.XY:
        # Split each complex CSI sample into its real (X) and imaginary (Y) part.
        for j in interestedIndexes:
            csi_column = data[f"CSI{j}"]
            data[f'X{j}'] = csi_column.apply(complex_real)
            data[f'Y{j}'] = data[f"CSI{j}"].apply(complex_imag)
        polar_columns = [col for col in data.columns if col.startswith(('Ampl', 'Phase'))]
        data.drop(columns=polar_columns, inplace=True)
    elif method == Compression_Method.AmpPhaseFiltered:
        data = filterData(data)

    csi_columns = [col for col in data.columns if col.startswith('CSI')]
    data.drop(columns=csi_columns, inplace=True)
    #data.set_index('Timestamp', inplace=True)
    print("Number of features:", len(data.columns))

    return data

Check how many principal components are needed to explain 95% of the variance.

In [55]:
def find_n_components(data, target, directory):
    """Find how many principal components explain more than ``target`` % variance.

    The module-level ``scaler`` is fitted on ``data`` as a side effect. The
    cumulative explained variance is plotted, and saved when ``saveCSV`` is set.

    Args:
        data: feature matrix to scale and analyse.
        target: required cumulative explained variance, in percent.
        directory: folder that receives the plot.

    Returns:
        ``(scaled_data, k)``: the standardised data and the number of
        components. When no prefix of components exceeds ``target``, all
        components are used.
    """
    #Fit and transform the data
    scaled_data = scaler.fit_transform(data)

    #Apply PCA
    pca = PCA()
    pca.fit(scaled_data)

    var_cumulative = np.cumsum(pca.explained_variance_ratio_)*100

    #finds the first PC count whose cumulative variance exceeds the target.
    # np.argmax on an all-False mask returns 0, which used to report k=1 when
    # the target was never exceeded (e.g. target=100 with rounding error).
    reached = np.flatnonzero(var_cumulative > target)
    k = int(reached[0]) + 1 if reached.size else len(var_cumulative)
    print(f"Number of components explaining {target}% variance: "+ str(k))

    plt.figure(figsize=(10, 5))
    plt.title('Cumulative Explained Variance explained by the components')
    plt.ylabel('Cumulative Explained variance')
    plt.xlabel('Principal components')
    plt.axvline(x=k, color="r", linestyle="--")
    plt.axhline(y=target, color="r", linestyle="--")
    plt.plot(range(1, pca.n_components_ + 1), var_cumulative, marker='o', linestyle='--')
    plt.grid()
    if (saveCSV): plt.savefig(os.path.join(directory, 'var_cumulative_x_component.png'))
    plt.show()

    return scaled_data, k

Apply PCA, check the explained variance ratio and the cumulative explained variance ratio

In [56]:
def analyze_PCA(scaled_data, n_components, directory):
    """Project ``scaled_data`` onto ``n_components`` principal components.

    Prints the explained variance ratios, then plots their cumulative sum
    (the figure is saved when ``saveCSV`` is set).

    Returns:
        ``(reduced_df, pca)``: the projected data with columns
        ``PC0 .. PC{n-1}`` and the fitted PCA object.
    """
    pca = PCA(n_components=n_components)
    component_names = [f'PC{i}' for i in range(n_components)]
    reduced_df = pd.DataFrame(data=pca.fit_transform(scaled_data), columns=component_names)

    #Explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_
    print("Explained variance ratio:", explained_variance_ratio)

    #Cumulative explained variance
    cumulative_explained_variance = np.cumsum(explained_variance_ratio)
    print("Final Cumulative Explained Variance:", cumulative_explained_variance[-1])

    component_axis = range(1, n_components + 1)
    plt.figure(figsize=(10, 5))
    plt.plot(component_axis, cumulative_explained_variance, marker='o', linestyle='--')
    plt.title('Cumulative Explained Variance by PCA Components')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid()
    if saveCSV:
        plt.savefig(os.path.join(directory, 'zoomed_var_cumulative_x_component.png'))
    plt.show()

    return reduced_df, pca

For each Principal Component, find the top "n" features that contribute most to the variance of that component.

In [57]:
def analyze_PC(data, pca, n_components):
    """Report which features never rank among the top loadings of any component.

    For each principal component the ``n_components`` features with the
    largest absolute loading are collected.

    Returns:
        Set of column names of ``data`` that appear in none of those lists.
    """
    loadings = pca.components_
    pc_names = [f'PC{i+1}' for i in range(loadings.shape[0])]
    loadings_df = pd.DataFrame(data=loadings.T, index=data.columns, columns=pc_names)

    # Union of the top features over all components.
    used_features = set()
    for features in get_top_n_features(loadings_df, n_components).values():
        used_features.update(features)

    print("available features: ", len(data.columns))
    print("features used: ", len(used_features))

    difference = set(data.columns) - used_features
    print("Unused Features:", difference)

    return difference

### Lloyd-Max Quantization 

In [58]:
def lloyd_max_quantization(data, num_levels=16, max_iter=100, delta=1e-6):
    """Quantize ``data`` with a scalar Lloyd-Max quantizer.

    Centroids start uniformly spaced between the data minimum and maximum and
    are refined by alternating between (1) assigning each value to the
    interval delimited by the centroid midpoints and (2) moving each centroid
    to the mean of its interval.

    Args:
        data: array-like of numbers, of any shape.
        num_levels: number of quantization levels.
        max_iter: maximum number of refinement iterations (0 gives the
            uniform quantizer).
        delta: stop once no centroid moves by ``delta`` or more.

    Returns:
        ``(quantized_data, centroids, indices)``. ``indices`` and
        ``quantized_data`` have the shape of ``data`` and are always derived
        from the returned ``centroids``.
    """
    data = np.asarray(data)
    centroids = np.linspace(np.min(data), np.max(data), num_levels)  # uniformly spaced

    for _ in range(max_iter):
        thresholds = (centroids[:-1] + centroids[1:]) / 2  # interval boundaries
        indices = np.digitize(data, thresholds)  # assign each data point to a level

        # An empty level keeps its previous centroid.
        new_centroids = centroids.copy()
        for level in range(num_levels):
            members = data[indices == level]
            if members.size:
                new_centroids[level] = members.mean()

        # Stop if changes between iterations are small.
        if np.max(np.abs(new_centroids - centroids)) < delta:
            break

        centroids = new_centroids

    # Re-assign with the final centroids: the indices computed inside the loop
    # belong to the previous centroids and can disagree with the returned
    # ones. This also defines the result when the loop never runs.
    thresholds = (centroids[:-1] + centroids[1:]) / 2
    indices = np.digitize(data, thresholds).reshape(data.shape)
    quantized_data = centroids[indices]

    return quantized_data, centroids, indices

def dequantize_lloyd_max(quantized_data, clusters, thresholds):
    """Map values back to cluster representatives.

    Each value is binned against ``thresholds`` (right edge inclusive) and
    replaced by the entry of ``clusters`` for that bin.
    """
    bin_ids = np.digitize(quantized_data, thresholds, right=True)
    restored = clusters[bin_ids]
    return restored

def apply_quantization(reduced_df, lvls):
    """Quantize all values of ``reduced_df`` to ``lvls`` Lloyd-Max levels.

    Returns:
        ``(df_quantized, centroids, indices)``; ``df_quantized`` keeps the
        column names of ``reduced_df``.
    """
    result = lloyd_max_quantization(reduced_df.values, num_levels=lvls)
    quantized_values, centroids, indices = result
    df_quantized = pd.DataFrame(quantized_values, columns=reduced_df.columns)
    return df_quantized, centroids, indices

def apply_existing_quantization(data, centroids):
    """Quantize ``data`` with previously computed ``centroids``.

    Interval boundaries are the midpoints between neighbouring centroids.

    Returns:
        ``(df_quantized, indices)``: the quantized frame (same column names)
        and the level index of every cell, shaped like ``data``.
    """
    midpoints = (centroids[1:] + centroids[:-1]) / 2
    bin_ids = np.digitize(data, midpoints)
    quantized_values = centroids[bin_ids]
    bin_ids = bin_ids.reshape(data.shape)  # keep the layout of the input frame

    return pd.DataFrame(quantized_values, columns=data.columns), bin_ids

### Entropy Coding (Huffman)

In [59]:
def compute_entropy(data, verbose=True):
    """Sum the Shannon entropy (in bits) of every column of ``data``.

    Args:
        data: DataFrame; each column is treated as a sequence of symbols.
        verbose: print the entropy of each column.

    Returns:
        Total entropy rounded to two decimals, as a numpy float (0.0 for a
        frame without columns).
    """
    # A numpy scalar keeps `.round` available even when there is no column
    # to add; a plain int has no such method.
    entropy = np.float64(0.0)
    for col in data.columns:
        freq_per_data = Counter(data[col])  # frequency of each unique value
        total_count = sum(freq_per_data.values())
        col_entropy = 0
        for count in freq_per_data.values():
            p_i = count / total_count  # probability of each unique value
            col_entropy += -p_i * np.log2(p_i)  # entropy formula
        if verbose: print(f"Entropy of column {col}: {col_entropy} bits")
        entropy += col_entropy
    return entropy.round(2)

In [60]:
def apply_encoding(df_quantized):
    """Huffman-encode the quantized frame; returns ``(encoded_df, huffman_codes)``."""
    result = apply_huffman_encode_per_feature(df_quantized)
    encoded_df, huffman_codes = result
    return encoded_df, huffman_codes

In [61]:
def apply_decoding(encoded_df, huffman_codes):
    """Huffman-decode the encoded feature columns of ``encoded_df``.

    Only the columns from position 2 up to (not including) the last one are
    decoded; the first two and the last column are left out.
    """
    encoded_features = encoded_df.iloc[:, 2:-1]
    return apply_huffman_decode_per_feature(encoded_features, huffman_codes)

### Reconstruction

Reconstruct the dataset (without the CSI components) and save it as a CSV file.

In [62]:
def reconstruct_data(decoded_df, encoded_df, pca, scaler, data):
    """Invert PCA and scaling to rebuild the original feature columns.

    Args:
        decoded_df: decoded principal-component values.
        encoded_df: frame whose first, second and last columns are copied in
            front of the reconstructed features.
        pca: fitted PCA object providing ``inverse_transform``.
        scaler: fitted scaler providing ``inverse_transform``.
        data: frame whose column names label the reconstructed features.

    Returns:
        The reconstructed frame. With ``method == Compression_Method.XY`` the
        ``X``/``Y`` columns are replaced by ``CSI``, ``Ampl`` and ``Phase``
        columns for every index in ``interestedIndexes``.
    """
    df_scaled_reconstructed = pca.inverse_transform(decoded_df.values)
    df_reconstructed = scaler.inverse_transform(df_scaled_reconstructed)

    df_reconstructed = pd.DataFrame(df_reconstructed, columns=data.columns)
    df_reconstructed = pd.concat([encoded_df.iloc[:, [0, 1, -1]], df_reconstructed], axis=1)

    if method == Compression_Method.XY:
        for j in interestedIndexes:
            # Rebuild the complex sample column-wise rather than with one
            # Python call per row.
            real_part = df_reconstructed[f'X{j}'].to_numpy()
            imag_part = df_reconstructed[f'Y{j}'].to_numpy()
            csi = real_part + 1j * imag_part
            df_reconstructed[f'CSI{j}'] = csi

            #compute back ampl and phases
            df_reconstructed[f'Ampl{j}'] = np.abs(csi)
            df_reconstructed[f'Phase{j}'] = np.angle(csi)

        df_reconstructed.drop(columns=[f'X{j}' for j in interestedIndexes], inplace=True)
        df_reconstructed.drop(columns=[f'Y{j}' for j in interestedIndexes], inplace=True)

    return df_reconstructed


In [63]:
def plot_ampl_comparison(data, reconstruct_data, column, directory): #USELESS
    """Plot original vs. reconstructed amplitude for one pair of indexes.

    The columns ``Ampl{column}`` and ``Ampl-{column}`` are compared over the
    timestamps 0..1200. (The original author tagged this helper as unused.)

    Args:
        data: original frame with ``Timestamp`` and amplitude columns.
        reconstruct_data: reconstructed frame with the same columns.
        column: index used to build the two column names.
        directory: folder that receives the figure when ``saveCSV`` is set.
    """
    data = data[(data['Timestamp'] >= 0) & (data['Timestamp'] <= 1200)]
    # Each frame is masked with its own timestamps; the upper bound used to
    # be taken from `data`, which misaligns the boolean mask.
    reconstruct_data = reconstruct_data[
        (reconstruct_data['Timestamp'] >= 0) & (reconstruct_data['Timestamp'] <= 1200)
    ]

    columns = [f'Ampl{column}', f'Ampl-{column}']

    # Validate before opening a figure so a missing column leaves nothing open.
    for name in columns:
        if (name not in data.columns) or (name not in reconstruct_data.columns):
            print(f"Column {name} not found in the data")
            return

    plt.figure(figsize=(20, 6))

    # (original colour, reconstructed colour) for each of the two columns.
    palette = (("red", "blue"), ("green", "orange"))
    for name, (original_color, reconstructed_color) in zip(columns, palette):
        # Plot the original data
        plt.plot(data['Timestamp'], data[name], label=f'Original {name}', color=original_color)

        # Plot the reconstructed data
        plt.plot(reconstruct_data['Timestamp'], reconstruct_data[name], label=f'Reconstructed {name}', color=reconstructed_color)

    plt.ylim(0, 3000)

    # Add plot details
    plt.title('Amplitude Comparison')
    plt.xlabel('Timestamp')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.grid()
    if (saveCSV): plt.savefig(os.path.join(directory, f'Ampl{column}_Ampl-{column}_comparison.png'))
    plt.show()


In [64]:
def plot_MuStdAmplPaper(data, reconstructed_data, directory, level):
    """Plot the original and reconstructed ``MuStdAmplPaper`` feature over time.

    The ground truth is overlaid as a dashed line that sits at the minimum of
    the original feature where the label is 0 and at its maximum otherwise.

    Args:
        data: original windowed features (``Time``, ``MuStdAmplPaper``).
        reconstructed_data: reconstructed windowed features, also providing
            the ``Label`` column.
        directory: parent folder; the figure goes to its
            ``MuStdAmplPaper_Comparison`` sub-folder when ``saveCSV`` is set.
        level: value used as the file name of the saved figure.
    """
    print("Plotting MuStdAmplPaper")
    sub_directory = os.path.join(directory, 'MuStdAmplPaper_Comparison')
    os.makedirs(sub_directory, exist_ok=True)
    plt.figure(figsize=(20, 6))

    plt.plot(data['Time'], data['MuStdAmplPaper'], label='Original MuStdAmplPaper', color="blue" )
    plt.plot(reconstructed_data['Time'], reconstructed_data['MuStdAmplPaper'], label='Reconstructed MuStdAmplPaper', color="green")

    # Computed once; they used to be re-evaluated for every label.
    lowest = min(data['MuStdAmplPaper'])
    highest = max(data['MuStdAmplPaper'])
    gt = [lowest if l == 0 else highest for l in reconstructed_data["Label"]]
    # NOTE(review): labels come from reconstructed_data but are drawn against
    # data['Time']; presumably both frames share the same windows -- confirm.
    plt.plot(data['Time'],gt,label="Ground-truth",color="r",ls="--", linewidth=0.5) # per window GT

    # Add plot details
    plt.title('Amplitude Comparison')
    plt.xlabel('Time')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.grid()
    if (saveCSV): plt.savefig(os.path.join(sub_directory, f'{level}.png'))
    plt.show()

## Classification (partially from another notebook):

In [65]:
# Loaded once at cell execution time. load_comparison() below reads the same
# file again into a local of the same name instead of using this global.
filteredFeaturesPresence = pd.read_csv("datasets/filteredFeaturesPresence3s.csv")
def load_comparison():
    """Compute the AUC on the stored presence features.

    Reads ``datasets/filteredFeaturesPresence3s.csv``, labels every row from
    its ``Time`` using the ground-truth intervals and classifies the result.

    Returns:
        The AUC returned by ``classify_presence``.
    """
    reference = pd.read_csv("datasets/filteredFeaturesPresence3s.csv")

    def label_for(time_value):
        #apply labeling based on GT
        return getGT(time_value, lower_bounds, upper_bounds)

    reference["Label"] = reference["Time"].apply(label_for)
    return classify_presence(reference, plot_roc=False)

In [66]:
def apply_filtering(df_reconstructed):
    """Remove outliers from the reconstructed data and drop its CSI columns.

    With ``Compression_Method.AmpPhaseFiltered`` the frame is used as it is
    (so the column drop below happens on the caller's frame); otherwise
    ``filterData2`` produces a filtered copy first.
    """
    if method != Compression_Method.AmpPhaseFiltered:
        filtered = filterData2(df_reconstructed)  # removes outliers
    else:
        filtered = df_reconstructed

    csi_columns = [name for name in filtered.columns if name.startswith('CSI')]
    filtered.drop(columns=csi_columns, inplace=True)

    return filtered

In [67]:
def apply_classification(reconstructed_filtered):
    """Extract windowed features, label them and compute the presence AUC.

    Returns:
        ``(reconstructed_featured, auc_value)``: the labelled per-window
        features and the AUC, which is also printed.
    """
    #compute features
    reconstructed_featured = extractWindowedFeatures(
        reconstructed_filtered,
        column_indexes=reconstructed_filtered.columns,
        w2=w2,
    )

    def label_for(time_value):
        #assign the ground-truth to a label
        return getGT(time_value, lower_bounds, upper_bounds)

    reconstructed_featured["Label"] = reconstructed_featured["Time"].apply(label_for)

    #classify
    auc_value = classify_presence(reconstructed_featured, plot_roc=False)
    print(auc_value)

    return reconstructed_featured, auc_value

## MLP

In [68]:
# Display names of the five classes; results_test uses them as the tick
# labels of the confusion matrix.
semantic_classes = ["Walk", "Run", "Jump", "Sit", "Empty"]
# Overrides the 'results' value assigned near the top of the file.
base_directory = './results/VAE'
os.makedirs(base_directory, exist_ok=True)

In [69]:
def load_experiment(directory, scaler=None):
    """Load a pickled ``(data, labels)`` pair and prepare it for the classifier.

    NOTE: ``pickle.load`` can execute arbitrary code; only open files from a
    trusted source.

    Args:
        directory: path of the pickle file (despite the parameter name).
        scaler: fitted scaler to reuse; when None a ``StandardScaler`` is
            fitted on the loaded features.

    Returns:
        ``(X, y, y_dummy, scaler, fcolumns)``: scaled features, integer
        labels, one-hot labels, the scaler used and the feature names.
    """
    with open(directory, 'rb') as f:
        data, labels = pickle.load(f)

    fcolumns = ['mu1','mu2','sigma1','sigma2']
    labels = np.asarray(labels, dtype=np.int32)

    df = pd.DataFrame(data, columns=fcolumns)
    df['signal'] = labels

    # Fit a scaler on this data only when none was handed in.
    if scaler is None:
        scaler = StandardScaler().fit(df[fcolumns])
    df[fcolumns] = scaler.transform(df[fcolumns])

    X, y = df[fcolumns], df['signal']

    # one-hot-encoding
    y_dummy = keras.utils.to_categorical(y)

    return X, y, y_dummy, scaler, fcolumns

## EDL

In [70]:
# Module-level EDL settings. run_edl_experiment assigns its own local
# num_epochs_annealing / num_classes, so these are only defaults.
num_epochs_annealing = 1
num_classes = 5

# Global counter; the GetEpochs callback adds 1 to it at the end of every epoch.
ep = 1.0
class GetEpochs(tf.keras.callbacks.Callback):
    """Callback that counts finished epochs in the module-level ``ep`` variable."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused; None replaces the former mutable `{}` default.
        global ep
        ep += 1

def res_to_mean(ev, dim = 5):
    """Return the largest component of the Dirichlet mean for evidence ``ev``.

    The Dirichlet parameters are ``ev`` reshaped to ``dim`` entries, plus 1.
    """
    alpha = ev.reshape(dim,) + 1
    expected = dirichlet.mean(alpha)
    return np.max(expected)

def res_to_dirichlet(ev):
    """Return the Dirichlet mean and the ratio ``2 / sum(alpha)`` for two-class evidence.

    The Dirichlet parameters ``alpha`` are ``ev`` reshaped to 2 entries, plus 1.
    """
    alpha = ev.reshape(2,) + 1
    num_outcomes = 2.0
    strength = np.sum(alpha)
    return dirichlet.mean(alpha), num_outcomes / strength

def edl_accuracy(yTrue, yPred):
    """Fraction of samples whose arg-max prediction equals the arg-max target."""
    predicted_class = K.argmax(yPred, axis=1)
    true_class = K.argmax(yTrue, axis=1)
    hits = K.cast(K.equal(predicted_class, true_class), "float32")
    return K.mean(K.reshape(hits, (-1, 1)))

def load_edl_experiment(name):
    """Load the Keras model stored at ``name`` and return it.

    The loaded model used to be discarded, so callers always received None.
    """
    return keras.models.load_model(name)

def plot_res_beta(ev):
    """Plot the Beta density given by two-class evidence and shade its 95% interval.

    The Beta parameters are ``ev`` reshaped to 2 entries plus 1, passed in
    the order ``(alpha[1], alpha[0])``.
    """
    # `beta` is not among the module-level imports visible in this file, so
    # it is imported here to keep the function self-contained.
    from scipy.stats import beta

    alpha = ev.reshape(2,)+1
    plt.figure(figsize=(16,9))
    x = np.linspace(0,1,1000)
    density = beta.pdf(x, alpha[1], alpha[0])  # evaluated once, reused below
    plt.plot(x, density)
    x1, x2 = beta.interval(0.95, alpha[1], alpha[0])
    # NOTE(review): `rect` is not defined in the visible part of the file;
    # presumably it is an indicator of [x1, x2] defined elsewhere -- confirm.
    areaplot = np.multiply(density, rect(x,x1, x2))
    plt.fill_between(x, 0, areaplot, alpha=0.5)

def results_test(train_dir, test_dir, num_components=0, num_levels=0, default=False):
    """Evaluate a stored EDL model on a test experiment.

    The scaler is fitted on the training experiment and reused for the test
    experiment. A classification summary is printed and the confusion matrix
    is saved below ``base_directory``.

    Args:
        train_dir: pickle file of the training experiment.
        test_dir: pickle file of the test experiment.
        num_components: number of PCA components; selects model and output paths.
        num_levels: number of quantization levels; selects model and output paths.
        default: load the ``0components_0lvls`` model instead.

    Returns:
        Test accuracy rounded to 5 decimals.
    """
    # Only the scaler of the training experiment is needed here.
    _, _, _, scaler, _ = load_experiment(train_dir)
    X_test, y_test, _, _, _ = load_experiment(test_dir, scaler)
    if default:
        model_directory = os.path.join(base_directory, f'0_components/models/0components_0lvls_Keras_Model.keras')
    else:
        model_directory = os.path.join(base_directory, f'{num_components}_components/models/{num_components}components_{num_levels}lvls_Keras_Model.keras')

    mlp_edl = keras.models.load_model(model_directory, compile=False)
    # Run inference once and derive both scores and class predictions from it.
    evidence = mlp_edl.predict(X_test)
    mlp_edl_scores = np.array([res_to_mean(r, dim=5) for r in evidence])
    y_predictions_edl = np.array(tf.argmax(evidence, axis=1))

    print(summary_clf(y_test, y_predictions_edl, mlp_edl_scores))
    accuracy = accuracy_score(y_test, y_predictions_edl)

    cm = confusion_matrix(y_test, y_predictions_edl)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=semantic_classes)
    cmdisp = disp.plot(cmap="cividis")
    CM_directory = os.path.join(base_directory, f'{num_components}_components/CMs/{num_components}components_{num_levels}lvls_ConfusionMatrix.png')
    os.makedirs(os.path.dirname(CM_directory), exist_ok=True)
    cmdisp.figure_.savefig(CM_directory, bbox_inches='tight')

    return round(accuracy, 5)

In [71]:
def run_edl_experiment(name, _X_train, _y_train_dummy, num_components=0, num_levels=0, X_val=None, y_val_dummy=None):
    """Build, train and save an evidential (EDL) MLP classifier.

    The architecture and hyper-parameters are chosen by ``name``:
    ``"Delayed-Fusing"`` (16 inputs), ``"Early-Fusing"`` (4 inputs),
    ``"Early-Fusing3"`` (6 inputs); any other value selects the default
    4-input network with dropout. Every network ends in a softplus layer with
    one non-negative output per class.

    Args:
        name: Experiment name selecting the configuration (see above).
        _X_train: Training features passed to ``fit``.
        _y_train_dummy: Training targets passed to ``fit``; the loss indexes
            them along axis 1 (presumably one-hot rows -- TODO confirm).
        num_components: Only used to build the path of the saved model.
        num_levels: Only used to build the path of the saved model.
        X_val: Optional validation features.
        y_val_dummy: Optional validation targets. Validation is run only when
            both ``X_val`` and ``y_val_dummy`` are given; previously these two
            parameters were accepted but silently ignored.

    Returns:
        None. The trained model is saved under ``base_directory``.
    """
    model_edl = None
    num_classes = 5

    if name == "Delayed-Fusing":
        num_epochs_annealing = 3
        batch_size = 128
        lr = 0.01
        epochs = 50
        model_edl = tf.keras.models.Sequential()
        model_edl.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(16,)))
        model_edl.add(tf.keras.layers.Dense(8, activation='relu'))
        model_edl.add(tf.keras.layers.Dense(units=num_classes, activation='softplus'))

    elif name == "Early-Fusing":
        num_epochs_annealing = 22
        batch_size = 128
        lr = 0.001
        epochs = 50
        model_edl = tf.keras.models.Sequential()
        model_edl.add(tf.keras.layers.Dense(8, activation='relu', input_shape=(4,)))
        model_edl.add(tf.keras.layers.Dense(8, activation='relu'))
        model_edl.add(tf.keras.layers.Dense(units=num_classes, activation='softplus'))

    elif name == "Early-Fusing3":
        num_epochs_annealing = 22
        batch_size = 128
        lr = 0.01
        epochs = 50
        model_edl = tf.keras.models.Sequential()
        model_edl.add(tf.keras.layers.Dense(8, activation='relu', input_shape=(6,)))
        model_edl.add(tf.keras.layers.Dense(8, activation='relu'))
        model_edl.add(tf.keras.layers.Dense(units=num_classes, activation='softplus'))

    else:
        num_epochs_annealing = 22
        batch_size = 64
        lr = 0.001
        epochs = 100
        model_edl = tf.keras.models.Sequential()
        model_edl.add(tf.keras.layers.Input(shape=(4,)))
        model_edl.add(tf.keras.layers.Dense(32, activation='relu'))
        model_edl.add(tf.keras.layers.Dropout(0.4))
        model_edl.add(tf.keras.layers.Dense(64, activation='relu'))
        model_edl.add(tf.keras.layers.Dropout(0.4))
        model_edl.add(tf.keras.layers.Dense(32, activation='relu'))
        model_edl.add(tf.keras.layers.Dense(num_classes, activation='softplus'))
        # Alternative (disabled) configuration kept for reference:
        #   num_epochs_annealing = 22, batch_size = 128, lr = 0.01, epochs = 100
        #   Dense(4, relu, GlorotUniform(seed=random_state), input_shape=(4,))
        #   Dense(8, relu, GlorotUniform(seed=random_state))
        #   Dense(5, softplus)

    def KL(alpha):
        """KL-style regulariser between Dirichlet(alpha) and the all-ones Dirichlet."""
        beta = K.constant(np.ones((1, num_classes)), dtype="float32")
        S_alpha = K.sum(alpha, axis=1, keepdims=True)
        S_beta = K.sum(beta, axis=1, keepdims=True)
        lnB = tf.math.lgamma(S_alpha) - K.sum(tf.math.lgamma(alpha), axis=1, keepdims=True)
        lnB_uni = K.sum(tf.math.lgamma(beta), axis=1, keepdims=True) - tf.math.lgamma(S_beta)

        dg0 = tf.math.digamma(S_alpha)
        dg1 = tf.math.digamma(alpha)

        return K.sum((alpha - beta) * (dg1 - dg0), axis=1, keepdims=True) + lnB + lnB_uni

    def mse_loss(yTrue, yPred):
        """Expected squared error plus an annealed KL regulariser."""
        alpha = yPred + 1
        S = K.sum(alpha, axis=1, keepdims=True)
        m = alpha / S

        # A + B minimises the sum of squared loss, see discussion in EDL paper for the derivation
        A = K.sum((yTrue - m) ** 2, axis=1, keepdims=True)
        B = K.sum(alpha * (S - alpha) / (S * S * (S + 1)), axis=1, keepdims=True)

        # Annealing coefficient: min(1, ep / num_epochs_annealing).
        # NOTE(review): `ep` is not defined in this function; it is looked up in
        # the enclosing/module scope when the loss runs. Nothing here updates it
        # per epoch -- confirm that it is maintained elsewhere.
        ll = min(1.0, float(ep / float(num_epochs_annealing)))

        # Evidence of the true class is removed before the KL term.
        alp = yPred * (1 - yTrue) + 1
        C = ll * KL(alp)

        return A + B + C

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model_edl.compile(loss=mse_loss, optimizer=optimizer, metrics=[edl_accuracy])

    fit_kwargs = {
        'batch_size': batch_size,
        'epochs': epochs,
        'verbose': 1,
        'shuffle': False,
    }
    # Honour the validation arguments when the caller supplies both of them.
    if X_val is not None and y_val_dummy is not None:
        fit_kwargs['validation_data'] = (X_val, y_val_dummy)

    model_edl.fit(_X_train, _y_train_dummy, **fit_kwargs)

    model_directory = os.path.join(base_directory, f'{num_components}_components/models/{num_components}components_{num_levels}lvls_Keras_Model.keras')
    os.makedirs(os.path.dirname(model_directory), exist_ok=True)
    model_edl.save(model_directory)


# Runs

#### Iterative Runs

In [None]:
# Sweep over the number of PCA components (1..19) and the number of
# quantization levels. For every combination the data goes through
# PCA -> quantization -> encode/decode -> reconstruction -> filtering ->
# classification, and the resulting AUC is collected in `results`.
# All helper functions used here are defined elsewhere in the notebook.
components = [i for i in range(1, 20)]
num_levels = [2, 4, 8, 16, 32, 64, 128]

results = []
# Reference value stored next to every result under 'Original AUC'.
orig_auc = load_comparison()

for n_components in components:
    directory = os.path.join(base_directory, f'{n_components}_components')
    os.makedirs(directory, exist_ok=True)
    
    # Read and preprocess data
    df = pd.read_csv('datasets/presence.csv')
    data = data_preprocessing(df, method)
    
    # PCA
    # The module-level scaler is re-fitted on every outer iteration.
    scaled_data = scaler.fit_transform(data)
    reduced_df, pca = analyze_PCA(scaled_data, n_components, directory)
    unused_features = analyze_PC(data, pca, n_components)

    for level in num_levels:
        sub_directory = os.path.join(directory, f'lvls_{level}')
        os.makedirs(sub_directory, exist_ok=True)

        # Lloyd-Max Quantization
        quantized_df = apply_quantization(reduced_df, level)

        # Encode-Decode
        encoded_df, huffman_codes = apply_encoding(quantized_df)
        # Re-attach the frame/time columns of the original data around the encoded columns.
        encoded_df = pd.concat([df[['Frame_num', 'Timestamp']], encoded_df, df['TimeWindow']], axis=1)
        decoded_df = apply_decoding(encoded_df, huffman_codes)
        print("Original DataFrame equals Decoded DataFrame:", quantized_df.equals(decoded_df)) #Correctness check

        # Reconstruction
        reconstructed_df = reconstruct_data(decoded_df, encoded_df, pca, scaler, data)
        print("original_auc:",orig_auc)

        # Drop the columns reported by analyze_PC before filtering.
        reconstructed_df.drop(columns=unused_features, inplace=True)

        # Filtering and Classification
        reconstructed_filtered = apply_filtering(reconstructed_df)
        print(f"----------Results with {n_components} components, num_levels {level} ----------")
        reconstructed_featured_labeled, auc_value = apply_classification(reconstructed_filtered)

        results.append({
            'n_components': n_components,
            'num_levels': level,
            'AUC': auc_value,
            'Original AUC': orig_auc
        })

        # filteredFeaturesPresence is not created in this cell; it must already exist.
        plot_MuStdAmplPaper(filteredFeaturesPresence, reconstructed_featured_labeled, sub_directory, level)
        
        # Persist the intermediate tables of this (components, levels) pair.
        if (saveCSV):
            encoded_df.to_csv(os.path.join(sub_directory, 'encodedQuantizedPCAPresence.csv'), index=False)
            reconstructed_df.to_csv(os.path.join(sub_directory, 'presence_reconstructed.csv'), index=False)
            reconstructed_filtered.to_csv(os.path.join(sub_directory, 'filteredPresence_reconstructed.csv'), index=False)
            reconstructed_featured_labeled.to_csv(os.path.join(sub_directory, 'filteredFeaturesLabeledPresence_reconstructed.csv'), index=False)

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join(base_directory, 'classification_results.csv')
results_df.to_csv(results_file, index=False)

#### AUC/BIT comparison

In [None]:
# AUC / bit-budget comparison over the same (components, levels) grid as the
# iterative runs. This cell re-reads 'presence_reconstructed.csv' and
# 'filteredPresence_reconstructed.csv' from each lvls_* sub-directory, so those
# files must already exist on disk (the iterative-runs cell writes them when
# saveCSV is true). Helper functions are defined elsewhere in the notebook.
components = [i for i in range(1, 20)]
num_levels = [2, 4, 8, 16, 32, 64, 128]

results = []
orig_auc = load_comparison()

for n_components in components:
    directory = os.path.join(base_directory, f'{n_components}_components')
    os.makedirs(directory, exist_ok=True)

    # Read and preprocess data
    df = pd.read_csv('datasets/presence.csv')
    data = data_preprocessing(df, method)

    # PCA
    scaled_data = scaler.fit_transform(data)
    reduced_df, pca = analyze_PCA(scaled_data, n_components, directory)

    entropyPCA = compute_entropy(reduced_df, verbose=False)
    print(f"Entropy of the PCA data: {entropyPCA:.2f} bits")
    PCA_bits, PCA_win_bits, total_PCA_bits = bits_needed(reduced_df, df, verbose=False)

    for level in num_levels:
        sub_directory = os.path.join(directory, f'lvls_{level}')
        os.makedirs(sub_directory, exist_ok=True)

        # Lloyd-Max Quantization
        quantized_df = apply_quantization(reduced_df, level)

        # Encode-Decode
        encoded_df, huffman_codes = apply_encoding(quantized_df)

        entropyENC = compute_entropy(encoded_df, verbose=False)
        print(f"Entropy of the encoded data: {entropyENC:.2f} bits")
        ENC_bits, ENC_win_bits, total_ENC_bits = bits_needed(encoded_df, df, level, verbose=False)

        # Reconstruction (loaded from the CSV saved by a previous run)
        reconstructed_df = pd.read_csv(os.path.join(sub_directory, 'presence_reconstructed.csv'))

        entropyREC = compute_entropy(reconstructed_df, verbose=False)
        print(f"Entropy of the reconstructed data: {entropyREC:.2f} bits")
        # The first three columns are skipped for the bit count.
        REC_bits, REC_win_bits, total_REC_bits = bits_needed(reconstructed_df.iloc[:, 3:], df, verbose=False)

        print(f"----------Results with {n_components} components, num_levels {level} ----------")
        reconstructed_filtered = pd.read_csv(os.path.join(sub_directory, 'filteredPresence_reconstructed.csv'))
        reconstructed_featured, auc_value = apply_classification(reconstructed_filtered)

        results.append({
            'n_components': n_components,
            'num_levels': level,
            'AUC': auc_value,
            'Original AUC': orig_auc,
            'PCA_bits': PCA_bits,
            'ENC_bits': ENC_bits,
            'REC_bits': REC_bits,
            'PCA_win_bits': PCA_win_bits,
            'ENC_win_bits': ENC_win_bits,
            'REC_win_bits': REC_win_bits,
            'total_PCA_bits': total_PCA_bits,
            'total_ENC_bits': total_ENC_bits,
            'total_REC_bits': total_REC_bits,
            'entropyPCA': entropyPCA,
            'entropyENC': entropyENC,
            'entropyREC': entropyREC
        })


# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join('./results_csv', 'new_auc_bit_comparison.csv')
# Create the output directory first: to_csv does not create missing folders,
# and nothing in this cell guarantees that ./results_csv exists.
os.makedirs(os.path.dirname(results_file), exist_ok=True)
results_df.to_csv(results_file, index=False)

In [None]:
# Plot AUC against the 'ENC_win_bits' column, one dashed curve per selected
# number of PCA components.
auc_bit = pd.read_csv(os.path.join('./results_csv', 'new_auc_bit_comparison.csv'))
#components = [1,2,3,4,5,6,8,10,12,14,16,18]
components = [1,2,3,4,5,8,10,12,14,18]

plt.figure(figsize=(20, 10))
for n_components in auc_bit['n_components'].unique():
    # Only the component counts listed above are drawn.
    if n_components not in components:
        continue
    target_data = auc_bit[auc_bit['n_components'] == n_components]
    plt.plot(target_data['ENC_win_bits'], target_data['AUC'], marker='o', linestyle='--', label=f'{n_components} components')
#plt.ylim(0.7)
plt.title('AUC and Bits Comparison')
plt.xlabel('Avg bits per window')
plt.ylabel('AUC')
plt.legend()
plt.grid()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs('./results_graphs', exist_ok=True)
plt.savefig(os.path.join('./results_graphs', 'auc_bit_comparison[BxW][filtered].png'))
plt.show()

In [None]:
# Plot AUC against the 'ENC_bits' column, one dashed curve per selected
# number of PCA components.
auc_bit = pd.read_csv(os.path.join('./results_csv', 'new_auc_bit_comparison.csv'))
#components = [i for i in range(1, 20)]
#components = [1,2,3,4,5,6,8,10,12,14,16,18]
components = [1,2,3,4,5,8,10,12,14]

plt.figure(figsize=(20, 10))
for n_components in auc_bit['n_components'].unique():
    # Only the component counts listed above are drawn.
    if n_components not in components:
        continue
    target_data = auc_bit[auc_bit['n_components'] == n_components]
    plt.plot(target_data['ENC_bits'], target_data['AUC'], marker='o', linestyle='--', label=f'{n_components} components')
#plt.ylim(0.7)
plt.title('AUC with quantized data and PCA applied')
plt.xlabel('bits per symbol')
#plt.xticks(np.arange(0, 110, 10))
plt.ylabel('AUC')
plt.legend(loc='lower right')
plt.grid()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs('./results_graphs', exist_ok=True)
plt.savefig(os.path.join('./results_graphs', 'auc_bit_comparison[BxS][filtered].png'))
plt.show()


### PCA ONLY

In [None]:
# PCA-only baseline: for each number of components the data is projected with
# PCA and mapped straight back (no quantization or encoding step), then
# filtered and classified. Helper functions are defined elsewhere in the notebook.
components = [i for i in range(1, 20)]

results = []
# Reference value stored next to every result under 'Original AUC'.
orig_auc = load_comparison()

for n_components in components:
    directory = os.path.join('./results/PCA_ONLY', f'{n_components}_components')
    os.makedirs(directory, exist_ok=True)
    
    # Read and preprocess data
    df = pd.read_csv('datasets/presence.csv')
    data = data_preprocessing(df, method)
    
    # PCA
    scaled_data = scaler.fit_transform(data)
    reduced_df, pca = analyze_PCA(scaled_data, n_components, directory)
    unused_features = analyze_PC(data, pca, n_components)

    # Undo the PCA projection, then undo the scaling, and restore the column names of `data`.
    reconstructed_scaled_df = pca.inverse_transform(reduced_df.values)
    reconstructed_df = scaler.inverse_transform(reconstructed_scaled_df)
    reconstructed_df = pd.DataFrame(reconstructed_df, columns=data.columns)
    # Re-attach the frame/time columns of the original data around the reconstructed columns.
    reconstructed_df = pd.concat([df[['Frame_num', 'Timestamp']], reconstructed_df, df['TimeWindow']], axis=1)
    reconstructed_df.drop(columns=unused_features, inplace=True)

    # Filtering and Classification
    reconstructed_filtered = apply_filtering(reconstructed_df)
    print(f"--------------- Results with {n_components} ---------------")
    reconstructed_featured, auc_value = apply_classification(reconstructed_filtered)

    results.append({
        'n_components': n_components,
        'AUC': auc_value,
        'Original AUC': orig_auc
    })
    
    # Persist the tables of this run; a later cell reads them back from disk.
    if (saveCSV):
        reconstructed_df.to_csv(os.path.join(directory, 'presence_reconstructed.csv'), index=False)
        reconstructed_filtered.to_csv(os.path.join(directory, 'filteredPresence_reconstructed.csv'), index=False)
        reconstructed_featured.to_csv(os.path.join(directory, 'filteredFeaturesLabeledPresence_reconstructed.csv'), index=False)

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join('./results_csv/PCA_ONLY', 'classification_results.csv')
os.makedirs(os.path.dirname(results_file), exist_ok=True)
results_df.to_csv(results_file, index=False)

In [None]:
# AUC / bit-budget comparison for the PCA-only baseline. The reconstructed and
# filtered tables are read back from ./results/PCA_ONLY/<n>_components, so they
# must already exist on disk. Helper functions are defined elsewhere in the notebook.
components = [i for i in range(1, 20)]

results = []
orig_auc = load_comparison()

for n_components in components:
    directory = os.path.join('./results/PCA_ONLY', f'{n_components}_components')
    os.makedirs(directory, exist_ok=True)

    # Read and preprocess data
    df = pd.read_csv('datasets/presence.csv')
    data = data_preprocessing(df, method)

    # PCA
    scaled_data = scaler.fit_transform(data)
    reduced_df, pca = analyze_PCA(scaled_data, n_components, directory)

    entropyPCA = compute_entropy(reduced_df, verbose=False)
    print(f"Entropy of the PCA data: {entropyPCA:.2f} bits")
    # Values are rounded to 5 decimals before counting bits.
    PCA_bits, PCA_win_bits, total_PCA_bits = bits_needed(reduced_df.round(5), df, verbose=False)

    # Reconstruction (loaded from the CSV saved by a previous run)
    reconstructed_df = pd.read_csv(os.path.join(directory, 'presence_reconstructed.csv'))

    # The first three columns are skipped for both entropy and bit count.
    entropyREC = compute_entropy(reconstructed_df.iloc[:, 3:], verbose=False)
    print(f"Entropy of the reconstructed data: {entropyREC:.2f} bits")
    REC_bits, REC_win_bits, total_REC_bits = bits_needed(reconstructed_df.iloc[:, 3:], df, verbose=False)

    print(f"----------Results with {n_components} components ----------")
    reconstructed_filtered = pd.read_csv(os.path.join(directory, 'filteredPresence_reconstructed.csv'))
    reconstructed_featured, auc_value = apply_classification(reconstructed_filtered)

    results.append({
        'n_components': n_components,
        # No quantization happens in this cell, so there is no level to report.
        # The previous code read a `level` variable that this cell never
        # defines (NameError, or a stale value left over from another cell).
        # The column is kept, empty, so the CSV layout does not change.
        'num_levels': None,
        'AUC': auc_value,
        'Original AUC': orig_auc,
        'PCA_bits': PCA_bits,
        'REC_bits': REC_bits,
        'PCA_win_bits': PCA_win_bits,
        'REC_win_bits': REC_win_bits,
        'total_PCA_bits': total_PCA_bits,
        'total_REC_bits': total_REC_bits,
        'entropyPCA': entropyPCA,
        'entropyREC': entropyREC
    })

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file
results_file = os.path.join('./results_csv/PCA_ONLY', 'auc_bit_comparison.csv')
os.makedirs(os.path.dirname(results_file), exist_ok=True)
results_df.to_csv(results_file, index=False)

In [None]:
# Plot AUC against the 'PCA_win_bits' column for the PCA-only baseline.
auc_bit = pd.read_csv(os.path.join('./results_csv/PCA_ONLY', 'auc_bit_comparison.csv'))
components = [i for i in range(1, 20)]

plt.figure(figsize=(20, 10))
# The curve needs a label: plt.legend() below has nothing to show otherwise.
plt.plot(auc_bit['PCA_win_bits'], auc_bit['AUC'], marker='o', linestyle='--', label='PCA only')
plt.ylim(0.85, 1.01)
plt.title('AUC with PCA applied')
plt.xlabel('Avg bits per window')
plt.xticks(np.arange(0, 150, 10))
plt.ylabel('AUC')
plt.legend()
plt.grid()
#plt.savefig(os.path.join('./results_graphs', 'auc_bit_comparison_PCA-Only[BxW].png'))
plt.show()


### VAE Training

In [None]:
# Load the presence capture and run the notebook's preprocessing for the
# selected `method` (data_preprocessing is defined elsewhere in the notebook).
data = pd.read_csv("datasets/presence.csv")
data = data_preprocessing(data, method)

In [None]:
# Width of one labelling interval.
time_window = 1 #seconds

# Bucket every sample by flooring its timestamp to a multiple of time_window.
data["TimeWindow"] = np.floor(data["Timestamp"] / time_window) * time_window

# One row per distinct window; the ground-truth label comes from getGT, which
# (like lower_bounds / upper_bounds) is defined elsewhere in the notebook.
labels = pd.DataFrame({"TimeWindow": data["TimeWindow"].unique()})
labels["Label"] = labels["TimeWindow"].apply(lambda tw: getGT(tw, lower_bounds, upper_bounds))

# Working copy without the raw timestamp; `data` itself keeps the column.
df = data.drop(columns=["Timestamp"])

In [None]:
"""timewindow_counts = df['TimeWindow'].value_counts(sort=False)

print("average count:",timewindow_counts.mean())
print("max count:",timewindow_counts.max())
print("min count:",timewindow_counts.min())

counter = [x for x in timewindow_counts if x < 20]
print("\nnumber of windows with less than 20 samples:",len(counter))
print(sorted(counter))

counter = [x for x in timewindow_counts if x >= 20 and x < 46]
print("\nnumber of windows between 20 and 46:",len(counter))
print(sorted(counter))

counter = [x for x in timewindow_counts if x >= 46 and x < 100]
print("\nnumber of windows between 46 and 100:",len(counter))
print(sorted(counter))

counter = [x for x in timewindow_counts if x >= 100]
print("\nnumber of windows with more than 100 samples:",len(counter))
print(sorted(counter))"""

In [None]:
# Minimum number of samples a time window must contain to be kept.
# NOTE: a later cell rebinds window_size to 6 (packets per sliding window).
window_size = 20 # for 1 second
#window_size = 75 # for 3 second

# Samples per time window, in order of appearance.
timewindow_counts = df['TimeWindow'].value_counts(sort=False)

# Keep only the groups that reach the minimum size.
filtered_df = df.groupby('TimeWindow').filter(lambda group: len(group) >= window_size)

_windows_before = len(df['TimeWindow'].unique())
_windows_after = len(filtered_df['TimeWindow'].unique())
print("Number of time windows to drop:", _windows_before - _windows_after)

In [None]:
# Build sliding windows for the VAE. Each time window ("second") of
# filtered_df is first split into a train part and a test part, and each part
# is then cut into overlapping windows of `window_size` consecutive packets.
#inside each "second" first we divide in train/test and then we divide in windows
train_windows = []
train_labels = []
test_windows = []
test_labels = []
window_size = 6 # packets per window

# NOTE: the loop variable below rebinds the module-level name `time_window`.
for time_window in filtered_df["TimeWindow"].unique():
    data_window = filtered_df[filtered_df["TimeWindow"] == time_window].drop("TimeWindow", axis=1)
    data_window = np.array(data_window)
    # shuffle=False keeps the packets in order: the first 60% go to train, the last 40% to test.
    train, test = train_test_split(data_window, test_size=0.4, random_state=42, shuffle=False, stratify=None)
    # Every packet of this time window shares one ground-truth label.
    label = labels[labels["TimeWindow"] == time_window]["Label"].values[0]

    # Slide a window of window_size packets over the train part, one packet at a time.
    # NOTE(review): each appended window adds (window_size - 1) labels, so
    # train_labels holds window_size - 1 entries per window rather than one --
    # confirm this is what CsiData expects.
    for start_idx in range(len(train) - window_size + 1): #minimum amount of samples per window = 12
        window = train[start_idx : start_idx + window_size]
        train_windows.append(window)
        train_labels.extend([label] * (window_size - 1))

    # Same procedure for the test part.
    for start_idx in range(len(test) - window_size + 1): #minimum amount of samples per window = 6
        window = test[start_idx : start_idx + window_size]
        test_windows.append(window)
        test_labels.extend([label] * (window_size - 1))

train_windows = np.array(train_windows)
train_labels = np.array(train_labels)
test_windows = np.array(test_windows)
test_labels = np.array(test_labels)

# The "Number of ..." lines print array shapes, not scalar counts.
print("Original number of windows:", len(filtered_df["TimeWindow"].unique()))
print("Number of train windows:", train_windows.shape)   
print("Number of train labels:", train_labels.shape)
print("Number of test windows:", test_windows.shape)
print("Number of test labels:", test_labels.shape)

In [None]:
# Convert the window arrays to float32 tensors and flatten them into rows of 56 values.
train_data_tf = tf.reshape(tf.convert_to_tensor(train_windows, dtype=tf.float32), (-1, 56))
test_data_tf = tf.reshape(tf.convert_to_tensor(test_windows, dtype=tf.float32), (-1, 56))

# Normalize each split by its own maximum, taken over both axes.
train_data_tf = tf.math.divide(train_data_tf, tf.math.reduce_max(train_data_tf, axis=(0, 1)))
test_data_tf = tf.math.divide(test_data_tf, tf.math.reduce_max(test_data_tf, axis=(0, 1)))

# Labels become flat int32 vectors.
train_labels_tf = tf.reshape(tf.convert_to_tensor(train_labels, dtype=tf.int32), -1)
test_labels_tf = tf.reshape(tf.convert_to_tensor(test_labels, dtype=tf.int32), -1)

# Running index 0..len(labels)-1 for each split.
train_indices_tf = tf.range(0, train_labels_tf.shape[0], dtype=tf.int32)
test_indices_tf = tf.range(0, test_labels_tf.shape[0], dtype=tf.int32)

print(f"Final train_data size: {train_data_tf.shape}")
print(f"Final test_data size: {test_data_tf.shape}")
print(f"Final train_labels size: {train_labels_tf.shape}")
print(f"Final test_labels size: {test_labels_tf.shape}")
print(f"Final train_indices size: {train_indices_tf.shape}")
print(f"Final test_indices size: {test_indices_tf.shape}")

In [None]:
# Wrap the train/test tensors, labels and indices in CsiData objects
# (CsiData is defined elsewhere in the notebook). Both use batch_size=128 and
# window_size=6.
train_data = CsiData(train_data_tf, train_labels_tf, train_indices_tf, batch_size=128, window_size=6)
test_data = CsiData(test_data_tf, test_labels_tf, test_indices_tf, batch_size=128, window_size=6)

In [None]:
# Sanity check: print the shapes held by the training CsiData object and its first item.
print(train_data.csi.shape)
print(train_data.labels.shape)
print(train_data.indices.shape)
print(train_data[0])

In [None]:
# Train the VAE for 5 epochs and store its weights under `folder_name`.
# VAE, checkpoint_path, checkpoint_cb, csv_logger_cb and early_stopping_cb are
# defined elsewhere in the notebook.
vae = VAE()
vae.compile(optimizer=tf_keras.optimizers.Adam())
# Save the untrained weights as the epoch-0 checkpoint before fitting.
vae.save_weights(checkpoint_path.format(epoch=0))
vae.fit(train_data, epochs=5, shuffle=True, callbacks=[checkpoint_cb, csv_logger_cb, early_stopping_cb])
# Final weights; a later cell loads them from this same path.
vae.save_weights(f'./{folder_name}/train_weights_vae')

## VAE Output Quantized

In [None]:
#Read data
#preprocess data
#split train/test

In [None]:
# Rebuild the VAE, load the trained weights and encode both splits.
# z_data / z_labels are created empty here and not used again in this cell.
z_data = np.zeros([0, 2])
z_labels = np.zeros([0])

vae = VAE(enc_input_shape=(6, 56, 1))
vae.compile(optimizer=tf_keras.optimizers.Adam())
# expect_partial() is called on the load result (presumably to silence
# warnings about checkpoint values that are not restored -- TODO confirm).
vae.load_weights(f'./{folder_name}/train_weights_vae').expect_partial()

# apply_vae_encoder is defined elsewhere in the notebook.
z_data_train, z_labels_train = apply_vae_encoder(vae, train_data)
z_data_test, z_labels_test = apply_vae_encoder(vae, test_data)

In [42]:
# Set up the output locations for the quantized-VAE experiment.
# NOTE: base_directory is a module-level name that results_test and
# run_edl_experiment read, so rebinding it here changes where they look for
# and save models.
base_directory = './results/DEFAULT/VAE_QNTZD'
directory = f'./dumps/DEFAULT/VAE_QNTZD/0_components'
os.makedirs(directory, exist_ok=True)

bit_results = []
# Quantization levels 2, 4, ..., 256.
levels = [2**i for i in range(1, 9)]

# Put the encoder outputs into two-column frames named z_mean / z_log_var.
df_z_data_train = pd.DataFrame(z_data_train, columns=['z_mean', 'z_log_var'])
df_z_data_test = pd.DataFrame(z_data_test, columns=['z_mean', 'z_log_var'])

In [None]:
# Quantize the VAE latent outputs at every level, dump the quantized
# train/test arrays to pickle files and record the bit budget per level.
# apply_quantization, apply_existing_quantization, bits_needed and `antenna`
# are defined elsewhere in the notebook.
for lvl in levels:
    print(f"-------------- {lvl} lvls --------------")
    # Centroids come from the train split and are reused for the test split.
    df_train_quantized, centroids, train_indices = apply_quantization(df_z_data_train, lvl) #LLoyd-Max quantization
    df_test_quantized, test_indices = apply_existing_quantization(df_z_data_test, centroids)

    print(f"DF_QUANTIZED")
    df_test_indices = pd.DataFrame(test_indices, columns=[f'z_mean_{i}' for i in range(2)] + [f'z_log_var_{i}' for i in range(2)])
    QT_feature, QT_frame, QT_window_feature, QT_window, QT_total = bits_needed(df_test_indices, lvl, verbose=True)

    # NOTE: this rebinds the module-level z_data_train / z_data_test to the
    # quantized arrays of the current level.
    z_data_train = df_train_quantized.to_numpy()
    z_data_test = df_test_quantized.to_numpy()

    sub_dir = os.path.join(directory, f'training/{lvl}lvls_single_antenna_{antenna}.pkl')
    os.makedirs(os.path.dirname(sub_dir), exist_ok=True)
    with open(sub_dir, 'wb') as f:
        pickle.dump([z_data_train, z_labels_train], f)

    sub_dir = os.path.join(directory, f'test/{lvl}lvls_single_antenna_{antenna}_test.pkl')
    os.makedirs(os.path.dirname(sub_dir), exist_ok=True)
    with open(sub_dir, 'wb') as f:
        pickle.dump([z_data_test, z_labels_test], f)

    # QT_window_feature is computed above but not stored.
    bit_results.append({
        'num_levels': lvl,
        'QT_feature': QT_feature,
        'QT_frame': QT_frame,
        'QT_window': QT_window,
        'QT_total': QT_total
    })

bit_results = pd.DataFrame(bit_results)
# to_csv does not create missing folders; nothing else in this cell creates
# ./results_csv/DEFAULT, so do it here.
os.makedirs('./results_csv/DEFAULT', exist_ok=True)
bit_results.to_csv('./results_csv/DEFAULT/VAE_bits.csv', index=False)

In [None]:
# Evaluate the default EDL model on the quantized VAE test dumps, one run per
# quantization level, and save the accuracies. `antenna` and results_test are
# defined elsewhere in the notebook; results_test reads base_directory.
directory = f'./dumps/DEFAULT/VAE_QNTZD/0_components'
base_directory = './results/DEFAULT/VAE_QNTZD'
levels = [2**i for i in range(1, 9)]
results = []
# NOTE: the loop variable rebinds the module-level name `num_levels`.
for num_levels in levels:
    print(f"-------------- {num_levels} lvls --------------")
    filename = f'{num_levels}lvls_single_antenna_{antenna}'
    # The training dump is the same fixed file for every level.
    train_dump_dir = './dumps/DEFAULT/single_antenna_0.pkl'
    test_dump_dir = os.path.join(directory, f'test/{filename}_test.pkl')

    # Test model
    accuracy = results_test(train_dump_dir, test_dump_dir, num_levels=num_levels, default=True)
    results.append(
        {
            "num_levels": num_levels,
            "accuracy": accuracy
        })

results_df = pd.DataFrame(results)
# Create the directory that is actually written to: the previous code only
# created 'results_csv', not its 'DEFAULT' subdirectory.
os.makedirs('results_csv/DEFAULT', exist_ok=True)
results_df.to_csv('results_csv/DEFAULT/VAE_accuracy.csv', index=False)