In [3]:
import os
import pandas as pd
import numpy as np
import pywt
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import re  # For parsing the CSI data string

# --- Configuration ---
# Input root: one sub-folder of raw CSI CSV captures per environment (replace with your own path).
base_folder = r"C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\CSI_raw"
# Output root: the main script creates one sub-folder per environment under it.
output_base_folder = r"C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\data_preprocessing"
# Sub-directory names, one per recording environment.
environments = ["LY8_Pantry", "LY8-Room", "Makerspace"]

# Passed to process_folder_csi, which currently does not use it.
window_size = 200
# Values kept per CSI row (rows are zero-padded or truncated to this length);
# determined from inspecting the CSI data column length.
feature_count = 150

# Defaults used by extract_wavelet_features.
wavelet_name, wavelet_level = 'db4', 3

# Columns of the wavelet feature matrix that are kept for the final data set.
# Example indices - adjust based on CSI wavelet feature analysis.
key_feature_indices = [0, 1, 6, 7, 8, 9, 10]
additional_feature_indices = [4, 5]
selected_indices = [*key_feature_indices, *additional_feature_indices]

# --- Helper Functions ---
def parse_csi_string(csi_str):
    """Parse the CSI data string from the CSV into a 1-D numpy array of floats.

    Every numeric token in ``csi_str`` is extracted in order of appearance;
    all other characters (brackets, commas, whitespace) are ignored.  Signed
    integers, decimals and scientific notation are recognised.

    Args:
        csi_str: Text containing the numbers, e.g. ``"[-12, 5, 3.5]"``.

    Returns:
        A float numpy array (empty when the text holds no numbers), or ``None``
        when ``csi_str`` cannot be scanned at all (for example a NaN coming
        from an empty CSV cell); in that case the problem is printed.
    """
    try:
        # The optional sign applies to integers as well as decimals; without
        # that, a value such as "-12" would be read as 12.  The optional
        # exponent keeps "1e-3" together as a single value.
        tokens = re.findall(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?', csi_str)
        return np.array([float(token) for token in tokens])
    except (TypeError, ValueError) as e:
        # TypeError: csi_str is not text (None, NaN, ...).
        print(f"Error parsing CSI string: {csi_str}")
        print(f"Error details: {e}")
        return None

def load_and_parse_csi_data(file_path):
    """Load a CSI CSV file and parse every entry of its 'data' column.

    Each cell of the 'data' column is handed to ``parse_csi_string``; cells for
    which it returns ``None`` are skipped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        * a 2-D float array (rows x values) when all parsed rows have the same
          length,
        * a 1-D object array whose elements are 1-D float arrays when the row
          lengths differ, so callers can still iterate and pad/truncate rows,
        * an empty array when no row could be parsed,
        * ``None`` when the file cannot be read or has no 'data' column (the
          problem is printed).
    """
    try:
        df = pd.read_csv(file_path)
        parsed_rows = [
            values
            for values in (parse_csi_string(cell) for cell in df['data'])
            if values is not None
        ]

        if len({row.shape[0] for row in parsed_rows}) <= 1:
            return np.array(parsed_rows)

        # Rows of differing length: np.array() on such a list raises on recent
        # NumPy versions, which would discard the whole file.  Build the
        # container of per-row arrays explicitly instead.
        ragged = np.empty(len(parsed_rows), dtype=object)
        for position, row in enumerate(parsed_rows):
            ragged[position] = row
        return ragged
    except Exception as e:
        # Deliberately broad: this is a best-effort loader and the caller
        # treats None as "nothing usable in this file".
        print(f"Error loading or parsing CSV file: {file_path}")
        print(f"Error details: {e}")
        return None

def check_columns_csi(folder_path):
    """Report how many values the first parsed CSI row of a sample file holds.

    The sample is the first entry returned by ``os.listdir(folder_path)`` whose
    name ends with '.csv'.

    Args:
        folder_path: Folder to look in.

    Returns:
        Length of the first parsed row of the sample file, or 0 when the file
        yields no valid data.

    Raises:
        ValueError: If the folder contains no '.csv' file.
    """
    sample_file_path = None
    for entry in os.listdir(folder_path):
        if entry.endswith('.csv'):
            sample_file_path = os.path.join(folder_path, entry)
            break

    if not sample_file_path:
        raise ValueError(f"No CSV files in folder {folder_path}")

    csi_data = load_and_parse_csi_data(sample_file_path)
    if csi_data is None or csi_data.size == 0:
        return 0  # Indicate no valid data found
    return csi_data[0].shape[0]

def process_folder_csi(file_path, label, window_size, feature_count, output_folder):
    """Bring every CSI row of one file to a fixed width and save the result.

    Rows shorter than ``feature_count`` are right-padded with zeros and longer
    rows are cut to ``feature_count`` values.  The resulting matrix is written
    without header or index to ``<output_folder>/<label>_processed.csv``.
    Nothing is written when the file yields no rows; a warning is printed.

    Args:
        file_path: CSV file to read.
        label: Prefix of the output file name.
        window_size: Not used; the number of rows is left unchanged.
        feature_count: Number of values each output row must have.
        output_folder: Folder that receives the output file.
    """
    csi_rows = load_and_parse_csi_data(file_path)

    fixed_rows = []
    if csi_rows is not None and csi_rows.size > 0:
        for sample in csi_rows:  # one CSV row == one sample
            missing = feature_count - sample.shape[0]
            if missing > 0:
                sample = np.concatenate([sample, np.zeros(missing)])
            elif missing < 0:
                sample = sample[:feature_count]
            fixed_rows.append(sample)

    if not fixed_rows:
        print(f"Warning: No valid CSI data found in file: {file_path}")
        return

    all_data = np.array(fixed_rows)

    output_file = os.path.join(output_folder, f"{label}_processed.csv")
    frame = pd.DataFrame(all_data)
    frame.to_csv(output_file, index=False, header=False)
    print(f"{label} data saved to: {output_file}, shape: {all_data.shape}")


def normalize_data(input_file, output_file):
    """Rescale a header-less CSV with MinMaxScaler and write the result.

    A fresh ``MinMaxScaler`` with default settings is fitted on the contents
    of ``input_file`` only; the scaled values are written to ``output_file``
    without header or index.

    Args:
        input_file: Path of the header-less CSV to read.
        output_file: Path of the CSV to write.
    """
    raw_values = pd.read_csv(input_file, header=None).values
    normalized_data = MinMaxScaler().fit_transform(raw_values)
    scaled_frame = pd.DataFrame(normalized_data)
    scaled_frame.to_csv(output_file, index=False, header=False)
    print(f"Normalized data saved to: {output_file}, shape: {normalized_data.shape}")

def extract_wavelet_features(data, wavelet=wavelet_name, level=wavelet_level):
    """Build one statistical feature vector per row of ``data``.

    For each row, the vector holds the mean and standard deviation of every
    coefficient array returned by ``pywt.wavedec`` (in the order returned),
    followed by the mean, standard deviation, maximum and minimum of the row
    itself.

    Args:
        data: Iterable of 1-D signals (e.g. the rows of a 2-D array).
        wavelet: Wavelet name passed to ``pywt.wavedec``.
        level: Decomposition level passed to ``pywt.wavedec``.

    Returns:
        A numpy array with one feature vector per input row.
    """
    def _row_features(signal):
        # Per-band statistics first, whole-signal statistics last.
        stats = []
        for band in pywt.wavedec(signal, wavelet, level=level):
            stats.extend((np.mean(band), np.std(band)))
        stats.extend((np.mean(signal), np.std(signal), np.max(signal), np.min(signal)))
        return stats

    return np.array([_row_features(row) for row in data])

def process_wavelet(input_file, output_file):
    """Compute wavelet features for a header-less CSV and store them with np.save.

    Args:
        input_file: Path of the header-less CSV whose rows are the signals.
        output_file: Destination handed to ``np.save``.
    """
    signals = pd.read_csv(input_file, header=None).values
    wavelet_features = extract_wavelet_features(signals)
    np.save(output_file, wavelet_features)
    print(f"Features saved to: {output_file}, shape: {wavelet_features.shape}")

def plot_and_save_individual_features(stand_features, walk_features, title_prefix, output_folder):
    """Save one line plot per feature column, overlaying stand and walk values.

    ``output_folder`` is created if needed.  The number of plots is the column
    count of ``stand_features`` or, when that is ``None``, of
    ``walk_features``; with both ``None`` no plot is produced.  Plots are
    written as ``feature_<k>.png`` with ``k`` starting at 1.

    Args:
        stand_features: 2-D array of stand features, or ``None``.
        walk_features: 2-D array of walk features, or ``None``.
        title_prefix: Text placed in front of each plot title.
        output_folder: Folder that receives the PNG files.
    """
    os.makedirs(output_folder, exist_ok=True)

    if stand_features is not None:
        num_features = stand_features.shape[1]
    elif walk_features is not None:
        num_features = walk_features.shape[1]
    else:
        num_features = 0

    for feature_no in range(1, num_features + 1):
        column = feature_no - 1
        plt.figure(figsize=(10, 6))
        if stand_features is not None:
            plt.plot(stand_features[:, column], label=f'Stand Feature {feature_no}',
                     linestyle='--', alpha=0.7, color='blue')
        if walk_features is not None:
            plt.plot(walk_features[:, column], label=f'Walk Feature {feature_no}',
                     linestyle='-', alpha=0.7, color='orange')

        plt.title(f"{title_prefix} - Feature {feature_no}")
        plt.xlabel("Sample Index")
        plt.ylabel("Feature Value")
        plt.legend()
        target = os.path.join(output_folder, f"feature_{feature_no}.png")
        plt.savefig(target)
        plt.close()
        print(f"Feature {feature_no} plot saved to {output_folder}")

# --- Main Script ---
def _extract_selected_features(raw_file, label, output_folder):
    """Run one recording through the processed -> normalized -> wavelet steps.

    Args:
        raw_file: Path of the raw CSI CSV file.
        label: Activity name used as file-name prefix ("standing"/"walking").
        output_folder: Folder that receives all intermediate files.

    Returns:
        The ``selected_indices`` columns of the wavelet feature matrix, or
        ``None`` when ``process_folder_csi`` produced no processed file.
    """
    processed_path = os.path.join(output_folder, f"{label}_processed.csv")
    normalized_path = os.path.join(output_folder, f"{label}_normalized.csv")
    features_path = os.path.join(output_folder, f"{label}_features.npy")

    # process_folder_csi returns without writing anything when the raw file
    # holds no valid rows.  Remove output left by an earlier run first, so
    # that case is detected instead of silently re-using stale data.
    if os.path.exists(processed_path):
        os.remove(processed_path)
    process_folder_csi(raw_file, label, window_size, feature_count, output_folder)
    if not os.path.exists(processed_path):
        return None

    normalize_data(processed_path, normalized_path)
    process_wavelet(normalized_path, features_path)
    return np.load(features_path)[:, selected_indices]  # Feature selection


# --- Main Script ---
if __name__ == "__main__":
    all_stand_features = []
    all_walk_features = []

    for env in environments:
        env_folder = os.path.join(base_folder, env)
        if not os.path.isdir(env_folder):
            # One missing environment must not abort the remaining ones.
            print(f"Warning: environment folder not found, skipping: {env_folder}")
            continue

        output_folder = os.path.join(output_base_folder, env)
        os.makedirs(output_folder, exist_ok=True)

        stand_file = None
        walk_file = None

        # Sorted so that, if several files match, the choice does not depend
        # on directory listing order (the last match in sorted order wins).
        for filename in sorted(os.listdir(env_folder)):
            if not filename.endswith(".csv"):
                continue
            lowered = filename.lower()
            if "standing" in lowered:
                stand_file = os.path.join(env_folder, filename)
            elif "walking" in lowered:
                walk_file = os.path.join(env_folder, filename)

        current_env_stand_features = None
        current_env_walk_features = None

        if stand_file:
            print(f"Found 'standing' file: {stand_file}")
            current_env_stand_features = _extract_selected_features(stand_file, "standing", output_folder)
        else:
            print(f"Warning: 'standing' file not found in {env_folder}")

        if walk_file:
            print(f"Found 'walking' file: {walk_file}")
            current_env_walk_features = _extract_selected_features(walk_file, "walking", output_folder)
        else:
            print(f"Warning: 'walking' file not found in {env_folder}")

        # Combine stand and walk data for the current environment
        # (stand label is 0, walk label is 1).
        env_parts = []
        env_label_parts = []
        if current_env_stand_features is not None:
            env_parts.append(current_env_stand_features)
            env_label_parts.append(np.zeros((current_env_stand_features.shape[0], 1)))
        if current_env_walk_features is not None:
            env_parts.append(current_env_walk_features)
            env_label_parts.append(np.ones((current_env_walk_features.shape[0], 1)))

        env_final_data = np.vstack(env_parts) if env_parts else None
        env_final_labels = np.vstack(env_label_parts) if env_label_parts else None

        if env_final_data is not None:
            env_output_path = os.path.join(output_folder, f"{env}_features_selected.npz")
            np.savez(env_output_path, data=env_final_data, labels=env_final_labels)
            print(f"Optimized combined data for {env} saved to: {env_output_path}, shape: {env_final_data.shape}")

        # Plotting features per environment
        feature_plots_output_folder = os.path.join(output_folder, "feature_plots")
        plot_and_save_individual_features(
            current_env_stand_features,
            current_env_walk_features,
            f"Env {env} - Individual Feature Comparison",
            feature_plots_output_folder,
        )

        if current_env_stand_features is not None:
            all_stand_features.append(current_env_stand_features)
        if current_env_walk_features is not None:
            all_walk_features.append(current_env_walk_features)

    # --- Final Data Saving (Combined Data) ---
    # NOTE(review): these two names are not read anywhere in this block; they
    # are kept in case later notebook cells rely on them (last environment only).
    stand_features = all_stand_features[-1] if all_stand_features else None
    walk_features = all_walk_features[-1] if all_walk_features else None

    if all_stand_features and all_walk_features:
        combined_stand_features = np.vstack(all_stand_features)  # all environments
        combined_walk_features = np.vstack(all_walk_features)    # all environments

        stand_labels = np.zeros((combined_stand_features.shape[0], 1))  # Stand label is 0
        walk_labels = np.ones((combined_walk_features.shape[0], 1))     # Walk label is 1

        final_data = np.vstack([combined_stand_features, combined_walk_features])
        final_labels = np.vstack([stand_labels, walk_labels])

        final_output_path = os.path.join(output_base_folder, "optimized_features_combined.npz")
        np.savez(final_output_path, data=final_data, labels=final_labels)
        print(f"Optimized combined data saved to: {final_output_path}, shape: {final_data.shape}")
    else:
        print("Warning: No features were processed to perform combined feature selection and final saving.")

    print("Data processing complete.")

Found 'standing' file: C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\CSI_raw\LY8_Pantry\standing_pantry_csidata.csv
standing data saved to: C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\data_preprocessing\LY8_Pantry\standing_processed.csv, shape: (1340, 150)
Normalized data saved to: C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\data_preprocessing\LY8_Pantry\standing_normalized.csv, shape: (1340, 150)
Features saved to: C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\data_preprocessing\LY8_Pantry\standing_features.npy, shape: (1340, 12)
Found 'walking' file: C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\CSI_raw\LY8_Pantry\walking_pantry_csidata.csv
walking data saved to: C:\Users\Ivan Nathanael\Uni\Side Project\CSI_Location_Tracking\csi-project\data_preprocessing\LY8_Pantry\walking_processed.csv, shape: (1709, 150)
Normalized data saved to: C:\U