In [16]:
from fastai.tabular.all import *
import glob
import os
import ast

In [30]:


def get_csv_files(folder_path : Path):
    """
    Recursively searches a folder and its subfolders to find all file paths
    with the .csv extension using the glob module.

    Args:
        folder_path (str): The path to the folder to search.

    Returns:
        list: A list of strings, where each string is the full path to a CSV file
              found within the folder and its subfolders. Returns an empty list
              if no CSV files are found or if the folder path is invalid.
    """
    csv_file_paths = []
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' is not a valid directory.")
        return csv_file_paths  # Return empty list for invalid path

    # Construct the glob pattern for recursive CSV files
    pattern = os.path.join(folder_path, "**", "*.csv") # "**" for recursive, "*.csv" for csv files

    # Use glob.glob with recursive=True to find all matching files
    for file_path in glob.glob(pattern, recursive=True):
        file_path = Path(file_path)
        csv_file_paths.append(file_path)

    return csv_file_paths


def get_loc(filepath : Path):
    return re.search(r"(.*)_(.*)_csidata", file.name).group(2)

def get_label(filepath : Path):
    return re.search(r"(.*)_(.*)_csidata", file.name).group(1)

def get_csi_data(filepath : Path):
    df = pd.read_csv(filepath).data
    df = df.apply(lambda x : np.array(ast.literal_eval(x)))
    arr = np.stack(df.values, axis = 0)
    return arr


def find_closest_datetime_index(sorted_datetime_series, target_datetime):
    """
    Finds the index of the datetime in a sorted pandas Series that is closest to a target datetime.

    Args:
        sorted_datetime_series (pd.Series): A sorted pandas Series of datetime objects.
        target_datetime (datetime.datetime): The target datetime to find the closest match for.

    Returns:
        int: The index of the closest datetime in the Series.  Returns -1 if the list is empty or
             if the input is not a pandas Series.

    Raises:
        TypeError: If `sorted_datetime_series` is not a pandas Series or does not contain datetimes.
        TypeError: If `target_datetime` is not a datetime.datetime object.
    """
    if not isinstance(sorted_datetime_series, pd.Series):
        raise TypeError("sorted_datetime_series must be a pandas Series.")

    if sorted_datetime_series.empty:
        return -1

    if not pd.api.types.is_datetime64_any_dtype(sorted_datetime_series):
        raise TypeError("sorted_datetime_series must contain datetime objects.")

    if not isinstance(target_datetime, pd.Timestamp):
        try:
            target_datetime = pd.to_datetime(target_datetime)  # Convert if possible
        except ValueError:
            raise TypeError("target_datetime must be a datetime.datetime object or convertible to one.")

    # Convert the series to absolute difference in nanoseconds.
    abs_diff = abs(sorted_datetime_series - target_datetime)

    # Find the index of the minimum absolute difference.
    closest_index = abs_diff.idxmin()

    return closest_index
    

In [7]:
filepath_csi_raw = Path("CSI_raw")
csv_files = get_csv_files(filepath_csi_raw)

csi_df = pd.DataFrame(columns = ["timestamp", "location", "activity"] + [i for i in range(128)])

for file in csv_files:
    csi_arr = get_csi_data(file)
    timestamp = pd.read_csv(file).datetime
    location = get_loc(file)
    activity = get_label(file)
    
    file_df = pd.DataFrame(csi_arr)
    file_df["timestamp"] = timestamp
    file_df["location"] = location
    file_df["activity"] = activity
    
    csi_df = pd.concat((csi_df, file_df), ignore_index  = True)
    
    
    
csi_df



Unnamed: 0,timestamp,location,activity,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127
0,2025-03-01 12:15:41.120686,ly8room,standing,82,-96,4,0,18,-13,13,...,34,-22,35,-21,36,-18,35,-17,34,-15
1,2025-03-01 12:15:41.136687,ly8room,standing,83,-80,4,0,20,-1,17,...,36,-4,37,-3,36,-1,35,0,33,1
2,2025-03-01 12:15:41.153694,ly8room,standing,83,-80,4,0,17,13,16,...,30,21,30,23,29,24,27,24,25,24
3,2025-03-01 12:15:41.177624,ly8room,standing,83,-80,4,0,-13,-21,-14,...,-24,-31,-23,-33,-22,-33,-20,-33,-18,-33
4,2025-03-01 12:15:41.206498,ly8room,standing,83,-80,4,0,11,-4,8,...,18,-8,19,-7,19,-6,18,-5,18,-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10549,2025-03-03 17:53:46.575742,MAKERSPACE,walking,83,-80,4,0,21,22,18,...,34,21,37,24,39,26,38,29,38,30
10550,2025-03-03 17:53:46.592644,MAKERSPACE,walking,83,-80,4,0,-6,34,-3,...,2,41,2,46,0,49,-1,51,-3,53
10551,2025-03-03 17:53:46.628752,MAKERSPACE,walking,83,-80,4,0,-4,-25,-4,...,-7,-27,-8,-31,-8,-33,-7,-35,-7,-36
10552,2025-03-03 17:53:46.655198,MAKERSPACE,walking,83,-80,4,0,16,-31,11,...,14,-33,16,-38,18,-41,19,-43,20,-45


In [89]:

# Parsing HAR Data

filepath_csi_raw = Path("data_collection/makerspace_csi_har_dataset/train")
csv_files = get_csv_files(filepath_csi_raw)

csi_df = pd.DataFrame(columns = ["timestamp", "session_id", "motion"] + [i for i in range(128)])


session_id = 0
for file in csv_files:
    try:
        
        csi_arr = get_csi_data(file)
        timestamp = pd.read_csv(file).datetime
        file_df = pd.DataFrame(csi_arr)
        file_df["timestamp"] = timestamp
        file_df["session_id"] = session_id
        
        csi_df = pd.concat((csi_df, file_df), ignore_index  = True)
        session_id += 1
    except:
        print(f"Failed parsing {file}")
    
csi_df["timestamp"] = pd.to_datetime(csi_df["timestamp"])

csi_df["motion"] = "standing"

for file in csv_files:
    try:
        file_df = pd.read_csv(file)
        file_df["Start Time"] = pd.to_datetime(file_df["Start Time"])
        file_df["Stop Time"] = pd.to_datetime(file_df["Stop Time"])
        
        
        
        for i, row in file_df.iterrows():
            start_time = row["Start Time"]
            stop_time = row["Stop Time"]
            start_index = find_closest_datetime_index(csi_df.timestamp, start_time)
            stop_index = find_closest_datetime_index(csi_df.timestamp, stop_time)
            # print(start_index, stop_index, row["Motion"])
            csi_df.loc[start_index : stop_index + 1, "motion"] = row["Motion"]
            # print(csi_df.loc[start_index : stop_index + 1, "motion"])
            
            
    except:
        print(f"Failed read annotation file {file}")
        




Failed parsing data_collection\makerspace_csi_har_dataset\train\Abel-Training-motion_times.csv
Failed parsing data_collection\makerspace_csi_har_dataset\train\Ivan-Training-motion_times.csv
Failed parsing data_collection\makerspace_csi_har_dataset\train\Matt-Training-motion_times.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\0.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\1.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\2.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\3.csv


In [94]:
csi_df.to_csv("makerspace_data/train.csv")

In [93]:
csi_df.motion.value_counts().standing/ len(csi_df.motion)

np.float64(0.6936050369842837)

In [100]:
time_sum = pd.Timedelta(0)

for file in csv_files:
    try:
        file_df = pd.read_csv(file)
        file_df["Start Time"] = pd.to_datetime(file_df["Start Time"])
        file_df["Stop Time"] = pd.to_datetime(file_df["Stop Time"])
        time_sum += (file_df["Stop Time"] - file_df["Start Time"]).sum()
    except:
        print(f"failed annotation file : {file}")
        

failed annotation file : data_collection\makerspace_csi_har_dataset\train\0.csv
failed annotation file : data_collection\makerspace_csi_har_dataset\train\1.csv
failed annotation file : data_collection\makerspace_csi_har_dataset\train\2.csv
failed annotation file : data_collection\makerspace_csi_har_dataset\train\3.csv


In [101]:
time_sum

Timedelta('0 days 00:09:57.838727')