In [1]:
from fastai.tabular.all import *
import glob
import os
import ast

In [2]:


def get_csv_files(folder_path : Path):
    """
    Recursively searches a folder and its subfolders to find all file paths
    with the .csv extension using the glob module.

    Args:
        folder_path (str): The path to the folder to search.

    Returns:
        list: A list of strings, where each string is the full path to a CSV file
              found within the folder and its subfolders. Returns an empty list
              if no CSV files are found or if the folder path is invalid.
    """
    csv_file_paths = []
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' is not a valid directory.")
        return csv_file_paths  # Return empty list for invalid path

    # Construct the glob pattern for recursive CSV files
    pattern = os.path.join(folder_path, "**", "*.csv") # "**" for recursive, "*.csv" for csv files

    # Use glob.glob with recursive=True to find all matching files
    for file_path in glob.glob(pattern, recursive=True):
        file_path = Path(file_path)
        csv_file_paths.append(file_path)

    return csv_file_paths


def get_loc(filepath : Path):
    return re.search(r"(.*)_(.*)_csidata", file.name).group(2)

def get_label(filepath : Path):
    return re.search(r"(.*)_(.*)_csidata", file.name).group(1)

def get_csi_data(filepath : Path):
    df = pd.read_csv(filepath).data
    df = df.apply(lambda x : np.array(ast.literal_eval(x)))
    arr = np.stack(df.values, axis = 0)
    return arr


def find_closest_datetime_index(sorted_datetime_series, target_datetime):
    """
    Finds the index of the datetime in a sorted pandas Series that is closest to a target datetime.

    Args:
        sorted_datetime_series (pd.Series): A sorted pandas Series of datetime objects.
        target_datetime (datetime.datetime): The target datetime to find the closest match for.

    Returns:
        int: The index of the closest datetime in the Series.  Returns -1 if the list is empty or
             if the input is not a pandas Series.

    Raises:
        TypeError: If `sorted_datetime_series` is not a pandas Series or does not contain datetimes.
        TypeError: If `target_datetime` is not a datetime.datetime object.
    """
    if not isinstance(sorted_datetime_series, pd.Series):
        raise TypeError("sorted_datetime_series must be a pandas Series.")

    if sorted_datetime_series.empty:
        return -1

    if not pd.api.types.is_datetime64_any_dtype(sorted_datetime_series):
        raise TypeError("sorted_datetime_series must contain datetime objects.")

    if not isinstance(target_datetime, pd.Timestamp):
        try:
            target_datetime = pd.to_datetime(target_datetime)  # Convert if possible
        except ValueError:
            raise TypeError("target_datetime must be a datetime.datetime object or convertible to one.")

    # Convert the series to absolute difference in nanoseconds.
    abs_diff = abs(sorted_datetime_series - target_datetime)

    # Find the index of the minimum absolute difference.
    closest_index = abs_diff.idxmin()

    return closest_index
    

In [3]:
filepath_csi_raw = Path("CSI_raw")
csv_files = get_csv_files(filepath_csi_raw)

csi_df = pd.DataFrame(columns = ["timestamp", "location", "activity"] + [i for i in range(128)])

for file in csv_files:
    csi_arr = get_csi_data(file)
    timestamp = pd.read_csv(file).datetime
    location = get_loc(file)
    activity = get_label(file)
    
    file_df = pd.DataFrame(csi_arr)
    file_df["timestamp"] = timestamp
    file_df["location"] = location
    file_df["activity"] = activity
    
    csi_df = pd.concat((csi_df, file_df), ignore_index  = True)
    
    
    
csi_df



KeyboardInterrupt: 

In [4]:

# Parsing HAR Data

filepath_csi_raw = Path("data_collection/makerspace_csi_har_dataset/train")
csv_files = get_csv_files(filepath_csi_raw)

csi_df = pd.DataFrame(columns = ["timestamp", "session_id", "motion"] + [i for i in range(128)])


session_id = 1
for file in csv_files:
    try:
        
        csi_arr = get_csi_data(file)
        timestamp = pd.read_csv(file).datetime
        file_df = pd.DataFrame(csi_arr)
        file_df["timestamp"] = timestamp
        file_df["session_id"] = session_id
        
        csi_df = pd.concat((csi_df, file_df), ignore_index  = True)
        session_id += 1
    except:
        print(f"Failed parsing {file}")
    
csi_df["timestamp"] = pd.to_datetime(csi_df["timestamp"])

csi_df["motion"] = "standing"

for file in csv_files:
    try:
        file_df = pd.read_csv(file)
        file_df["Start Time"] = pd.to_datetime(file_df["Start Time"])
        file_df["Stop Time"] = pd.to_datetime(file_df["Stop Time"])
        
        
        
        for i, row in file_df.iterrows():
            start_time = row["Start Time"]
            stop_time = row["Stop Time"]
            start_index = find_closest_datetime_index(csi_df.timestamp, start_time)
            stop_index = find_closest_datetime_index(csi_df.timestamp, stop_time)
            # print(start_index, stop_index, row["Motion"])
            csi_df.loc[start_index : stop_index + 1, "motion"] = row["Motion"]
            # print(csi_df.loc[start_index : stop_index + 1, "motion"])
            
            
    except:
        print(f"Failed read annotation file {file}")
        




Failed parsing data_collection\makerspace_csi_har_dataset\train\Abel-Training-motion_times.csv
Failed parsing data_collection\makerspace_csi_har_dataset\train\Ivan-Training-motion_times.csv
Failed parsing data_collection\makerspace_csi_har_dataset\train\Matt-Training-motion_times.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\1.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\2.csv
Failed read annotation file data_collection\makerspace_csi_har_dataset\train\3.csv


In [6]:
csi_df.to_csv("makerspace_data/train.csv")

In [5]:
csi_df.motion.value_counts().standing/ len(csi_df.motion)

np.float64(0.6743057706204191)

In [100]:
time_sum = pd.Timedelta(0)

for file in csv_files:
    try:
        file_df = pd.read_csv(file)
        file_df["Start Time"] = pd.to_datetime(file_df["Start Time"])
        file_df["Stop Time"] = pd.to_datetime(file_df["Stop Time"])
        time_sum += (file_df["Stop Time"] - file_df["Start Time"]).sum()
    except:
        print(f"failed annotation file : {file}")
        

failed annotation file : data_collection\makerspace_csi_har_dataset\train\0.csv
failed annotation file : data_collection\makerspace_csi_har_dataset\train\1.csv
failed annotation file : data_collection\makerspace_csi_har_dataset\train\2.csv
failed annotation file : data_collection\makerspace_csi_har_dataset\train\3.csv


In [8]:
df = pd.read_csv("makerspace_data/train.csv", index_col = 0)
df.timestamp = pd.to_datetime(df.timestamp)
df

Unnamed: 0,timestamp,session_id,motion,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127
0,2025-03-20 11:05:46.960575,1,standing,83,-80,4,0,39,-6,39,...,1,2,9,0,16,-1,24,-3,30,-4
1,2025-03-20 11:05:46.993573,1,standing,83,-80,4,0,-11,-31,-11,...,1,-2,-1,-7,-3,-13,-6,-19,-7,-24
2,2025-03-20 11:05:47.027583,1,standing,83,-80,4,0,-25,-22,-25,...,0,-2,-5,-6,-10,-10,-15,-14,-18,-17
3,2025-03-20 11:05:47.058571,1,standing,83,-80,4,0,-23,24,-22,...,-2,-1,-6,4,-11,9,-15,14,-17,17
4,2025-03-20 11:05:47.092578,1,standing,83,-80,4,0,25,-31,25,...,1,0,6,-6,11,-12,16,-18,20,-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58117,2025-03-20 12:00:16.666935,3,standing,83,-80,4,0,2,-25,2,...,3,-21,3,-26,3,-29,3,-32,2,-33
58118,2025-03-20 12:00:16.699927,3,standing,83,-80,4,0,-5,11,-5,...,-5,9,-6,11,-6,13,-6,14,-6,15
58119,2025-03-20 12:00:16.730964,3,standing,83,-80,4,0,3,23,1,...,2,20,3,24,4,28,5,30,6,31
58120,2025-03-20 12:00:16.762969,3,standing,83,-80,4,0,16,-17,14,...,15,-15,18,-18,20,-20,22,-22,22,-24


In [12]:
df.loc[df.session_id == 1].loc[(df.loc[df.session_id == 1].timestamp - df.loc[df.session_id == 1].timestamp[0]) < pd.Timedelta(minutes = 10)]

Unnamed: 0,timestamp,session_id,motion,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127
0,2025-03-20 11:05:46.960575,1,standing,83,-80,4,0,39,-6,39,...,1,2,9,0,16,-1,24,-3,30,-4
1,2025-03-20 11:05:46.993573,1,standing,83,-80,4,0,-11,-31,-11,...,1,-2,-1,-7,-3,-13,-6,-19,-7,-24
2,2025-03-20 11:05:47.027583,1,standing,83,-80,4,0,-25,-22,-25,...,0,-2,-5,-6,-10,-10,-15,-14,-18,-17
3,2025-03-20 11:05:47.058571,1,standing,83,-80,4,0,-23,24,-22,...,-2,-1,-6,4,-11,9,-15,14,-17,17
4,2025-03-20 11:05:47.092578,1,standing,83,-80,4,0,25,-31,25,...,1,0,6,-6,11,-12,16,-18,20,-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18341,2025-03-20 11:15:46.797798,1,standing,83,-80,4,0,-23,20,-23,...,7,5,2,6,-4,7,-10,9,-14,11
18342,2025-03-20 11:15:46.833801,1,standing,83,-80,4,0,17,21,19,...,7,-8,7,-3,8,2,9,8,10,11
18343,2025-03-20 11:15:46.861797,1,standing,83,-80,4,0,13,18,15,...,11,-4,10,-1,9,3,9,7,10,9
18344,2025-03-20 11:15:46.895797,1,standing,83,-80,4,0,-14,7,-14,...,0,6,-2,6,-4,6,-7,5,-9,6


In [13]:
df.loc[df.session_id == 1]

Unnamed: 0,timestamp,session_id,motion,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127
0,2025-03-20 11:05:46.960575,1,standing,83,-80,4,0,39,-6,39,...,1,2,9,0,16,-1,24,-3,30,-4
1,2025-03-20 11:05:46.993573,1,standing,83,-80,4,0,-11,-31,-11,...,1,-2,-1,-7,-3,-13,-6,-19,-7,-24
2,2025-03-20 11:05:47.027583,1,standing,83,-80,4,0,-25,-22,-25,...,0,-2,-5,-6,-10,-10,-15,-14,-18,-17
3,2025-03-20 11:05:47.058571,1,standing,83,-80,4,0,-23,24,-22,...,-2,-1,-6,4,-11,9,-15,14,-17,17
4,2025-03-20 11:05:47.092578,1,standing,83,-80,4,0,25,-31,25,...,1,0,6,-6,11,-12,16,-18,20,-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21390,2025-03-20 11:17:26.787997,1,standing,83,-80,4,0,-20,-20,-18,...,-11,-20,-14,-23,-17,-25,-19,-26,-21,-27
21391,2025-03-20 11:17:26.822003,1,standing,83,-80,4,0,-28,14,-23,...,-25,4,-29,6,-33,9,-35,11,-37,13
21392,2025-03-20 11:17:26.854053,1,standing,83,-80,4,0,26,-17,22,...,22,-6,27,-9,30,-12,32,-14,34,-16
21393,2025-03-20 11:17:26.888999,1,standing,83,-80,4,0,-24,-19,-21,...,-11,-17,-15,-20,-18,-23,-22,-25,-24,-26


In [17]:
df.drop(np.arange(18346, 21395)).reset_index(drop = True).to_csv("makerspace_data/train.csv")