# Combine CSI and Light Data

In [1]:
import pandas as pd
import numpy as np
import glob
import os

In [2]:
def get_activity_filecount(dir, activity):
    """Count the CSV files in `dir` whose names start with the activity's name.

    Args:
        dir: directory to search (not recursive).
        activity: file-name prefix to match, e.g. 'sit'.

    Returns:
        Number of paths matching `{dir}/{activity}*.csv`.

    Raises:
        Exception: if `dir` does not exist.
    """
    if os.path.exists(dir):
        matches = glob.glob(f"{dir}/{activity}*.csv")
        return len(matches)
    raise Exception(f"Directory {dir} does not exist")

In [3]:
# (CSI directory, light directory) pairs to inspect
act_dirs = [
    ('dataset/train/new_csi_data', 'dataset/train/new_light_data'),
    ('dataset/train/csi_data', 'dataset/train/light_data'),
]
# activity names; each one is used as a file-name prefix
activities = [
    'empty',
    'sit',
    'stand',
    'walk',
]

In [4]:
# Sanity check: for every directory pair and activity, print the number of
# CSI files next to the number of light files so mismatches are easy to spot.
for d_pair in act_dirs:
    csi_path, light_path = d_pair
    for activity in activities:
        n_csi = get_activity_filecount(csi_path, activity)
        n_light = get_activity_filecount(light_path, activity)
        print(d_pair, activity, n_csi, n_light)

('dataset/train/new_csi_data', 'dataset/train/new_light_data') empty 1 1
('dataset/train/new_csi_data', 'dataset/train/new_light_data') sit 15 15
('dataset/train/new_csi_data', 'dataset/train/new_light_data') stand 15 15
('dataset/train/new_csi_data', 'dataset/train/new_light_data') walk 15 15
('dataset/train/csi_data', 'dataset/train/light_data') empty 4 4
('dataset/train/csi_data', 'dataset/train/light_data') sit 9 9
('dataset/train/csi_data', 'dataset/train/light_data') stand 9 9
('dataset/train/csi_data', 'dataset/train/light_data') walk 9 9


In [5]:
def combine_data(fname, csi_dir, lgt_dir, lgt_has_header):
    """Combine the CSI and light data of one recording into a single pandas dataframe.

    Args:
        fname: CSV file name; the same name is read from both directories.
        csi_dir: directory holding the CSI CSV. The file must provide a `time`
            column (epoch seconds) and the columns `_0` .. `_63`.
        lgt_dir: directory holding the light CSV (long format, one row per
            sensor reading).
        lgt_has_header: True if the light CSV has its own header row; if False
            the four column names are assigned here.

    Returns:
        DataFrame with one row per CSI sample whose shifted, 100 ms-floored
        timestamp equals a light timestamp (inner join). Columns `_0` .. `_63`
        hold magnitudes of the complex CSI values and `sensor_1` .. `sensor_9`
        hold the light readings.

    Raises:
        KeyError: if an expected CSI or light column is missing.
    """
    # Read the CSI data and drop only columns that are *entirely* null.
    # dropna's default (how='any') would also discard a column that has a single
    # missing sample, which removes a whole `_k` column and makes the magnitude
    # loop below fail with KeyError. Partially missing values are kept as NaN.
    csi_df = pd.read_csv(f"{csi_dir}/{fname}").dropna(axis=1, how='all')
    if lgt_has_header:
        lgt_df = pd.read_csv(f"{lgt_dir}/{fname}")
    else:
        lgt_df = pd.read_csv(f"{lgt_dir}/{fname}", header=None, names=['msg_received_datetime', 'msg_published_datetime', 'sensor_id', 'sensor_val'])

    # parse to pandas datetime object and floor to closest 100 millisecond
    csi_df['time'] = pd.to_datetime(csi_df['time'], unit='s', utc=True).dt.tz_convert('Asia/Kuala_Lumpur').dt.floor('100ms')
    lgt_df['msg_received_datetime'] = pd.to_datetime(lgt_df['msg_received_datetime'], format='%Y-%m-%d %H:%M:%S').dt.tz_localize('Asia/Kuala_Lumpur').dt.floor('100ms')
    lgt_df['msg_published_datetime'] = pd.to_datetime(lgt_df['msg_published_datetime'], format='%Y-%m-%d %H:%M:%S.%f').dt.tz_localize('Asia/Kuala_Lumpur').dt.floor('100ms')

    # match csi start time to light start time due to issue in Raspberry Pi:
    # shift every CSI timestamp so the earliest one coincides with the earliest
    # light timestamp
    csi_min_time = csi_df['time'].min()
    lgt_min_time = lgt_df['msg_published_datetime'].min()
    csi_df['time'] = csi_df['time'] + (lgt_min_time - csi_min_time)

    # convert complex numbers to absolute values (a NaN sample stays NaN)
    for idx in range(0, 64):
        csi_col = f'_{idx}'
        csi_df[csi_col] = csi_df[csi_col].apply(lambda x: np.abs(np.complex128(x)))

    # convert light data from long to wide format: one row per timestamp, one
    # column per sensor; readings sharing a timestamp and sensor are averaged
    LIGHT_COLNAMES = ['sensor_' + str(i) for i in range(1, 10)]
    lgt_wide_df = lgt_df.pivot_table(index="msg_published_datetime", columns="sensor_id", values="sensor_val", aggfunc="mean").add_prefix("sensor_").reset_index()
    # fill null values with previous data, then subsequent data if needed
    lgt_wide_df.loc[:, LIGHT_COLNAMES] = lgt_wide_df.loc[:, LIGHT_COLNAMES].ffill().bfill()

    # merge csi and light data on timestamp
    merged_df = csi_df.merge(lgt_wide_df, how='inner', left_on='time', right_on='msg_published_datetime')
    return merged_df

In [6]:
def batch_rows(df, rows_per_group):
    """Cut `df` into consecutive, non-overlapping groups of `rows_per_group` rows.

    Only the 64 CSI columns (`_0` .. `_63`) followed by the 9 light columns
    (`sensor_1` .. `sensor_9`) are kept. Trailing rows that do not fill a
    complete group are discarded, and any group containing a NaN is skipped.

    Args:
        df: dataframe containing at least the CSI and light columns.
        rows_per_group: number of rows in each group.

    Returns:
        List of 2-D numpy arrays, one per retained group, in row order.
    """
    feature_cols = [f'_{idx}' for idx in range(0, 64)] + ['sensor_' + str(i) for i in range(1, 10)]

    batches = []
    for group_idx in range(df.shape[0] // rows_per_group):
        start = group_idx * rows_per_group
        block = df.iloc[start:start + rows_per_group][feature_cols].values
        # np.min propagates NaN, so a NaN minimum means the group has a gap
        if np.isnan(np.min(block)):
            continue
        batches.append(block)
    return batches

In [7]:
def combine_csi_light_in_dir(csi_dir, light_dir, output_dir, lgt_has_header=True):
    """Combine CSI and light data in separate directories and output to specified directory.

    For each activity, the files `{activity}_1.csv` .. `{activity}_N.csv` are
    processed, where N is the number of that activity's CSV files in `csi_dir`.
    Each file is combined with the light file of the same name, cut into
    100-row groups, and every group is written as its own headerless CSV to
    `{output_dir}/{activity}_{k}.csv`. `k` starts at the number of that
    activity's CSV files already present in `output_dir`, so numbering carries
    on across calls that share an output directory.

    Args:
        csi_dir: directory containing the CSI CSV files.
        light_dir: directory containing the light CSV files.
        output_dir: directory for the combined output; created if missing.
        lgt_has_header: whether the light CSV files have a header row.

    Raises:
        Exception: if `csi_dir` or `light_dir` does not exist.
    """
    activities = ['empty', 'sit', 'stand', 'walk']

    # both input directories must exist before anything is written
    if not os.path.exists(csi_dir):
        raise Exception(f"CSI directory {csi_dir} does not exist")
    if not os.path.exists(light_dir):
        raise Exception(f"Light directory {light_dir} does not exist")

    # create output directory if it does not exist (exist_ok avoids the
    # check-then-create race of a separate os.path.exists test)
    os.makedirs(output_dir, exist_ok=True)

    for activity in activities:
        counter = get_activity_filecount(output_dir, activity)
        total_files = get_activity_filecount(csi_dir, activity)
        print(f"Processing {activity} files. Total files: {total_files}")
        for i in range(1, total_files+1):
            fname = f'{activity}_{i}.csv'
            try:
                res = combine_data(fname, csi_dir, light_dir, lgt_has_header)
            except Exception as e:
                # best effort: one bad file must not stop the run, but say
                # which file failed and why (a bare KeyError prints only the key)
                print(f"Skipping {fname}: {type(e).__name__}: {e}")
                continue
            batched_res = batch_rows(res, 100)
            for batch in batched_res:
                np.savetxt(f"{output_dir}/{activity}_{counter}.csv", batch, delimiter=",")
                counter += 1
            print(f"dirs: {(csi_dir, light_dir)}, file name: {fname}, file shape: {res.shape}, total output files: {len(batched_res)}")

Process the data from the first pair of directories.

In [8]:
# light CSVs in this pair are read with their own header row
combine_csi_light_in_dir(
    csi_dir='dataset/train/new_csi_data',
    light_dir='dataset/train/new_light_data',
    output_dir='dataset/example_train_combined',
    lgt_has_header=True,
)

Processing empty files. Total files: 1
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: empty_1.csv, file shape: (2167, 76), total output files: 21
Processing sit files. Total files: 15
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: sit_1.csv, file shape: (328, 76), total output files: 3
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: sit_2.csv, file shape: (330, 76), total output files: 3
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: sit_3.csv, file shape: (324, 76), total output files: 3
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: sit_4.csv, file shape: (342, 76), total output files: 3
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: sit_5.csv, file shape: (326, 76), total output files: 3
dirs: ('dataset/train/new_csi_data', 'dataset/train/new_light_data'), file name: sit_6.csv, file 

Process the data from the second pair of directories.

In [10]:
# light CSVs in this pair are read without a header row; column names are supplied by combine_data
combine_csi_light_in_dir(
    csi_dir='dataset/train/csi_data',
    light_dir='dataset/train/light_data',
    output_dir='dataset/example_train_combined',
    lgt_has_header=False,
)

Processing empty files. Total files: 4
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: empty_1.csv, file shape: (272, 76), total output files: 2
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: empty_2.csv, file shape: (272, 76), total output files: 2
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: empty_3.csv, file shape: (269, 76), total output files: 2
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: empty_4.csv, file shape: (273, 76), total output files: 2
Processing sit files. Total files: 9
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: sit_1.csv, file shape: (267, 76), total output files: 2
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: sit_2.csv, file shape: (270, 76), total output files: 2
dirs: ('dataset/train/csi_data', 'dataset/train/light_data'), file name: sit_3.csv, file shape: (271, 76), total output files: 2
dirs: ('datas