In [1]:
# Read the binary logger files (dataLog_id_XXXX_nr_YYYY.TXT) and export them per logger id

import json
import os
import pprint
import re
import struct
import wave
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

pp = pprint.PrettyPrinter(indent=4)

In [2]:

def get_values_from_bin_file(bin_file_pointer, nr_bytes, format): 
    """Read ``nr_bytes`` from a binary stream and unpack them as one value.

    Parameters
    ----------
    bin_file_pointer : binary file-like object
        Stream positioned at the value to read; it is advanced by the read.
    nr_bytes : int
        Number of bytes to read.
    format : str
        ``struct`` format string describing exactly one value (e.g. ``'>H'``).

    Returns
    -------
    tuple
        ``(value, bin_file_pointer)``. ``value`` is ``False`` when the stream
        is exhausted or ends in the middle of the value, so callers must test
        it with ``is False`` (a legitimate reading can be ``0``).
    """
    raw_bytes = bin_file_pointer.read(nr_bytes)
    # A short read means the file ended, possibly in the middle of a value
    # (truncated log). Report it as end-of-data instead of letting
    # struct.unpack raise on the incomplete buffer.
    if not raw_bytes or len(raw_bytes) < nr_bytes:
        return False,bin_file_pointer
    value, = struct.unpack(format, raw_bytes)

    return value,bin_file_pointer

def get_params_from_bin_file(bin_file_pointer,nr_sources,sources_names):
    """Read the per-source acquisition parameters stored after the header.

    For every name in ``sources_names`` four consecutive big-endian unsigned
    16-bit values are read, in this order: set sampling frequency,
    sensitivity (stored value divided by 1000), scale and sample group size.

    ``nr_sources`` is accepted but not used; the number of parameter groups
    read is ``len(sources_names)``.

    Returns ``(sources_dic, bin_file_pointer)`` where ``sources_dic`` maps each
    source name to a dict with the keys ``'freq_set'``, ``'sensitivity'``,
    ``'sample_group_nr'`` and ``'scale'``.
    """

    def _next_uint16():
        # every parameter is stored as one big-endian unsigned 16-bit word
        word, _ = get_values_from_bin_file(bin_file_pointer, 2, '>H')
        return word

    params_by_source = {}
    for name in sources_names:
        # the read order below is the on-disk order and must not change
        freq_set = _next_uint16()
        sensitivity = _next_uint16() / 1000
        scale = _next_uint16()
        sample_group_nr = _next_uint16()
        params_by_source[name] = {
            'freq_set': freq_set,
            'sensitivity': sensitivity,
            'sample_group_nr': sample_group_nr,
            'scale': scale,
        }

    return params_by_source,bin_file_pointer

def get_logger_file_nr_from_bin_file(filename: str):
    """Extract the logger id and the file number from a logger file name.

    The name is expected to contain ``id_<digits>`` and ``nr_<digits>``
    (e.g. ``dataLog_id_0011_nr_0003.TXT``). The first occurrence of each tag
    that is followed by digits is used, and all consecutive digits are read,
    so numbers wider than four digits are not cut short.

    Returns
    -------
    list
        ``[logger_id, logger_file_nr]`` as Python ints.

    Raises
    ------
    ValueError
        If one of the two tags followed by a number is missing.
    """

    def _number_after(tag):
        match = re.search(tag + r'(\d+)', filename)
        if match is None:
            raise ValueError(f"'{tag}<number>' not found in file name {filename!r}")
        return int(match.group(1))

    return [_number_after('id_'), _number_after('nr_')]

def get_header_from_bin_file(bin_file_pointer):
    """Read the comma-separated text header that ends with ``/&/``.

    Bytes are consumed one at a time until the terminator ``/&/`` has been
    read; the stream is left positioned right after it. The terminator is
    not part of the result. If the stream ends before the terminator is
    found, ``'End of file reached'`` is printed and whatever was read so far
    is used as the header.

    Returns
    -------
    tuple
        ``(header, bin_file_pointer)`` where ``header`` is the list of fields
        obtained by splitting the decoded (UTF-8) header text on ``','``.
    """
    terminator = b'/&/'
    raw_header = bytearray()
    while True:
        next_byte = bin_file_pointer.read(1)
        if not next_byte:
            print('End of file reached')
            break
        raw_header += next_byte
        # Matching on the tail of the buffer keeps every byte that was read:
        # a '/' or '/&' that does not complete the terminator stays in the header.
        if raw_header.endswith(terminator):
            del raw_header[-len(terminator):]
            break

    # decode once at the end so multi-byte UTF-8 characters are handled correctly,
    # then split the header with ',' into a list of strings
    header = raw_header.decode('utf-8').split(',')

    return header,bin_file_pointer

def create_logger_id_parameters_dict(logger_id : int,sources_names : list[str], sources_units_dict: dict, curr_file_sources_dic: dict):
    """Build the initial (all counters at zero) parameter dictionary of one logger.

    Parameters
    ----------
    logger_id : int
        Key under which the whole parameter tree is stored.
    sources_names : list[str]
        Source names; one entry is created per name under ``'sources'``.
    sources_units_dict : dict
        Maps a name fragment to a unit string. A source gets a ``'value_unit'``
        only if at least one fragment is contained in its name; when several
        fragments match, the last one in the dict wins.
    curr_file_sources_dic : dict
        Per-source parameters; ``'freq_set'`` and ``'scale'`` are read from it.

    Returns
    -------
    dict
        ``{logger_id: {...}}`` with the entries ``'labels'`` (labels 0, 1, 2),
        ``'nr_files'``, ``'total_measured_time'`` and ``'sources'``.
    """

    def _unit_entry(source_name):
        # {} when no unit fragment matches, otherwise {'value_unit': <unit>}
        entry = {}
        for unit_key in sources_units_dict.keys():
            if unit_key in source_name:
                entry["value_unit"] = sources_units_dict[unit_key]
        return entry

    def _parameter(kind, value_type, value_unit, value, description):
        # common layout shared by dataset-level and source-level parameters
        return {
            'dictionary_type': kind,
            'value_type': value_type,
            'value_unit': value_unit,
            'value': value,
            'description': description,
        }

    def _label_entry(label_nr):
        return {
            'dictionary_type': 'label',
            'value_type': 'dictionary-of-label',
            'value_unit': '',
            'description': f'Label-{label_nr}',
            'nr_files': 0,
        }

    def _source_entry(source_name):
        file_params = curr_file_sources_dic
        entry = {
            'dictionary_type': 'source',
            'value_type': 'dictionary-of-source parameters',
        }
        entry.update(_unit_entry(source_name))
        entry['sampling_frequency_set'] = _parameter(
            'source_parameter', 'int', '[Hz]',
            file_params[source_name]['freq_set'],
            'sampling-frequency-set-in-the-logger',
        )
        entry['sampling_frequency_real'] = _parameter(
            'source_parameter', 'int', '[Hz]', 0,
            'real-sampling-frequency-of-the-source, number-of-samples-per-second',
        )
        entry['samples_nr'] = _parameter(
            'source_parameter', 'int', '', 0, 'number-of-samples',
        )
        entry['samples_nr_bytes'] = _parameter(
            'source_parameter', 'int', 'bytes', -1, 'number-of-bytes-of-each-sample',
        )
        # the scale carries the unit of its source (if any) instead of a fixed one
        scale_entry = {
            'dictionary_type': 'source_parameter',
            'value_type': 'int',
        }
        scale_entry.update(_unit_entry(source_name))
        scale_entry['value'] = file_params[source_name]['scale']
        scale_entry['description'] = 'scale-of-the-source'
        entry['scale'] = scale_entry
        return entry

    logger_entry = {
        'dictionary_type': 'dataset-from-logger_id',
        'value_type': 'dictionary',
        'labels': _parameter(
            'dataset_parameter', 'dictionary-of-the-labels', '',
            {label_nr: _label_entry(label_nr) for label_nr in range(3)},
            'dataset-labels',
        ),
        'nr_files': _parameter(
            'dataset_parameter', 'int', '', 0, 'number-of-files',
        ),
        'total_measured_time': _parameter(
            'dataset_parameter', 'int', 'seconds', 0,
            'total-measured-time-of-the-dataset',
        ),
        # the sources container has no 'description' entry
        'sources': {
            'dictionary_type': 'dataset_parameter',
            'value_type': 'dictionary-of-dictionaries-sources',
            'value_unit': '',
            'value': {name: _source_entry(name) for name in sources_names},
        },
    }

    return {logger_id: logger_entry}

def remove_chars_from_string(filename: str,char_to_remove: list[str]):
    """Return ``filename`` with every occurrence of each listed substring removed.

    The entries of ``char_to_remove`` are removed one after another, in order,
    so a later entry also acts on text that only appears after an earlier
    removal.
    """
    cleaned = filename
    for unwanted in char_to_remove:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def sort_list(list1, list2):
    """Return the items of ``list1`` ordered by ascending values of ``list2``.

    The two lists are paired position by position (extra items of the longer
    list are ignored). Pairs with equal keys are ordered by the ``list1`` item
    itself, so those items must be comparable with each other.
    """
    keyed_items = list(zip(list2, list1))
    keyed_items.sort()
    return [item for _key, item in keyed_items]

def list2wave(readings_list,wave_wave_file_path_and_name,nr_channels,freq,sample_size):
    """Write a sequence of readings to a WAV file.

    Parameters
    ----------
    readings_list : sequence of numbers
        Samples to write, one value per frame.
    wave_wave_file_path_and_name : str
        Path of the WAV file to create.
    nr_channels : int
        Number of channels stored in the WAV header.
    freq : int
        Frame rate stored in the WAV header.
    sample_size : int
        Bytes per sample: ``2`` packs each reading as a little-endian signed
        16-bit integer, ``4`` packs it as a little-endian 32-bit float.

    Any other ``sample_size`` prints a warning and returns without creating
    the file.
    """
    # The parameter used to be overwritten with 2 here, which made the
    # 4-byte branch unreachable; the requested size is now honoured and
    # validated before the output file is touched.
    if sample_size not in (2, 4):
        print("Warning! Sample size not supported to make correct Wave files! See list2wave function")
        return

    np_readings_list=np.array(readings_list)

    # Pack everything first so a bad sample cannot leave a half-written file.
    if sample_size==2:
        frames = b''.join(struct.pack('<h', sample) for sample in np_readings_list)
    else:
        # NOTE(review): the wave module writes a PCM header, so these 4-byte
        # values are stored as raw float bits; confirm the consumer of the
        # file expects float data at this width.
        frames = b''.join(struct.pack('<f', np.float32(float(sample))) for sample in np_readings_list)

    # the context manager closes the file even if writing fails
    with wave.open(wave_wave_file_path_and_name,'w') as obj:
        obj.setnchannels(nr_channels)
        obj.setsampwidth(sample_size)
        obj.setframerate(freq)
        obj.writeframes(frames)

In [3]:
# Folder that holds the raw logger files (*.TXT); edit this one line to process another dataset.
RAW_DATA_DIR = "C:/Users/ricar/OneDrive - Universidade de Lisboa/ObsidianVaults/KeeperOfKnowledge_ObsVault/1_University/1_1_Thesis/1_1_10_Datasets/CNC_iStartLab_Main/CAMs_Gcode_Data/6666Hz/Facing0_0010/Raw"

# one parameter tree per logger id, filled while the files are read
dataset_parameters_dict={}

# unit of a source, selected by the fragment contained in the source name
sources_units_dict={'acc':'[mg]','mic':'[dB]'}

# one dataframe per successfully read file
main_dfs_list=[]

dataset_sources_names=[]
dataset_sources_flags_names=[]

# Layout of a logger file, as read below:
#   text header terminated by '/&/'  -> column names (the first three are not data sources)
#   per source: four uint16 words    -> see get_params_from_bin_file
#   rows until end of file: uint32 timestamp, uint8 label, uint8 source flags,
#   then the int16 samples of the flagged sources, interleaved by sample number
for dirname, _, filenames in os.walk(RAW_DATA_DIR):
    txt_files = [f for f in filenames if f.endswith('.TXT')]
    print(txt_files)
    for filename in txt_files:
        print(filename)

        logger_ids = get_logger_file_nr_from_bin_file(filename)

        curr_file_logger_id = logger_ids[0]
        curr_file_logger_file_nr = logger_ids[1]

        # 'with' guarantees the file is closed even if parsing fails half-way
        with open(os.path.join(dirname, filename), "rb") as bin_file_pointer:

            #get header from bin file
            curr_file_header,_ = get_header_from_bin_file(bin_file_pointer)
            #get sources from header (first element is 'timestamp', second is the button , third is flags source) the rest the source data
            curr_file_sources_names = curr_file_header[3:]
            curr_file_nr_sources = len(curr_file_sources_names)

            curr_file_sources_dic,_ = get_params_from_bin_file(bin_file_pointer,curr_file_nr_sources,curr_file_sources_names)

            # get the maximum sample group number from all the sources (0 when there are no sources)
            max_sample_group_nr = max([0] + [curr_file_sources_dic[source]['sample_group_nr'] for source in curr_file_sources_names])

            curr_file_sources_flags_names = ['flag_' + source for source in curr_file_sources_names]

            # column order of the dataframe: timestamp, logger_id, file_nr, label, flags, sources
            columns_list = ['timestamp', 'logger_id','logger_file_nr','label']
            columns_list.extend(curr_file_sources_flags_names)
            columns_list.extend(curr_file_sources_names)

            dataset_sources_names.extend(curr_file_sources_names)
            dataset_sources_flags_names.extend(curr_file_sources_flags_names)
            # number of real (non-NaN) samples read per source in this file
            curr_file_sources_counter = {source: 0 for source in curr_file_sources_names}

            # readings of each source, one entry per dataframe line
            curr_file_sources_vals_dict = {
                source: [] for source in curr_file_sources_names
            }

            # values that are repeated on every dataframe line of a row
            curr_file_repeated_params_dict = {
                flag_name: [] for flag_name in curr_file_sources_flags_names
            }
            curr_file_repeated_params_dict['timestamp'] = []
            curr_file_repeated_params_dict['logger_id'] = []
            curr_file_repeated_params_dict['logger_file_nr'] = []
            curr_file_repeated_params_dict['label'] = []

            while True:

                curr_row_timestamp,_ = get_values_from_bin_file(bin_file_pointer, 4, '>L')
                if curr_row_timestamp is False:
                    break

                curr_row_label,_ = get_values_from_bin_file(bin_file_pointer, 1, '>B')

                #----------------------dataset_sensor_flags
                curr_row_flags_byte,_ = get_values_from_bin_file(bin_file_pointer, 1, '>B')
                if curr_row_label is False or curr_row_flags_byte is False:
                    print('Warning: ' + filename + ' ends inside a row, incomplete row discarded')
                    break
                # bit i of the flags byte tells whether source i has samples in this row
                curr_row_flags = [bool(curr_row_flags_byte >> i & 1) for i in range(curr_file_nr_sources)]

                # The samples of the row are collected in temporary buffers and only
                # committed once the whole row was read, so a file that ends inside
                # a row does not add zero-filled readings.
                curr_row_vals = {source: [] for source in curr_file_sources_names}
                # counter of samples for each source
                sample_group_nr = {source: 0 for source in curr_file_sources_names}
                row_truncated = False

                for sample_nr in range(max_sample_group_nr):
                    for i,source in enumerate(curr_file_sources_names):
                        if (curr_row_flags[i] and sample_group_nr[source] < curr_file_sources_dic[source]['sample_group_nr']):
                            aux_sources_vals,_ = get_values_from_bin_file(bin_file_pointer, 2, '>h')
                            if aux_sources_vals is False:
                                row_truncated = True
                                break
                            curr_row_vals[source].append(round(aux_sources_vals*curr_file_sources_dic[source]['sensitivity'],4))
                            # increment sample_group_nr for the source
                            sample_group_nr[source] += 1
                        else:
                            # Pad with NaN so every source keeps one entry per line;
                            # assigning NaN here would replace the list and break
                            # the next append.
                            curr_row_vals[source].append(np.nan)
                    if row_truncated:
                        break

                if row_truncated:
                    print('Warning: ' + filename + ' ends inside a row, incomplete row discarded')
                    break

                # commit the complete row: max_sample_group_nr lines per row
                for i,flag_name in enumerate(curr_file_sources_flags_names):
                    curr_file_repeated_params_dict[flag_name].extend([curr_row_flags[i]] * max_sample_group_nr)
                curr_file_repeated_params_dict['timestamp'].extend([curr_row_timestamp] * max_sample_group_nr)
                curr_file_repeated_params_dict['logger_id'].extend([curr_file_logger_id] * max_sample_group_nr)
                curr_file_repeated_params_dict['logger_file_nr'].extend([curr_file_logger_file_nr] * max_sample_group_nr)
                curr_file_repeated_params_dict['label'].extend([curr_row_label] * max_sample_group_nr)
                for source in curr_file_sources_names:
                    curr_file_sources_vals_dict[source].extend(curr_row_vals[source])
                    curr_file_sources_counter[source] += sample_group_nr[source]

        # build the dataframe in one go, in the documented column order
        df = pd.DataFrame({**curr_file_repeated_params_dict, **curr_file_sources_vals_dict}, columns=columns_list)
        if df.empty:
            print('Warning: no complete rows in ' + filename + ', file skipped')
            continue
        # print first 5 rows of dataframe
        print(df.head())

        unique_file_labels = pd.unique(df['label'])
        # timestamps are divided by 1000 to obtain seconds (presumably stored in milliseconds - TODO confirm)
        time_curr_file = float(df['timestamp'].iloc[-1]-df['timestamp'].iloc[0])/1000
        main_dfs_list.append(df)

        # check if logger_id is already in dataset_parameters_dict
        if curr_file_logger_id not in dataset_parameters_dict:
            aux_dataset_parameters_dict = create_logger_id_parameters_dict(curr_file_logger_id,curr_file_sources_names,sources_units_dict,curr_file_sources_dic)
            dataset_parameters_dict.update(aux_dataset_parameters_dict)

        # update dataset_parameters_dict
        curr_logger_params = dataset_parameters_dict[curr_file_logger_id]
        curr_logger_labels = curr_logger_params['labels']['value']
        for label in unique_file_labels:
            # plain int keys: numpy integers are not accepted as keys by json.dump
            label = int(label)
            # create new label key if it doesn't exist in dataset_parameters_dict
            if label not in curr_logger_labels:
                curr_logger_labels[label] = { 'nr_files': 0 }
            curr_logger_labels[label]['nr_files'] += 1
        curr_logger_params['nr_files']['value'] += 1
        curr_logger_params['total_measured_time']['value'] += time_curr_file
        if time_curr_file <= 0:
            print('Warning: ' + filename + ' spans no measurable time, real sampling frequency counted as 0')
        for source in curr_file_sources_names:
            curr_source_params = curr_logger_params['sources']['value'][source]
            # the per-file frequencies are summed here and averaged after the loop
            curr_file_freq_real = round(curr_file_sources_counter[source]/time_curr_file,4) if time_curr_file > 0 else 0.0
            curr_source_params['sampling_frequency_real']['value'] += curr_file_freq_real
            curr_source_params['samples_nr']['value'] += curr_file_sources_counter[source]


if not main_dfs_list:
    raise ValueError('No logger data could be read from ' + RAW_DATA_DIR)

# divide values of dataset_parameters_dict by nr_files for all the logger_ids
for logger_id in dataset_parameters_dict:
    logger_nr_files = dataset_parameters_dict[logger_id]['nr_files']['value']
    for source_params in dataset_parameters_dict[logger_id]['sources']['value'].values():
        source_params['sampling_frequency_real']['value'] = source_params['sampling_frequency_real']['value']/logger_nr_files

# concatenate all the datasets and reset the index
main_df = pd.concat(main_dfs_list).reset_index().rename(columns={'index':'sample_index'})

# drop stray 'Unnamed...' columns (the selection used to be computed and thrown away)
main_df = main_df.loc[:,~main_df.columns.str.match("Unnamed")]

# blank strings count as missing values
main_df=main_df.replace(r'^\s*$', np.nan, regex=True)

# remove columns: sample_index, logger_file_nr, timestamp, label (logger_id is still needed below)
main_df = main_df.drop(columns=['sample_index','logger_file_nr','timestamp','label'])
# remove columns that start with 'flag_'
main_df = main_df.loc[:,~main_df.columns.str.match("flag_")]

# get the logger_id unique values
logger_ids = pd.unique(main_df['logger_id'])
# for each logger id save a csv file and a json file
for logger_id in logger_ids:
    logger_id = int(logger_id)
    # get the dataset_parameters_dict for the current logger_id
    logger_id_dataset_parameters_dict = dataset_parameters_dict[logger_id]
    # save the dataset_parameters_dict as a json file
    with open('label_'+str(logger_id)+'.json', 'w') as fp:
        json.dump(logger_id_dataset_parameters_dict, fp, indent=4)
    # get the sources names for the current logger_id
    logger_id_sources_names = list(logger_id_dataset_parameters_dict['sources']['value'].keys())
    # get the dataframe with the current logger_id
    logger_id_df = main_df[main_df['logger_id'] == logger_id]
    logger_id_df.to_csv('label_'+str(logger_id)+'.csv',index=False, columns = logger_id_sources_names)


#melted_dataset_df = main_df.melt(id_vars=id_vars,value_vars=value_vars,var_name='source',value_name='readings')

['dataLog_id_0011_nr_0000.TXT', 'dataLog_id_0011_nr_0001.TXT', 'dataLog_id_0011_nr_0002.TXT', 'dataLog_id_0011_nr_0003.TXT', 'dataLog_id_0011_nr_0004.TXT', 'dataLog_id_0011_nr_0005.TXT', 'dataLog_id_0011_nr_0006.TXT', 'dataLog_id_0011_nr_0007.TXT', 'dataLog_id_0012_nr_0000.TXT', 'dataLog_id_0012_nr_0001.TXT', 'dataLog_id_0012_nr_0002.TXT', 'dataLog_id_0012_nr_0003.TXT', 'dataLog_id_0012_nr_0004.TXT', 'dataLog_id_0012_nr_0005.TXT', 'dataLog_id_0012_nr_0006.TXT', 'dataLog_id_0020_nr_0000.TXT', 'dataLog_id_0020_nr_0001.TXT', 'dataLog_id_0020_nr_0002.TXT', 'dataLog_id_0020_nr_0003.TXT', 'dataLog_id_0020_nr_0004.TXT', 'dataLog_id_0020_nr_0005.TXT', 'dataLog_id_0020_nr_0006.TXT', 'dataLog_id_0020_nr_0007.TXT', 'dataLog_id_0020_nr_0008.TXT', 'dataLog_id_0020_nr_0009.TXT', 'dataLog_id_0020_nr_0010.TXT', 'dataLog_id_0020_nr_0011.TXT', 'dataLog_id_0020_nr_0012.TXT', 'dataLog_id_0020_nr_0013.TXT', 'dataLog_id_0020_nr_0014.TXT', 'dataLog_id_0020_nr_0015.TXT', 'dataLog_id_0020_nr_0016.TXT', 'dataLo

In [None]:
# Show only the first rows: printing the whole frame floods the notebook, and
# to_markdown() additionally needs the optional 'tabulate' package.
main_df.head()

In [10]:
# pretty-print the parameter tree of every logger with a 4-space indent
pprint.pprint(dataset_parameters_dict, indent=4)

{   11: {   'dictionary_type': 'dataset-from-logger_id',
            'labels': {   'description': 'dataset-labels',
                          'dictionary_type': 'dataset_parameter',
                          'value': {   0: {   'description': 'Label-0',
                                              'dictionary_type': 'label',
                                              'nr_files': 8,
                                              'value_type': 'dictionary-of-label',
                                              'value_unit': ''},
                                       1: {   'description': 'Label-1',
                                              'dictionary_type': 'label',
                                              'nr_files': 0,
                                              'value_type': 'dictionary-of-label',
                                              'value_unit': ''},
                                       2: {   'description': 'Label-2',
                                    

In [6]:
# safety net: make sure the last opened logger file is closed
if not bin_file_pointer.closed:
    bin_file_pointer.close()
