In [1]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm
from datetime import datetime
import wave
import scipy.signal

from conflab.constants import (
    raw_audio_path,
    raw_wearables_path
)
from utils import (
    get_audio_chunks, 
    get_accel_chunks, 
    AudioFile
)

In [2]:
# folder with accel files only from midge data
# midge_accel_path = '/mnt/e/data/conflab/accel/midge'
# midge_accel = os.listdir(midge_accel_path)

## Generate correspondance files

The following cell will generate files containing a log of the missing data segments in both accel and audio raw files.

Because the device malfunction affected both streams, the holes can be associated to synchronize the audio data using the timestamps in the accel data at start / end times of these holes.

In [5]:
correspondances_path = os.path.join(raw_audio_path, 'sync_files')
if not os.path.exists(correspondances_path):
    os.mkdir(correspondances_path)

In [4]:
midge_raw_data_path = os.path.join(raw_wearables_path, 'raw')
for midge_num in os.listdir(midge_raw_data_path):
    exp_folder = os.path.join(correspondances_path, midge_num)
    if not os.path.exists(exp_folder):
        os.mkdir(exp_folder)

    data_folder = os.listdir(os.path.join(midge_raw_data_path, midge_num))[-1]
    midge_files = os.listdir(os.path.join(midge_raw_data_path, midge_num, data_folder))
    audio_fname = [f for f in midge_files if f.endswith('_audio_1')][-1]
    audio_fname = os.path.join(midge_raw_data_path, midge_num, data_folder, audio_fname)

    accel_fname = [f for f in midge_files if f.endswith('_accel.csv')][-1]
    accel_fname = os.path.join(midge_raw_data_path, midge_num, data_folder, accel_fname)
    # accel_fname = list(filter(lambda f: f.startswith(f'{midge_num}_'), midge_accel))[-1]
    # accel_fname = os.path.join(midge_accel_path, accel_fname)

    # audio holes
    audio_file = AudioFile(audio_fname)
    good_chunks, repeat_chunks = audio_file.get_repeats()
    # good_chunks, repeat_chunks = get_audio_chunks(audio_fname)
    first = np.array([chunk['first'] for chunk in repeat_chunks])
    last = np.array([chunk['last'] for chunk in repeat_chunks])
    tfirst = np.array([chunk['tfirst'] for chunk in repeat_chunks])
    tlast = np.array([chunk['tlast'] for chunk in repeat_chunks])
    df = pd.DataFrame()
    df['first'] = first
    df['last'] = last
    df['len'] = (last - first)
    df['tfirst'] = tfirst
    df['tlast'] = tlast
    df.to_csv(os.path.join(exp_folder, 'audio_holes.csv'))

    # accel holes
    accel_holes = get_accel_chunks(accel_fname)
    accel_holes.to_csv(os.path.join(exp_folder, 'accel_holes.csv'))

    # do the matching
    corr = df.copy()
    corr['accel_first'] = None
    corr['accel_last'] = None

    af = df['tfirst'].iat[0]
    al = df['tlast'].iat[-1]
    xf = accel_holes['time_first'].iat[0]
    xl = accel_holes['time_last'].iat[-1]

    # print((af, al, xf, xl))

    for i, ah in accel_holes.iterrows():
        
        aft = af + (al - af) * (ah['time_first'] - xf) / (xl - xf)
        alt = af + (al - af) * (ah['time_last'] - xf) / (xl - xf)

        af_idx = corr['tfirst'].sub(aft).abs().idxmin()
        al_idx = corr['tfirst'].sub(alt).abs().idxmin()

        corr['accel_first'][af_idx] = ah['time_first']
        corr['accel_last'][al_idx] = ah['time_last']

    corr.to_csv(os.path.join(exp_folder, 'correspondance.csv'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr['accel_first'][af_idx] = ah['time_first']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr['accel_last'][al_idx] = ah['time_last']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr['accel_first'][af_idx] = ah['time_first']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr['accel_last'][al_idx] = 

KeyboardInterrupt: 

# Correction code

After manual correction, the correspondance files can be used to generate corrected audio files

In [6]:
def get_annot_chunks(correspondances, af):

    
    good_chunks, repeat_chunks = af.get_repeats()

    i = 0
    annotated_good_chunks = []
    while i < len(correspondances)-1:
        # if not np.isnan(correspondances.loc[i, 'accel_last']):
        while np.isnan(correspondances.loc[i, 'accel_last']) and i < len(correspondances) - 2:
            i += 1

        

        accel_first = correspondances.loc[i, 'accel_last']
        chunks = []
        i += 1
        if i == len(correspondances) - 1:
            break
        
        chunks.append(good_chunks[i])

        while np.isnan(correspondances.loc[i, 'accel_first']) and i < len(correspondances) - 1:
            i += 1
            chunks.append(good_chunks[i])

        accel_last = correspondances.loc[i, 'accel_first']

        # if len(chunks) >= 2:
        annotated_good_chunks.append({
            'audio_first': chunks[0]['tfirst'],
            'audio_last': chunks[-1]['tlast'],
            'audio_first_sample': chunks[0]['first'],
            'audio_last_sample': chunks[-1]['last'],
            'accel_first': accel_first,
            'accel_last': accel_last,
            'chunks': chunks
        })
    return annotated_good_chunks

In [7]:
def write_samples(fout, samples, num_samples):
    bytes_samples = b''.join(samples)
    samples = [int.from_bytes(bytes_samples[i:i + 2], 'little', signed=True)
               for i in range(0, len(bytes_samples), 2)]
    signal = scipy.signal.resample(samples, num_samples)
    
    quantized = list()
    for el in signal:
        # −32,768 to 32,767
        int_el = max(-32768, min(32767, int(el)))
        int_bytes = int_el.to_bytes(2, 'little', signed=True)
        quantized.append(int_bytes)
    
    quant_bytes = b''.join(quantized)
    fout.writeframesraw(quant_bytes)

    return num_samples

In [8]:
def write_blank_until(fout, num_written, sec):
    num_to_write = int(sec * 1250) - num_written
    assert num_to_write > 0, f'num_to_write is {num_to_write}'
    fout.writeframesraw(bytearray(num_to_write * 2))
    return num_to_write # num of samples written


In [9]:
def correct_subject_audio(af, fout_name, chunks):
    with wave.open(fout_name, 'wb') as fout:
        fout.setnchannels(1)
        fout.setsampwidth(2)
        fout.setframerate(1250)

        prev_intval = None
        num_written = 0
        for chunk in chunks:
            print(
                f'{"*"*20} \
accel: {chunk["accel_first"]} - {chunk["accel_last"]} \
audio: {chunk["audio_first"]} - {chunk["audio_last"]}\
 ({chunk["audio_first_sample"]} - {chunk["audio_last_sample"]})')
            # if chunk['audio'][1] == -1 or chunk['accel'][1] == -1:
            #     break

            # write blank data in the hole
            num_written += write_blank_until(
                fout, num_written, chunk['accel_first'])
            print(f'blank: {num_written}')

            # fill the good chunk with interpolated data
            num_written += write_samples(
                fout, 
                af.get_chunk(chunk['audio_first_sample'], chunk['audio_last_sample']),
                round(1250 * (chunk['accel_last'] - chunk['accel_first'])))
            print(f'data: {num_written}')
            # print(round(1250 * (chunk['accel_last'] - chunk['accel_first'])))


In [12]:
midge_raw_data_path = os.path.join(raw_wearables_path, 'raw')
for midge_num in os.listdir(midge_raw_data_path):
    of_path = os.path.join(raw_audio_path, 'synced', f'{midge_num}.wav')
    if(os.path.exists(of_path)):
        continue

    data_folder = os.listdir(os.path.join(midge_raw_data_path, midge_num))[-1]
    midge_files = os.listdir(os.path.join(midge_raw_data_path, midge_num, data_folder))
    audio_fname = [f for f in midge_files if f.endswith('_audio_1')][-1]
    audio_fname = os.path.join(midge_raw_data_path, midge_num, data_folder, audio_fname)
    af = AudioFile(audio_fname)

    correspondances_fp = os.path.join(correspondances_path, f'{midge_num}/correspondance.csv')
    correspondances = pd.read_csv(open(correspondances_fp, 'rb'))

    print(f'MIDGE {midge_num}')
    chunks = get_annot_chunks(correspondances, af)

    try:
        correct_subject_audio(af, of_path, chunks)
    except Exception as ex:
        os.remove(of_path)
        raise ex

MIDGE 1
******************** accel: 108.4779999 - 297.368 audio: 109.7728 - 299.4176 (1072 - 2924)
blank: 135597
data: 371710
******************** accel: 298.402 - 424.5809999 audio: 300.1344 - 427.1104 (2931 - 4171)
blank: 373002
data: 530726
******************** accel: 425.4759998 - 551.75 audio: 427.52 - 554.496 (4175 - 5415)
blank: 531844
data: 689687
******************** accel: 553.1129999 - 742.8559999 audio: 555.9296 - 747.2128 (5429 - 7297)
blank: 691391
data: 928570
******************** accel: 747.0969999 - 998.2089999 audio: 750.1824 - 1002.1888000000001 (7326 - 9787)
blank: 933871
data: 1247761
******************** accel: 1001.344 - 1253.105 audio: 1005.568 - 1258.496 (9820 - 12290)
blank: 1251680
data: 1566381
******************** accel: 1254.205 - 1317.347 audio: 1259.4176 - 1322.3935999999999 (12299 - 12914)
blank: 1567756
data: 1646684
******************** accel: 1317.837 - 1380.807 audio: 1322.8032 - 1386.5984 (12918 - 13541)
blank: 1647296
data: 1726009
***************

KeyboardInterrupt: 