In [41]:
import numpy as np
import neuraltoolkit as ntk
import glob
import csv
import os
import smart_open
import boto3
import tqdm
import argparse
import fnmatch
import io
from natsort import natsorted
import re
import tempfile
import pandas as pd
from pathlib import Path

%pdb

Automatic pdb calling has been turned OFF


In [42]:
# Run configuration for this recording. Every function below reads these
# module-level constants directly.

# Presumably the neural sampling rate in samples per second (it is used below to
# convert sample counts into time).
# NOTE(review): FS0 is not defined anywhere in the visible notebook, so this line
# raises NameError on a fresh kernel — confirm where FS0 is supposed to come from.
FS = FS0
# Label for this recording; used in the output filename and the S3 key prefix.
ANIMAL = 'CAF62_day1'
# Local directory that is globbed for the raw neural files.
NF_DIR = f'/media/bs007r/CAF00062/CAF00062_2020-11-18_16-14-24/'
#SS_DIR = '/media/HlabShare/Sleep_Scoring/CAF00049_LFP/caf4911062020/'
# Local directory the intermediate CSV is written to (relative to the kernel's cwd).
OUTPUT_DIR = '../data/'
# S3 destination prefix for the final .npz file.
OUTPUT_DIR_REMOTE = f"s3://hengenlab/{ANIMAL}/Labels/"
# Channel count passed to the neural file loader and used in the file-size arithmetic.
NUM_CHANNELS = 64
# Glob pattern (appended to NF_DIR) selecting the neural files.
NF_PATTERN = r'Headstages*.bin'
# NOTE(review): SS_PATTERN is not referenced by any code visible in this notebook.
SS_PATTERN = r'*SleepStates*'
# Base name (no extension) shared by the local CSV and the remote NPZ.
OUTPUT_FILENAME = f"labels_sleepstate_v2.1_{ANIMAL}"
# CSV column order; must stay in sync with the structured dtype in save_npz.
FIELDS = ['activity','sleep_state','next_wake_state','next_nrem_state','next_rem_state','last_wake_state','last_nrem_state','last_rem_state','video_filename_ix','video_frame_offset','neural_filename_ix','neural_offset']


In [43]:
def get_files(dir: str, pattern: str) -> list:
    """Return the local paths matching ``dir + pattern`` in natural sort order.

    The two strings are concatenated as-is (no path separator is inserted), so
    ``dir`` must already end with one if ``pattern`` is a bare filename glob.
    """
    glob_expression = dir + pattern
    matching_paths = glob.glob(glob_expression)
    return ntk.ntk_videos.natural_sort(matching_paths)

def get_remote_files(prefix: str):
    """List every object key under ``prefix`` in the 'hengenlab' bucket, naturally sorted.

    Uses the module-level ``client`` (it is not passed in), so ``client`` must be
    assigned before this function is called.

    ``list_objects_v2`` returns at most 1000 keys per response; this follows the
    continuation token until the listing is complete so long listings are not
    silently truncated.

    Raises:
        KeyError: if nothing matches ``prefix`` (the response then has no
            'Contents' entry). This is kept deliberately: save_npz relies on the
            failure to fall back to a placeholder video list.
    """
    keys = []
    request = {'Bucket': 'hengenlab', 'Prefix': prefix}
    while True:
        page = client.list_objects_v2(**request)
        keys.extend(obj['Key'] for obj in page['Contents'])
        if not page.get('IsTruncated'):
            break
        # Ask for the next page, starting where this one stopped.
        request['ContinuationToken'] = page['NextContinuationToken']
    return ntk.ntk_videos.natural_sort(keys)

def get_s3_client():
    """Build a boto3 S3 client for the Nautilus endpoint from ~/.aws/credentials.

    The credentials file is scanned line by line for ``aws_access_key_id`` and
    ``aws_secret_access_key`` assignments. If a key appears more than once
    (e.g. several profiles), the last occurrence wins.

    Values are taken verbatim after the first ``=`` (whitespace-stripped), so
    secrets containing ``/``, ``+`` or ``=`` are preserved in full. Lines whose
    key name is not exactly one of the two settings (such as commented-out
    entries) are ignored.

    Raises:
        ValueError: if either setting is absent from the credentials file.
        OSError: if the credentials file cannot be opened.
    """
    wanted = ("aws_access_key_id", "aws_secret_access_key")
    found = {}
    with open(f"{Path.home()}/.aws/credentials", "r") as f:
        for line in f:
            name, sep, value = line.partition("=")
            name = name.strip()
            if sep and name in wanted:
                found[name] = value.strip()

    missing = [name for name in wanted if not found.get(name)]
    if missing:
        raise ValueError(
            f"Missing {', '.join(missing)} in {Path.home()}/.aws/credentials"
        )

    return boto3.Session(
        aws_access_key_id=found["aws_access_key_id"],
        aws_secret_access_key=found["aws_secret_access_key"],
    ).client("s3", endpoint_url="https://s3-central.nrp-nautilus.io")

def data_sync(ignore_error=False):
    """Write the per-row sleep-state label CSV aligned to the neural files.

    Reads the sleep-label array for ANIMAL from the shared pickle, walks the
    neural files found under NF_DIR in natural order, and writes one CSV row
    every FS/15 samples (15 rows per second). Each row records the sleep label
    covering that instant, the index of the neural file, and the sample offset
    within that file. All other label columns are filled with placeholders
    (-1 or 0).

    Args:
        ignore_error: when True, skip the consistency checks (label count,
            timestamp continuity, file size vs. timestamp delta) instead of
            raising.

    Raises:
        Exception: if a consistency check fails and ``ignore_error`` is False.
    """
    # NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
    SW = pd.read_pickle('/media/HlabShare/james/SW_array_FINAL_Cleaned_MultipleScorerVersion.pkl')
    # At most 288 files are used (288 five-minute files span 24 hours).
    neural_files = get_files(NF_DIR, NF_PATTERN)[:288]
    print(f'Found {len(neural_files)} neural files')

    # Expected spacing between consecutive file timestamps: 5 minutes,
    # presumably in nanoseconds — TODO confirm the timestamp unit.
    timestamp_gap = 5 * 60 * 10**9

    # Raw string: '\d' is an invalid escape sequence in a normal string literal.
    animal_id = re.search(r'(CAF\d+)|(KDR\d+)', ANIMAL).group()
    sleep_labels = SW[SW.Animal == animal_id]['SW Array'].iloc[0]
    # Collapse label codes: 5 -> 1 and 4 -> 2.
    sleep_labels[sleep_labels == 5] = 1
    sleep_labels[sleep_labels == 4] = 2
    # 21600 labels at 4 s per label is 24 h; tolerate up to 900 labels (1 h) of difference.
    if not ignore_error and abs(sleep_labels.shape[0] - 21600) > 900:
        raise Exception(f'Missing sleep labels (available size {sleep_labels.shape[0] // 900})')

    # newline='' is what the csv module requires for files it writes.
    with open(OUTPUT_DIR+OUTPUT_FILENAME+'.csv', 'w', newline='') as f:
        w = csv.DictWriter(f, fieldnames = FIELDS)
        w.writeheader()
        # Running position, in samples, across all files processed so far.
        offset_tracker = 0
        # Samples between consecutive rows (one row every 1/15 s).
        offset_gap = 1 /15 * FS

        ss_ix = 0
        for neural_file_ix, n_file in enumerate(neural_files):
            # First value returned by the loader with t_only=True; presumably the
            # file's start timestamp — TODO confirm.
            current_timestamp = ntk.ntk_ecube.load_raw_binary_gain_chmap(n_file, NUM_CHANNELS, 'hs64', t_only=True)[0]
            # Presumably samples per channel: an 8-byte header, then 2 bytes per
            # sample per channel — TODO confirm the file layout.
            current_file_size = (os.stat(n_file).st_size - 8) / NUM_CHANNELS / 2

            # Continuity checks need a previous file, so they start at the second one.
            if neural_file_ix > 0:
                expected_timestamp = last_timestamp + timestamp_gap
                if not ignore_error and current_timestamp - expected_timestamp > 10**10:
                    raise Exception(f'Inconsistent timestamp: last {last_timestamp} current {current_timestamp}')
                if not ignore_error and last_file_size / FS * 10**9 != current_timestamp - last_timestamp :
                    raise Exception(f'Inconsistent file size {last_file_size}')

            # Position at which this file's rows begin; row offsets are relative to it.
            offset_checkpoint = offset_tracker

            while (offset_tracker-offset_checkpoint <= current_file_size) and ss_ix < sleep_labels.shape[0]:
                entry = {
                    'activity': -1,
                    'sleep_state': int(sleep_labels[ss_ix]),
                    'next_wake_state': -1,
                    'next_nrem_state': -1,
                    'next_rem_state': -1,
                    'last_wake_state': -1,
                    'last_nrem_state': -1,
                    'last_rem_state': -1,
                    'video_filename_ix': 0,
                    'video_frame_offset': 0,
                    'neural_filename_ix': neural_file_ix,
                    'neural_offset': round(offset_tracker - offset_checkpoint)
                }
                w.writerow(entry)
                offset_tracker += offset_gap
                # samples -> whole seconds -> label index (one label per 4 seconds).
                ss_ix = round(offset_tracker) // FS // 4

            last_timestamp = current_timestamp
            last_file_size = current_file_size

def save_npz(client):
    """Pack the label CSV and the remote file listings into an .npz on S3.

    Reads OUTPUT_DIR/OUTPUT_FILENAME.csv (skipping its header row), converts the
    rows to a structured array, and uploads ``labels_matrix``, ``video_files``
    and ``neural_files`` as a single .npz to OUTPUT_DIR_REMOTE.

    Args:
        client: S3 client handed to smart_open for the upload. Note that the
            listings come from get_remote_files, which uses the module-level
            ``client`` rather than this argument.

    Raises:
        AttributeError: if a key under '{ANIMAL}/Neural_Data/' does not contain
            'Headstages'. Failures while listing neural files are not caught.
    """
    # newline='' is what the csv module requires for files it reads.
    with open(
        OUTPUT_DIR + OUTPUT_FILENAME + ".csv",
        newline="",
    ) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        try:
            videofiles = np.array([re.search(r'Video/(\S+.mp4)', file).groups()[0] for file in get_remote_files(f'{ANIMAL}/Video/')])
        except Exception:
            # Best effort: any failure listing or parsing video keys falls back
            # to a placeholder. KeyboardInterrupt/SystemExit still propagate.
            videofiles = np.array(['dummy'])

        neuralfiles = np.array([re.search(r'Headstages\S+', file).group(0) for file in get_remote_files(f'{ANIMAL}/Neural_Data/')])
        # Field order must match FIELDS, the column order data_sync wrote.
        labels_matrix = np.array(
            [tuple(line) for line in reader],
            dtype=[
                ("activity", "i1"),
                ("sleep_state", "i1"),
                ("next_wake_state", "<i8"),
                ("next_nrem_state", "<i8"),
                ("next_rem_state", "<i8"),
                ("last_wake_state", "<i8"),
                ("last_nrem_state", "<i8"),
                ("last_rem_state", "<i8"),
                ("video_filename_ix", "<i4"),
                ("video_frame_offset", "<i4"),
                ("neural_filename_ix", "<i4"),
                ("neural_offset", "<i8"),
            ],
        )

        with smart_open.open(
            OUTPUT_DIR_REMOTE + OUTPUT_FILENAME + ".npz",
            "wb",
            transport_params=dict(client=client),
        ) as of:
            np.savez(
                of,
                labels_matrix=labels_matrix,
                video_files=videofiles,
                neural_files=neuralfiles,
            )


In [44]:
# `client` must remain a module-level name: get_remote_files reads it from
# global scope rather than taking it as an argument.
client = get_s3_client()
# ignore_error=True skips data_sync's consistency checks (label count, timestamp
# continuity, file size), so the CSV is written even if the inputs are inconsistent.
data_sync(ignore_error=True)
# Convert the CSV written above to .npz and upload it to OUTPUT_DIR_REMOTE.
save_npz(client)

Found 264 neural files


In [45]:
#END