In [1]:
import pickle
import matplotlib.pyplot as plt
import librosa
import numpy as np
import os
import librosa.display
import json
from tqdm import tqdm
# import pandas as pd
import random

In [2]:
# Location of the pickled training set; its contents are bound to `fdata` below.
pickle_path = '/scratch/da3245/datasets/neuro_scans/cnn_training_set.pkl'
# Alternative input kept for reference (presumably a single-scan file, judging by its name — TODO confirm):
#pickle_path = '/scratch/da3245/datasets/neuro_scans/KG127-EXPT_1_SCAN_146.pickle'
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(pickle_path, 'rb') as f:
    # `fdata` is a notebook-global; the processing loop further down reads it
    # as fdata[expt][scan], so this cell must run before that one.
    fdata = pickle.load(f)

In [1]:
def func2_diverse(data, name, max_samples_per_combination):
    """Build a class-balanced feature set for one recording and write it to disk.

    A mel spectrogram (converted to dB relative to its maximum) is computed
    from ``data['SOMA']['DENOISED_TRACE']`` with a hop length of one sample.
    Frames are split by their label in ``data['SOMA']['SPIKE_INTERVAL']``
    (1 = event, 0 = non-event) and the same number of frames is drawn at
    random from each class. For every drawn frame, the spectrogram rows of the
    frame and its ``buffer`` neighbours on each side are flattened into one
    feature vector.

    Two files are written to
    ``/scratch/cpk286/datasets/data_rich_balanced/{name}/``:

    * ``features.npy`` - one row per kept sample.
    * ``mask.json`` - ``{"sample_<k>": label}``, where ``sample_<k>`` is the
      label of row ``k - 1`` of ``features.npy``.

    Frames too close to either end of the recording to have a full window are
    dropped from *both* files, so the outputs always have the same length
    (the classes may then be off balance by the few dropped frames).

    Parameters
    ----------
    data : mapping
        Must provide ``data['SOMA']['SPIKE_INTERVAL']`` and
        ``data['SOMA']['DENOISED_TRACE']``. Both are assumed to be 1-D and
        aligned sample-for-sample - TODO confirm against the data producer.
    name : str
        Sub-directory name for this recording's outputs.
    max_samples_per_combination : int
        Upper bound on the number of samples drawn; half of it is allotted to
        each class.
    """
    soma = data['SOMA']
    # np.asarray makes the `== 1` / `== 0` comparisons element-wise even if
    # the labels arrive as a plain sequence.
    labels = np.asarray(soma['SPIKE_INTERVAL'])
    denoised_trace = soma['DENOISED_TRACE']

    output_dir = f"/scratch/cpk286/datasets/data_rich_balanced/{name}/"
    os.makedirs(output_dir, exist_ok=True)

    mask_json_output_path = os.path.join(output_dir, "mask.json")
    feature_output_path = os.path.join(output_dir, "features.npy")

    # Spectrogram settings.
    sr = 3328
    n_fft = 12
    hop_length = 1
    n_mels = 7

    mel_spectrogram = librosa.feature.melspectrogram(
        y=denoised_trace,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        power=2
    )
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Transpose so that the first axis indexes frames.
    features = mel_spectrogram_db.T

    # .tolist() yields plain Python ints, which random.sample and slicing
    # handle without NumPy scalar types leaking through.
    event_indices = np.where(labels == 1)[0].tolist()
    non_event_indices = np.where(labels == 0)[0].tolist()

    # Draw the same number of frames from each class. The count is capped by
    # the per-class budget and by the smaller class, so random.sample can never
    # be asked for more items than a population holds.
    per_class_budget = max(0, int(max_samples_per_combination * 0.5))
    n_per_class = min(per_class_budget, len(event_indices), len(non_event_indices))
    selected_indices = (
        random.sample(event_indices, n_per_class)
        + random.sample(non_event_indices, n_per_class)
    )

    mask = {}
    all_features = []

    buffer = 2  # number of neighbouring frames taken on each side
    for i in selected_indices:
        start, stop = i - buffer, i + 1 + buffer  # stop is exclusive
        if start < 0 or stop > len(features):
            # No full context window available; skip the label as well so
            # mask.json stays aligned with features.npy.
            continue

        all_features.append(np.concatenate(features[start:stop], axis=0))

        # Key by the row just appended, and convert the NumPy scalar to a
        # native Python value because json cannot serialise np.int64/np.bool_.
        mask[f"sample_{len(all_features)}"] = labels[i].item()

    # Save results
    with open(mask_json_output_path, "w") as json_file:
        json.dump(mask, json_file, indent=1)

    np.save(feature_output_path, np.array(all_features))


# Determine the per-combination sample budget: the overall target is split
# evenly across every (experiment, scan) pair found in fdata.
total_combinations = sum(len(fdata[expt]) for expt in fdata.keys())
total_samples = 100000  # This controls dataset size
if total_combinations == 0:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("fdata contains no (experiment, scan) combinations to process")
samples_per_combination = total_samples // total_combinations
print(samples_per_combination)

# func2_diverse draws frames with the `random` module; seeding it here makes a
# rerun of this cell select the same frames and produce the same dataset.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

# Process data: one output directory per (experiment, scan) pair.
for expt in tqdm(fdata.keys()):
    for scan in fdata[expt]:
        data = fdata[expt][scan]
        func2_diverse(data, f'{expt}_{scan}', samples_per_combination)


NameError: name 'fdata' is not defined