## First, run this cell to set up paths and import dependencies

In [1]:
import os

import matplotlib.pyplot as plt
import random
from tqdm import tqdm

if not os.path.exists(r"./notebooks"):
    %cd ..


from src.data_processing import load_audio, split_into_clips, create_spectrogram, prepare_datasets, list_all_audio_files, SOAAudioClips, save_mean_std, compute_mean_std_from_images, list_audio_files_recursively, extract_metadata, exclude_overlapping_scripts, compute_statistics,display_dataset_statistics, save_spectrogram
from src.dataset_analysis import plot_spectrogram, duration_statistics
from src.config import VALID_ACCESS_LABELS, TRAIN_DIR, TEST_DIR, VAL_DIR, DATA_DIR, DATASET_DIR,DATA_DIR_SPECIFIC
from collections import defaultdict, Counter

# Ensure the output directory structure exists
os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(VAL_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)
random.seed(42)  # For reproducibility

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


c:\Users\wojciech.basinski\iml


## 1  Load all .wav files from the dataset

In [4]:

allowed_dictionaries=["ipad_balcony1","ipad_bedroom1","ipad_confroom1","ipad_confroom2","ipad_livingroom1","ipad_office1","ipad_office2""ipadflat_confroom1""ipadflat_office1","iphone_balcony1","iphone_bedroom1","iphone_livingroom1"]

wav_files_all = list_audio_files_recursively(DATA_DIR,allowed_dictionaries)
print(f"Found {len(wav_files_all)} .wav files in directory '{DATA_DIR}' in the following allowed directories: {allowed_dictionaries}")

Found 900 .wav files in directory './data' in the following allowed directories: ['ipad_balcony1', 'ipad_bedroom1', 'ipad_confroom1', 'ipad_confroom2', 'ipad_livingroom1', 'ipad_office1', 'ipad_office2ipadflat_confroom1ipadflat_office1', 'iphone_balcony1', 'iphone_bedroom1', 'iphone_livingroom1']


## 2 Balance all .wav files, split it and display statistics

In [5]:

# Step 1: Parse Metadata
metadata = extract_metadata(wav_files_all,VALID_ACCESS_LABELS)

# Step 2: Group by Speaker
grouped_by_speaker = defaultdict(list)
for entry in metadata:
    grouped_by_speaker[entry["speaker"]].append(entry)

# Step 3: Balance Dataset
authorized_files = [entry for entry in metadata if entry["authorized"]]
unauthorized_files = [entry for entry in metadata if not entry["authorized"]]

# Maintain 1:1 ratio for train dataset
num_authorized = len(authorized_files)
unauthorized_sampled = random.sample(unauthorized_files, min(len(unauthorized_files), num_authorized))

# Combine for balanced training pool
balanced_pool = authorized_files + unauthorized_sampled

# Shuffle the balanced pool for randomness
random.shuffle(balanced_pool)

# Step 4: Split into Train, Validation, and Test
train_split = int(0.7 * len(balanced_pool))
validation_split = int(0.85 * len(balanced_pool))

train_files_balanced = balanced_pool[:train_split]
validation_files_balanced = balanced_pool[train_split:validation_split]
test_files_balanced = balanced_pool[validation_split:]

# Step 5: Prevent Data Leakage
validation_files_balanced, test_files_balanced = exclude_overlapping_scripts(train_files_balanced, validation_files_balanced, test_files_balanced)

# Step 6: Display Statistics
display_dataset_statistics(train_files_balanced,validation_files_balanced,test_files_balanced)


Train Dataset Statistics: {'Total Files': 378, 'Authorized': 194, 'Unauthorized': 184, 'Speakers': Counter({'m8': 35, 'f8': 34, 'm6': 32, 'm3': 31, 'f1': 31, 'f7': 31, 'f6': 19, 'm1': 17, 'f5': 17, 'm10': 14, 'f9': 14, 'm7': 14, 'm9': 14, 'f4': 13, 'f3': 13, 'f10': 13, 'm2': 11, 'm5': 11, 'm4': 10, 'f2': 4}), 'Devices': Counter({'ipad': 243, 'iphone': 135}), 'Rooms': Counter({'balcony1': 88, 'livingroom1': 81, 'bedroom1': 80, 'confroom2': 51, 'office1': 43, 'confroom1': 35})}
Validation Dataset Statistics: {'Total Files': 10, 'Authorized': 5, 'Unauthorized': 5, 'Speakers': Counter({'f1': 2, 'f2': 1, 'm6': 1, 'm2': 1, 'f10': 1, 'f7': 1, 'm10': 1, 'm3': 1, 'f3': 1}), 'Devices': Counter({'ipad': 6, 'iphone': 4}), 'Rooms': Counter({'bedroom1': 4, 'balcony1': 2, 'livingroom1': 2, 'confroom2': 1, 'office1': 1})}
Test Dataset Statistics: {'Total Files': 10, 'Authorized': 6, 'Unauthorized': 4, 'Speakers': Counter({'f8': 3, 'f1': 2, 'f3': 2, 'f2': 1, 'm8': 1, 'f5': 1}), 'Devices': Counter({'ipa

## 3 Display files info

In [6]:
train_files_paths = [entry['path'] for entry in train_files_balanced]
test_files_paths = [entry['path'] for entry in validation_files_balanced]
val_files_paths = [entry['path'] for entry in test_files_balanced]

soa_train_full_clips = SOAAudioClips(train_files_paths)
soa_test_full_clips = SOAAudioClips(test_files_paths)
soa_val_full_clips = SOAAudioClips(val_files_paths)

print("\nDataset Statistics:")
print("Training set:")
print(duration_statistics(soa_train_full_clips.clips))

print("Validation set:")
print(duration_statistics(soa_test_full_clips.clips))

print("Test set:")
print(duration_statistics(soa_val_full_clips.clips))






Dataset Statistics:
Training set:
Statistics:
        Total files: 378,
        Total duration: 61226.76 sec,
        Average duration: 161.98 sec, 
        Duration range: 116.16 - 224.20 sec
        
Validation set:
Statistics:
        Total files: 10,
        Total duration: 1660.39 sec,
        Average duration: 166.04 sec, 
        Duration range: 132.03 - 198.84 sec
        
Test set:
Statistics:
        Total files: 10,
        Total duration: 1727.28 sec,
        Average duration: 172.73 sec, 
        Duration range: 159.80 - 189.09 sec
        


## 4 Process each dataset split by converting 3-second clips into spectrograms

In [None]:
def process_split(soa_full_clips, output_subdir):
    all_splitted_clips = []
    for file_path, full_clip in tqdm(soa_full_clips):
        clips = split_into_clips(full_clip)
        all_splitted_clips.extend(clips)
        for i, clip in enumerate(clips):
            spectrogram = create_spectrogram(clip)
            output_path = os.path.join(output_subdir, f"{os.path.basename(file_path).split('.')[0]}_{i}_clip.png")
            save_spectrogram(spectrogram, output_path)
    print(duration_statistics(all_splitted_clips))

print("Preprocessed Train Dataset:")
process_split(soa_train_full_clips, TRAIN_DIR)

print("\nPreprocessed Validation Dataset:")
process_split(soa_val_full_clips, VAL_DIR)

print("\nPreprocessed Test Dataset:")
process_split(soa_test_full_clips, TEST_DIR)

Preprocessed Train Dataset:


 23%|██▎       | 88/378 [00:47<02:29,  1.94it/s]

## 5 Mean and Standard deviation of training dataset

In [None]:
mean, std = compute_mean_std_from_images(TRAIN_DIR)
print(f"Mean: {mean}, Standard deviation: {std}")
save_mean_std(mean, std, f"{DATASET_DIR}/scaling_params.json")