## First, run this cell to set up paths and import dependencies

In [None]:
import os

import matplotlib.pyplot as plt
import random
from tqdm import tqdm

# If the current working directory has no "notebooks" folder, move one level up.
# NOTE(review): presumably the notebook is started from inside notebooks/ and the
# `src.*` imports below need the project root as working directory — confirm.
if not os.path.exists(r"./notebooks"):
    %cd ..


from src.data_processing import load_audio, split_into_clips, create_spectrogram, prepare_datasets, list_all_audio_files, SOAAudioClips, save_mean_std, compute_mean_std_from_images, list_audio_files_recursively, extract_metadata, exclude_overlapping_scripts, compute_statistics,display_dataset_statistics, save_spectrogram
from src.dataset_analysis import plot_spectrogram, duration_statistics
from src.config import VALID_ACCESS_LABELS, TRAIN_DIR, TEST_DIR, VAL_DIR, DATA_DIR, DATASET_DIR,DATA_DIR_SPECIFIC
from collections import defaultdict, Counter

# Create the dataset root and the per-split output folders; existing folders are left alone.
for output_dir in (DATASET_DIR, TRAIN_DIR, VAL_DIR, TEST_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Fixed seed so that code relying on the `random` module behaves the same on every run.
random.seed(42)

## 1. Load all .wav files in the specified directory and preview some

In [None]:
# List the audio files for DATA_DIR_SPECIFIC and report how many were returned.
wav_files_specific = list_all_audio_files(DATA_DIR_SPECIFIC)
# The directory name is wrapped in a matching pair of single quotes
# (the closing quote was previously missing from the message).
print(f"Found {len(wav_files_specific)} .wav files in directory '{DATA_DIR_SPECIFIC}'")

## 1.1 Display statistics for all .wav files in the specified directory

In [None]:
# Split the files of DATA_DIR_SPECIFIC into train / validation / test path collections,
# using list_audio_files_recursively as the file lister handed to prepare_datasets.
train_files_paths, val_files_paths, test_files_paths = prepare_datasets(
    DATA_DIR_SPECIFIC,
    list_audio_files_recursively,
)

# Extract metadata for each split, in train -> validation -> test order.
train_files_metadata, val_files_metadata, test_files_metadata = [
    extract_metadata(split_paths, VALID_ACCESS_LABELS)
    for split_paths in (train_files_paths, val_files_paths, test_files_paths)
]

# Show the statistics of the three splits together.
display_dataset_statistics(
    train_files_metadata,
    val_files_metadata,
    test_files_metadata,
)


## 2. Statistics about authorized/unauthorized speakers

In [None]:
# Partition the recordings by speaker: the speaker id is the part of the file
# name before the first underscore, and it is "authorized" when it appears in
# VALID_ACCESS_LABELS.
authorized_speakers_files, unauthorized_speakers_files = [], []

for wav_path in wav_files_specific:
    speaker = os.path.basename(wav_path).partition('_')[0]
    if speaker not in VALID_ACCESS_LABELS:
        unauthorized_speakers_files.append(wav_path)
        continue
    authorized_speakers_files.append(wav_path)

# Duration statistics of the authorized group.
print("Authorized speakers recordings:")
soa_authorized = SOAAudioClips(authorized_speakers_files)
authorized_stats = duration_statistics(soa_authorized.clips)
print(authorized_stats)

# Duration statistics of the unauthorized group.
print("\nUnauthorized speakers recordings:")
soa_unauthorized = SOAAudioClips(unauthorized_speakers_files)
unauthorized_stats = duration_statistics(soa_unauthorized.clips)
print(unauthorized_stats)

## 3. Split files into train, validation, and test sets

In [None]:
# Split the files of DATA_DIR_SPECIFIC into the three dataset partitions
# and report the size of each one.
train_files, val_files, test_files = prepare_datasets(DATA_DIR_SPECIFIC)
n_train, n_val, n_test = len(train_files), len(val_files), len(test_files)
print(f"Training files: {n_train} | Validation files: {n_val} | Test files: {n_test}")


## 4. Calculate and display statistics about the raw dataset

In [None]:
# Wrap the file lists of each split in SOAAudioClips containers; these objects
# are reused later when the splits are converted into spectrograms.
soa_train_full_clips = SOAAudioClips(train_files)
soa_test_full_clips = SOAAudioClips(test_files)
soa_val_full_clips = SOAAudioClips(val_files)

# Print duration statistics per split. Each heading is paired with the
# container of the same split (validation and test were previously swapped).
print("\nDataset Statistics:")
print("Training set:")
print(duration_statistics(soa_train_full_clips.clips))

print("Validation set:")
print(duration_statistics(soa_val_full_clips.clips))

print("Test set:")
print(duration_statistics(soa_test_full_clips.clips))

## 5. Process each dataset split by converting 3-second clips into spectrograms

In [None]:
def process_split(soa_full_clips, output_subdir):
    """Convert every recording of one dataset split into per-clip spectrogram images.

    Each recording is cut with ``split_into_clips``; every resulting clip is
    converted with ``create_spectrogram`` and stored via ``save_spectrogram`` as
    ``<output_subdir>/<recording stem>_<clip index>_clip.png``. Once all
    recordings are handled, the duration statistics of all produced clips are
    printed.

    Args:
        soa_full_clips: Iterable yielding ``(file_path, full_clip)`` pairs.
        output_subdir: Directory that receives the PNG files; it is created
            when it does not exist yet.
    """
    # Make the function usable on its own, even if the setup cell was skipped.
    os.makedirs(output_subdir, exist_ok=True)

    all_splitted_clips = []
    for file_path, full_clip in tqdm(soa_full_clips):
        clips = split_into_clips(full_clip)
        all_splitted_clips.extend(clips)

        # splitext removes only the final extension, so "a.b.wav" keeps the
        # stem "a.b". Cutting at the first dot would map "a.b.wav" and
        # "a.c.wav" to the same output names and silently overwrite images.
        stem = os.path.splitext(os.path.basename(file_path))[0]

        for i, clip in enumerate(clips):
            spectrogram = create_spectrogram(clip)
            output_path = os.path.join(output_subdir, f"{stem}_{i}_clip.png")
            save_spectrogram(spectrogram, output_path)
    print(duration_statistics(all_splitted_clips))

# Run the spectrogram conversion for each split in train -> validation -> test
# order, printing a heading before each split's clip statistics.
split_jobs = (
    ("Preprocessed Train Dataset:", soa_train_full_clips, TRAIN_DIR),
    ("\nPreprocessed Validation Dataset:", soa_val_full_clips, VAL_DIR),
    ("\nPreprocessed Test Dataset:", soa_test_full_clips, TEST_DIR),
)
for heading, split_clips, split_dir in split_jobs:
    print(heading)
    process_split(split_clips, split_dir)


## 6. Mean and standard deviation of the training dataset

In [None]:
# Compute the scaling parameters from the images in TRAIN_DIR only,
# print them, and store them next to the dataset.
scaling_params_path = f"{DATASET_DIR}/scaling_params.json"
train_image_stats = compute_mean_std_from_images(TRAIN_DIR)
mean, std = train_image_stats
print(f"Mean: {mean}, Standard deviation: {std}")
save_mean_std(mean, std, scaling_params_path)

## 7. Visualize some spectrogram examples

In [None]:
# Preview up to three training spectrograms.
# os.listdir returns entries in arbitrary order, so the names are sorted to show
# the same samples on every run; only the .png files written by process_split
# are considered, so stray entries are never handed to plt.imread.
png_names = sorted(
    name for name in os.listdir(TRAIN_DIR) if name.lower().endswith(".png")
)
sample_spectrogram_paths = [os.path.join(TRAIN_DIR, name) for name in png_names[:3]]
for path in sample_spectrogram_paths:
    spectrogram = plt.imread(path)
    plot_spectrogram(spectrogram, title=f"Spectrogram from {path}")