## First, run this cell to set up paths and import dependencies

In [None]:
%cd ..

import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from src.data_processing import load_audio, split_into_clips, create_spectrogram, prepare_datasets
from src.dataset_analysis import plot_spectrogram, print_duration_summary, dataset_durations
from src.config import VALID_ACCESS_LABELS, TRAIN_DIR, TEST_DIR, VAL_DIR, DATA_DIR, DATASET_DIR

# Create the dataset root and every split directory up front; exist_ok=True
# keeps this cell safe to re-run when the directories are already present.
for output_dir in (DATASET_DIR, TRAIN_DIR, VAL_DIR, TEST_DIR):
    os.makedirs(output_dir, exist_ok=True)


## 1. Find all .wav files in the data directory and report how many were found

In [None]:
# Collect the full path of every DATA_DIR entry whose name ends in '.wav'.
wav_files = []
for entry_name in os.listdir(DATA_DIR):
    if entry_name.endswith('.wav'):
        wav_files.append(os.path.join(DATA_DIR, entry_name))
print(f"Found {len(wav_files)} .wav files in directory '{DATA_DIR}'")

## 2. Statistics about authorized/unauthorized speakers

In [None]:
# Partition the recordings by whether the speaker ID taken from the file name
# appears in VALID_ACCESS_LABELS, then summarise the durations of each group.
authorized_speakers_files = []
unauthorized_speakers_files = []

for file in wav_files:
    # The speaker ID is the part of the file name before the first underscore.
    # The paths were built with os.path.join, so take the file name with
    # os.path.basename instead of splitting on '/', which would mis-parse
    # paths on platforms that use a different separator.
    speaker_id = os.path.basename(file).split('_')[0]
    if speaker_id in VALID_ACCESS_LABELS:
        authorized_speakers_files.append(file)
    else:
        unauthorized_speakers_files.append(file)

print("Authorized speakers recordings:")
durations = dataset_durations(authorized_speakers_files)
print_duration_summary(durations)

print("\nUnauthorized speakers recordings:")
durations = dataset_durations(unauthorized_speakers_files)
print_duration_summary(durations)

## 3. Split files into train, validation, and test sets

In [None]:
# Let prepare_datasets split the recordings in DATA_DIR into three file lists,
# then report how many files ended up in each one.
train_files, val_files, test_files = prepare_datasets(DATA_DIR)
n_train = len(train_files)
n_val = len(val_files)
n_test = len(test_files)
print(f"Training files: {n_train} | Validation files: {n_val} | Test files: {n_test}")

def save_spectrogram(spectrogram, output_path):
    """Write `spectrogram` to `output_path` as an image using the 'gray' colormap."""
    plt.imsave(fname=output_path, arr=spectrogram, cmap='gray')


## 4. Calculate and display duration statistics for the raw dataset splits

In [None]:
# Print a duration summary for each raw split, one heading per split.
print("\nDataset Statistics:")
for heading, split_files in (
    ("Training set:", train_files),
    ("\nValidation set:", val_files),
    ("\nTest set:", test_files),
):
    print(heading)
    durations = dataset_durations(split_files)
    print_duration_summary(durations)

## 5. Process each dataset split by converting 2-second clips into spectrograms

In [None]:
def process_split(file_list, output_subdir):
    """Convert every audio file of one split into per-clip spectrogram images.

    Each file is loaded with `load_audio`, cut into clips by
    `split_into_clips`, and every clip is turned into a spectrogram by
    `create_spectrogram` and saved as ``<file stem>_<clip index>_clip.png``
    inside `output_subdir`. After all files are processed, a summary of the
    clip durations (clip length divided by the sample rate) is printed.

    Args:
        file_list: Paths of the audio files belonging to this split.
        output_subdir: Directory the spectrogram images are written to.
    """
    durations = []
    for file_path in tqdm(file_list, desc=f"Processing {output_subdir}"):
        audio, sr = load_audio(file_path)
        clips = split_into_clips(audio, sample_rate=sr)
        # Strip only the final extension. Cutting at the first dot would map
        # names such as "a.1.wav" and "a.2.wav" to the same stem, and their
        # spectrograms would silently overwrite each other.
        stem = os.path.splitext(os.path.basename(file_path))[0]

        for i, clip in enumerate(clips):
            durations.append(len(clip) / sr)
            spectrogram = create_spectrogram(clip, sr)
            output_path = os.path.join(output_subdir, f"{stem}_{i}_clip.png")
            save_spectrogram(spectrogram, output_path)
    print_duration_summary(durations)

# Generate the spectrograms for each split in turn: train, validation, test.
for banner, split_files, split_dir in (
    ("Preprocessed Train Dataset:", train_files, TRAIN_DIR),
    ("\nPreprocessed Validation Dataset:", val_files, VAL_DIR),
    ("\nPreprocessed Test Dataset:", test_files, TEST_DIR),
):
    print(banner)
    process_split(split_files, split_dir)


## 6. Visualize some spectrogram examples

In [None]:
# Show up to three of the saved training spectrograms. os.listdir returns
# entries in arbitrary order and may include non-image entries, so keep only
# the .png files and sort them so the same samples are picked on every run.
spectrogram_names = sorted(f for f in os.listdir(TRAIN_DIR) if f.endswith('.png'))
sample_spectrogram_paths = [os.path.join(TRAIN_DIR, f) for f in spectrogram_names[:3]]
for path in sample_spectrogram_paths:
    spectrogram = plt.imread(path)
    plot_spectrogram(spectrogram, title=f"Spectrogram from {path}")