In [None]:
import plotly.express as px
import numpy as np
import librosa
import soundfile as sf

# Source clip: raw_recordings/antonin_AI.mp3, an AI voice recording.
# The AI raw recordings were generated with https://luvvoice.com/
audio_file = 'raw_recordings/antonin_AI.mp3'

# Decode the file into a sample array `y` and its sampling rate `sr`
y, sr = librosa.load(path=audio_file)

# Draw the waveform as an interactive plotly line chart
fig = px.line(data_frame=y)
fig.show()

In [None]:
# Show the sampling rate that librosa.load returned for the clip loaded above
print(sr)

In [None]:
# Convert the decoded clip (`y` at `sr` Hz) into a 16 kHz, 16-bit, mono WAV file.
WAV_SAMPLE_RATE = 16000  # sampling rate (Hz) of the file written below

# Work on separate names instead of overwriting `y`, so re-running this cell
# always starts from the originally loaded signal.
y_src = np.asarray(y)

# Down-mix to mono if needed. librosa lays multi-channel audio out as
# (channels, samples), so the channels must be averaged, not the time axis.
y_mono = librosa.to_mono(y_src) if y_src.ndim > 1 else y_src

# Resample to the output rate. Writing samples decoded at `sr` under a 16 kHz
# header without resampling would change the playback speed and pitch.
if sr != WAV_SAMPLE_RATE:
    y_16k = librosa.resample(y_mono, orig_sr=sr, target_sr=WAV_SAMPLE_RATE)
else:
    y_16k = y_mono

# Scale to 16-bit PCM. Clip first so out-of-range samples saturate instead of
# wrapping around when cast to int16.
y_int16 = (np.clip(y_16k, -1.0, 1.0) * 32767).astype(np.int16)

# Save as 16kHz, 16-bit, Mono WAV
sf.write('raw_recordings/antonin_AI.wav', y_int16, samplerate=WAV_SAMPLE_RATE, subtype='PCM_16')


In [None]:
# Read the converted WAV back in to inspect the result
audio_file = 'raw_recordings/antonin_AI.wav'
y, sr = librosa.load(path=audio_file)

# Draw the reloaded waveform as an interactive plotly line chart
fig = px.line(data_frame=y)
fig.show()

In [None]:
import numpy as np

# Estimate the background-noise level of `y` from its quietest samples.
NOISE_SAMPLE_COUNT = 100000  # how many of the lowest-magnitude samples to average

# Flatten to 1-D and compare magnitudes: partitioning the signed samples would
# pick the most negative values (the loudest negative peaks), not the quietest ones.
magnitudes = np.abs(np.asarray(y).ravel())
if magnitudes.size == 0:
    raise ValueError("Cannot estimate the noise level of an empty signal")

# Never request more samples than the clip contains (np.partition raises otherwise)
noise_count = min(NOISE_SAMPLE_COUNT, magnitudes.size)

# After partitioning around index noise_count - 1, the first noise_count
# entries are the noise_count smallest magnitudes (in arbitrary order).
smallest_samples = np.partition(magnitudes, noise_count - 1)[:noise_count]

# Calculate the average
average_noise_val = np.mean(smallest_samples)

# Print the result
print(f"Average of the smallest 100K samples (noise value): {average_noise_val}")


In [None]:
import sounddevice as sd
import numpy as np
import soundfile as sf
import os
import time


def detect_speech(audio, threshold=None):
    """
    Reports whether any sample's magnitude exceeds the noise threshold.

    Parameters:
        audio: array-like of samples.
        threshold: amplitude a sample must strictly exceed to count as speech.
            When None (the default), the module-level THRESHOLD is used.

    Returns:
        True if at least one sample has an absolute value greater than the
        threshold, False otherwise (including for an empty input).
    """
    if threshold is None:
        # Looked up at call time so the configuration cell can be edited and re-run
        threshold = THRESHOLD

    samples = np.asarray(audio)
    if samples.size == 0:
        # np.max raises on an empty array; an empty recording contains no speech
        return False

    return bool(np.max(np.abs(samples)) > threshold)

def center_audio(audio, target_samples=None):
    """
    Returns `audio` centered in a clip of exactly `target_samples` samples.

    Shorter input is zero-padded on both sides (the extra sample goes after the
    audio when the padding is odd); longer input is trimmed to its middle part.

    Parameters:
        audio: 1-D array of samples.
        target_samples: length of the returned clip in samples. When None
            (the default), the module-level SAMPLES is used.

    Returns:
        An array of `target_samples` samples.
    """
    if target_samples is None:
        # Looked up at call time so the configuration cell can be edited and re-run
        target_samples = SAMPLES

    current_length = len(audio)
    if current_length >= target_samples:
        # Too long (or exact): keep the middle `target_samples` samples
        start = (current_length - target_samples) // 2
        return audio[start:start + target_samples]

    # Split the missing length into leading and trailing silence
    total_padding = target_samples - current_length
    pad_before = total_padding // 2
    pad_after = total_padding - pad_before

    return np.pad(audio, (pad_before, pad_after), 'constant')

def normalize_audio(audio):
    """
    Peak-normalizes the audio and converts it to int16.

    The signal is divided by its largest absolute sample and scaled by 32767,
    so the loudest sample maps to +/-32767.

    Parameters:
        audio: array-like of samples.

    Returns:
        An int16 array with the same shape as the input. Empty or all-zero
        input yields an int16 array of the same shape (all zeros for silence)
        instead of dividing by zero.
    """
    samples = np.asarray(audio)
    if samples.size == 0:
        # np.max raises on an empty array; there is nothing to scale
        return samples.astype(np.int16)

    peak = np.max(np.abs(samples))
    if peak == 0:
        # Pure silence: dividing by the peak would produce NaNs
        return np.zeros(samples.shape, dtype=np.int16)

    return np.int16(samples / peak * 32767)

def record_digit(digit, index):
    """
    Records one spoken digit from the default input device and saves it as a WAV file.

    Plays a short beep, records MAX_RECORDING_TIME seconds of mono audio at
    SAMPLE_RATE, cuts the span between the first and last sample above
    THRESHOLD, centers it with center_audio, normalizes it with
    normalize_audio and writes it to OUTPUT_DIR as
    c<class>_p<PERSON_ID>_s<SET_NUMBER>.wav (16-bit PCM).

    Parameters:
        digit: the word to record; must be an element of DIGITS.
        index: zero-based position of this recording in the current batch
            (used only for the progress message).

    Returns:
        True if speech was detected and a file was written, False otherwise.
    """
    # Progress is reported against the actual batch size, not a fixed 10
    print(f"Recording '{digit}' ({index+1}/{len(DIGITS)}).")
    print("Start speaking after the beep...")

    # Beep sound as a cue to start speaking: 0.2 s sine at 440 Hz
    beep_times = np.arange(0, 0.2, 1/SAMPLE_RATE)
    sd.play(np.sin(2 * np.pi * 440 * beep_times), SAMPLE_RATE)
    sd.wait()

    # Record audio (MAX_RECORDING_TIME seconds) and block until it is finished
    recording = sd.rec(int(MAX_RECORDING_TIME * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='float32')
    sd.wait()

    # Flatten the (frames, 1) recording to a 1D array
    audio = recording.flatten()

    if not detect_speech(audio):
        print("‚ùå No significant speech detected. Please try again.\n")
        return False

    print("üîä Speech detected!")

    # Find the first and last sample above the threshold
    sound_indices = np.where(np.abs(audio) > THRESHOLD)[0]
    start_index = sound_indices[0]
    end_index = sound_indices[-1]

    # Extract the spoken part and center it in a fixed-length clip
    spoken_audio = audio[start_index:end_index + 1]
    centered_audio = center_audio(spoken_audio)

    # Normalize and convert to int16
    normalized_audio = normalize_audio(centered_audio)

    # Naming format: cX_pYYYY_sZZ.wav
    # NOTE(review): X is the position of `digit` in DIGITS, so recording with a
    # shortened DIGITS list changes the class number in the file name — confirm intent.
    filename = f"c{DIGITS.index(digit)}_p{PERSON_ID}_s{SET_NUMBER}.wav"
    output_file = os.path.join(OUTPUT_DIR, filename)

    # Save as 16-bit PCM WAV
    sf.write(output_file, normalized_audio, SAMPLE_RATE, subtype='PCM_16')
    print(f"üíæ Saved: {output_file}\n")
    return True




In [None]:
# Batch recorder: configuration followed by one recording pass over DIGITS.

# --- Audio format ---
SAMPLE_RATE = 16000  # 16 kHz
TARGET_DURATION = 2  # length of every saved clip, in seconds
SAMPLES = SAMPLE_RATE * TARGET_DURATION  # the same length in samples (32,000)
MAX_RECORDING_TIME = 3  # seconds captured per attempt
THRESHOLD = 0.01  # amplitude above which a sample counts as speech

# --- Words to record, in order; the list position is the class number in the file name ---
DIGITS = ['nula', 'jedna', 'dva', 'tri', 'ctyri', 'pet', 'sest', 'sedm', 'osm', 'devet']
#DIGITS = ['sest'] # add the zero from batch 01 and the 6 from batch 5

# --- Speaker / session identification ---
PERSON_ID = '2501'  # Change this to your ID
SET_NUMBER = '01'  # Change this for multiple sets (01 to 05)
OUTPUT_DIR = 'p' + PERSON_ID
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Walk through the words one by one, waiting for the user before each take
for index, digit in enumerate(DIGITS):
    input(f"Press Enter to record '{digit}'...")
    record_digit(digit, index)
    # Brief pause before prompting for the next word
    time.sleep(1)

print("‚úÖ All recordings completed!")

In [None]:
def print_audio_details(directory):
    """
    Prints a short report for every '.wav' / '.opus' file in `directory`.

    For each matching file the report lists:
    - Sample Rate
    - File Size (in KB)
    - Audio Length (in seconds)
    - Subtype (labelled "Bit Depth")

    Files that cannot be opened are reported with the error and skipped.
    """
    print(f"\nüìÇ Scanning directory: {directory}\n")

    for file in os.listdir(directory):
        # Skip anything that does not carry one of the audio extensions
        if not file.endswith(('.wav', '.opus')):
            continue

        file_path = os.path.join(directory, file)

        # On-disk size, converted from bytes to KB
        file_size_kb = os.path.getsize(file_path) / 1024

        try:
            # Read the header information; the file is closed on leaving the block
            with sf.SoundFile(file_path) as snd:
                sample_rate = snd.samplerate
                subtype = snd.subtype
                audio_length = len(snd) / sample_rate

            print(f"üéµ File: {file}")
            print(f"   Sample Rate: {sample_rate} Hz")
            print(f"   File Size: {file_size_kb:.2f} KB")
            print(f"   Audio Length: {audio_length:.2f} seconds")
            print(f"   Bit Depth: {subtype}\n")

        except Exception as e:
            print(f"‚ùå Could not load {file}: {e}")



In [None]:
# Folder whose recordings should be summarised
audio_directory = "p2501_2"
print_audio_details(directory=audio_directory)