In [13]:
import io
import sys
import tarfile
import tempfile
from pathlib import Path

import soundfile as sf
import torchaudio

# Make the directory two levels above the notebook's working directory importable,
# so that the project-local `common` package can be imported below.
slash_models = Path().resolve().parent.parent
# Guard the insert so that re-running this cell does not keep piling duplicate
# entries onto sys.path (the cell stays idempotent within one kernel session).
if str(slash_models) not in sys.path:
    sys.path.insert(0, str(slash_models))

from common.logs.logger import Logger

In [14]:
# Language codes whose download folders are inspected by the cells below.
langs = ["de", "en", "es", "fr", "it", "ru"]

# Root of the download tree; resolved against the current working directory,
# so the notebook must be started from the folder that contains it.
downloads_root = Path.cwd().joinpath("www.repository.voxforge1.org", "downloads")

In [15]:
# Precursor check: verify wav files exist in each language before processing.
# Only the first archive (by file name) of every language is opened, as a cheap
# sanity check ahead of the full analysis.
for lang in langs:
    lang_root = downloads_root / lang / "Trunk" / "Audio" / "Main" / "16kHz_16bit"
    assert lang_root.exists(), f"Language root does not exist: {lang_root}"

    # Match on the full file name: Path.suffix is only the LAST extension, so it is
    # ".gz" (never ".tar.gz") for "x.tar.gz" and such archives would be skipped.
    # Sorting makes the choice of "first" archive deterministic across platforms.
    tar_files = sorted(
        p for p in lang_root.iterdir()
        if p.name.endswith((".tar", ".tgz", ".tar.gz"))
    )
    assert len(tar_files) > 0, f"No tar files found for language: {lang}"

    # Open first tar file and check for wav files
    first_tar = tar_files[0]
    with tarfile.open(first_tar, "r:*") as tar:
        # isfile() excludes directories/links whose name happens to end in ".wav"
        wav_members = [
            m for m in tar.getmembers()
            if m.isfile() and m.name.endswith(".wav")
        ]
        assert len(wav_members) > 0, f"No wav files found in {first_tar.name} for language: {lang}"

        print(f"✓ Language: {lang} - Found {len(wav_members)} wav files in {first_tar.name}")
        print("  Sample wav files:")
        for wav in wav_members[:5]:  # Print first 5 wav files as sample
            print(f"    - {wav.name}")
        print()

print("All languages have wav files. Proceeding with analysis...")

✓ Language: de - Found 10 wav files in 1337ad-20170321-amr.tgz
  Sample wav files:
    - 1337ad-20170321-amr/wav/de4-70.wav
    - 1337ad-20170321-amr/wav/de4-75.wav
    - 1337ad-20170321-amr/wav/de4-73.wav
    - 1337ad-20170321-amr/wav/de4-68.wav
    - 1337ad-20170321-amr/wav/de4-71.wav

✓ Language: en - Found 10 wav files in 1028-20100710-hne.tgz
  Sample wav files:
    - 1028-20100710-hne/wav/ar-05.wav
    - 1028-20100710-hne/wav/ar-01.wav
    - 1028-20100710-hne/wav/ar-07.wav
    - 1028-20100710-hne/wav/ar-02.wav
    - 1028-20100710-hne/wav/rp-29.wav

✓ Language: es - Found 10 wav files in abarzuaf-20101026-caf.tgz
  Sample wav files:
    - abarzuaf-20101026-caf/wav/es-0031.wav
    - abarzuaf-20101026-caf/wav/es-0027.wav
    - abarzuaf-20101026-caf/wav/es-0030.wav
    - abarzuaf-20101026-caf/wav/es-0028.wav
    - abarzuaf-20101026-caf/wav/es-0034.wav

✓ Language: fr - Found 10 wav files in 4h-20100505-vgm.tgz
  Sample wav files:
    - 4h-20100505-vgm/wav/fr-sb-764.wav
    - 4h-20100

In [16]:
def _is_tar_archive(path):
    """Return True if ``path`` is a regular file named ``*.tar``, ``*.tgz`` or ``*.tar.gz``."""
    # Path.suffix is only the last extension (".gz" for "x.tar.gz"), so compare
    # against the full file name instead of the suffix.
    return path.is_file() and path.name.endswith((".tar", ".tgz", ".tar.gz"))


def _wav_duration_seconds(tar, member):
    """Return the length in seconds of the wav stored as ``member`` in the open archive ``tar``."""
    with tar.extractfile(member) as fh:
        # Buffer the member in memory so soundfile gets a cheaply seekable stream
        # without a round trip through a temporary file on disk.
        wav_bytes = io.BytesIO(fh.read())
    # sf.info only parses the header; the samples themselves are not decoded.
    info = sf.info(wav_bytes)
    return info.frames / info.samplerate


def analyze_wav_files():
    """Log, per language, the number of wav files, their total and average duration.

    For every language in ``langs`` all tar archives below
    ``downloads_root/<lang>/Trunk/Audio/Main/16kHz_16bit`` are opened, the duration
    of each contained ``.wav`` member is printed, and one summary line per
    language is written through ``Logger``.

    Raises:
        AssertionError: if the audio folder of a language does not exist.
    """
    # The (misspelled) logger name is kept as-is because it ends up in the log path.
    # NOTE(review): the recorded run failed with FileNotFoundError for
    # "...\common\logs\logger.py\wav_analsis_log\.txt" - it looks like Logger builds
    # its output path from its own module file rather than a directory; that has to
    # be confirmed and fixed in common/logs/logger.py, not here.
    wav_analysis_log = Logger(name="wav_analsis_log")

    # extract all wav files for each language
    for lang in langs:
        lang_root = downloads_root / lang / "Trunk" / "Audio" / "Main" / "16kHz_16bit"
        assert lang_root.exists(), f"Language root does not exist: {lang_root}"

        tot_wavs = 0
        tot_sec = 0.0
        # sorted() gives a reproducible processing order regardless of the file system
        for tar_path in sorted(p for p in lang_root.iterdir() if _is_tar_archive(p)):
            with tarfile.open(tar_path, "r:*") as tar:
                # extractfile() returns None for non-regular members, so keep real files only
                wav_files = [
                    member for member in tar.getmembers()
                    if member.isfile() and member.name.endswith(".wav")
                ]
                tot_wavs += len(wav_files)
                for wav in wav_files:
                    wav_len = _wav_duration_seconds(tar, wav)
                    print(f"wav_len={wav_len} for file={wav.name}")
                    tot_sec += wav_len

        avg = tot_sec / tot_wavs if tot_wavs > 0 else 0
        wav_analysis_log.log(f"For lang={lang}, tot_wavs={tot_wavs}, tot_sec={tot_sec:.2f}, avg_len={avg:.2f}")

analyze_wav_files()

wav_len=5.75 for file=1337ad-20170321-amr/wav/de4-70.wav
wav_len=5.0 for file=1337ad-20170321-amr/wav/de4-75.wav
wav_len=6.0 for file=1337ad-20170321-amr/wav/de4-73.wav
wav_len=6.375 for file=1337ad-20170321-amr/wav/de4-68.wav
wav_len=4.5 for file=1337ad-20170321-amr/wav/de4-71.wav
wav_len=5.125 for file=1337ad-20170321-amr/wav/de4-76.wav
wav_len=5.375 for file=1337ad-20170321-amr/wav/de4-69.wav
wav_len=5.0 for file=1337ad-20170321-amr/wav/de4-72.wav
wav_len=5.75 for file=1337ad-20170321-amr/wav/de4-74.wav
wav_len=6.0 for file=1337ad-20170321-amr/wav/de4-67.wav
wav_len=4.75 for file=1337ad-20170321-bej/wav/de6-51.wav
wav_len=5.125 for file=1337ad-20170321-bej/wav/de6-55.wav
wav_len=4.5 for file=1337ad-20170321-bej/wav/de6-48.wav
wav_len=5.875 for file=1337ad-20170321-bej/wav/de6-49.wav
wav_len=5.0 for file=1337ad-20170321-bej/wav/de6-47.wav
wav_len=5.75 for file=1337ad-20170321-bej/wav/de6-50.wav
wav_len=5.375 for file=1337ad-20170321-bej/wav/de6-54.wav
wav_len=5.125 for file=1337ad-20

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\afkhe\\Programming\\Projects\\Machine Learning\\dynamic-auto-caption\\models\\common\\logs\\logger.py\\wav_analsis_log\\.txt'