In [None]:
# Converts Opus files to MP3 with ffmpeg, then transcribes them to text using a small, free Whisper model.
# Accepts .zip, .opus, and .mp3 uploads; .mp3 files are transcribed directly without conversion.


!apt-get update -qq && apt-get install -y -qq ffmpeg
!pip install -q git+https://github.com/openai/whisper.git

import os
import zipfile
from google.colab import files
import whisper
import torch
from pathlib import Path
import tempfile
import shutil

In [None]:
3
def convert_opus_to_mp3(opus_list, bitrate='192k'):
    """Convert each .opus file to an .mp3 written next to it, using ffmpeg.

    ffmpeg is invoked with an argument list (no shell), so file names that
    contain quotes, spaces, ``$`` or backticks are passed through verbatim
    instead of being interpreted by a shell.

    Args:
        opus_list: Iterable of paths to .opus files.
        bitrate: Audio bitrate handed to ffmpeg's ``-ab`` option.

    Returns:
        List of .mp3 paths (same base name as the input, ``.mp3`` extension)
        for the conversions where ffmpeg exited successfully, in input order.
        Inputs whose conversion failed are reported and left out.
    """
    # Local import: the notebook's import cell does not bring in subprocess.
    import subprocess

    mp3_files = []
    for opus in opus_list:
        base, _ = os.path.splitext(opus)
        mp3 = f"{base}.mp3"
        command = [
            'ffmpeg', '-y',              # -y: overwrite an existing output file
            '-i', opus,
            '-acodec', 'libmp3lame',
            '-ab', bitrate,
            mp3,
        ]
        try:
            completed = subprocess.run(
                command,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            succeeded = completed.returncode == 0
        except OSError:
            # ffmpeg is missing or could not be started; treat it like a
            # failed conversion rather than aborting the whole batch.
            succeeded = False

        if succeeded:
            mp3_files.append(mp3)
            print(f"✅ Converted {opus} → {mp3}")
        else:
            print(f"⚠️ Failed to convert {opus}, skipping.")
    return mp3_files

def extract_media_from_zip(zip_path, extract_dir='.'):
    """Extract a ZIP archive and list the audio files it contained.

    Only files that were actually extracted from ``zip_path`` are returned;
    media files that already happen to live under ``extract_dir`` are not
    picked up.

    Args:
        zip_path: Path to the .zip archive.
        extract_dir: Directory to extract into. Defaults to the current
            working directory.

    Returns:
        Tuple ``(opus_files, mp3_files)``: two lists of paths, each path being
        ``extract_dir`` joined with the member's extracted location. Extension
        matching is case-insensitive.
    """
    opus_files, mp3_files = [], []
    with zipfile.ZipFile(zip_path, 'r') as archive:
        for member in archive.infolist():
            # extract() returns the sanitized path it really wrote to, which
            # can differ from the raw member name (e.g. stripped "..").
            extracted = archive.extract(member, extract_dir)
            if member.is_dir():
                continue
            media_path = os.path.join(
                extract_dir, os.path.relpath(extracted, extract_dir)
            )
            lowered = media_path.lower()
            if lowered.endswith('.opus'):
                opus_files.append(media_path)
            elif lowered.endswith('.mp3'):
                mp3_files.append(media_path)
    return opus_files, mp3_files

def transcribe_files(file_list, model_name='base'):
    """Transcribe audio files with a Whisper model.

    Args:
        file_list: Iterable of audio file paths.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        Dict mapping each existing input path to the ``"text"`` entry of the
        transcription result (empty string if that entry is absent). Paths
        that do not exist are reported and omitted.
    """
    whisper_model = whisper.load_model(model_name)
    # fp16 is requested only when CUDA is available.
    half_precision = torch.cuda.is_available()

    results = {}
    for audio_path in file_list:
        if os.path.exists(audio_path):
            print(f"\n📝 Transcribing {audio_path} …")
            output = whisper_model.transcribe(audio_path, fp16=half_precision)
            results[audio_path] = output.get("text", "")
        else:
            print(f"⚠️ File not found: {audio_path}, skipping.")
    return results

def download_files(file_list, archive_name='converted_media.zip'):
    """Send files to the browser via ``files.download``.

    Nothing happens for an empty list. A single file is downloaded as-is;
    several files are first written into a ZIP named ``archive_name`` and
    that archive is downloaded instead.

    Args:
        file_list: List of file paths to download.
        archive_name: File name of the ZIP created when there are multiple files.
    """
    if not file_list:
        return

    if len(file_list) != 1:
        print(f"📦 Zipping {len(file_list)} files into {archive_name} …")
        with zipfile.ZipFile(archive_name, 'w') as bundle:
            for member in file_list:
                bundle.write(member)
        print("⬇️ Downloading archive …")
        target = archive_name
    else:
        target = file_list[0]
        print(f"⬇️ Downloading {target} …")

    files.download(target)

In [None]:
print("⬆️  Upload audio file(s) (.mp3, .opus) or a .zip archive with them:")
uploads = files.upload()

# Bucket the uploaded names by extension; any other file type is ignored.
mp3_direct, opus_direct, zip_uploads = [], [], []
for fname in uploads.keys():
    ext = Path(fname).suffix.lower()
    if ext == ".mp3":
        mp3_direct.append(Path(fname))
    elif ext == ".opus":
        opus_direct.append(Path(fname))
    elif ext == ".zip":
        zip_uploads.append(Path(fname))

all_mp3s = []
seen_mp3 = set()
seen_opus = set(opus_direct)
# Every temp folder created below is recorded here so cleanup removes exactly
# those folders (mkdtemp creates them in the system temp dir, not in the CWD).
temp_dirs = []

try:
    # Handle each ZIP in its own temp folder
    for z in zip_uploads:
        tmp_dir = Path(tempfile.mkdtemp(prefix=f"{z.stem}_"))
        temp_dirs.append(tmp_dir)
        print(f"\n📂 Extracting ZIP → {tmp_dir}")
        with zipfile.ZipFile(z, "r") as zip_ref:
            zip_ref.extractall(tmp_dir)

        for p in tmp_dir.rglob("*"):
            if not p.is_file():
                continue
            suffix = p.suffix.lower()
            if suffix == ".mp3" and p not in seen_mp3:
                all_mp3s.append(p)
                seen_mp3.add(p)
            elif suffix == ".opus" and p not in seen_opus:
                opus_direct.append(p)
                seen_opus.add(p)

    # Convert every .opus (direct uploads and ZIP members) to .mp3.
    if opus_direct:
        print(f"\n🎙️  Converting {len(opus_direct)} .opus → .mp3 …")
        converted = convert_opus_to_mp3([str(p) for p in opus_direct])
        for mp3 in converted:
            p = Path(mp3)
            if p not in seen_mp3:
                all_mp3s.append(p)
                seen_mp3.add(p)

    for p in mp3_direct:
        if p not in seen_mp3:
            all_mp3s.append(p)
            seen_mp3.add(p)

    # Drop anything that is not on disk (e.g. a conversion that produced no file).
    all_mp3s = [str(p) for p in all_mp3s if p.exists()]
    if not all_mp3s:
        raise ValueError("❌ No MP3 files found or generated – upload problem?")

    print("\n✅ Final MP3 list:", all_mp3s)

    download_files(all_mp3s, archive_name="all_voice_notes_mp3.zip")
    transcripts = transcribe_files(all_mp3s)

    for fn, txt in transcripts.items():
        print(f"\n— Transcript for {Path(fn).name} —\n{txt}\n")
finally:
    # Remove the extraction folders even when the cell fails part-way.
    # NOTE(review): assumes files.download has finished reading the files by
    # the time this runs (it is called before transcription) — confirm in Colab.
    for tmp_dir in temp_dirs:
        shutil.rmtree(tmp_dir, ignore_errors=True)

⬆️  Upload audio file(s) (.mp3, .opus) or a .zip archive with them:


Saving Opus stuff.zip to Opus stuff.zip

📂 Extracting ZIP → /tmp/Opus stuff_ebmi9b83

🎙️  Converting 111 .opus → .mp3 …
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250322-WA0023.opus → /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250322-WA0023.mp3
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250404-WA0003.opus → /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250404-WA0003.mp3
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250328-WA0013.opus → /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250328-WA0013.mp3
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250225-WA0003.opus → /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250225-WA0003.mp3
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250316-WA0020.opus → /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250316-WA0020.mp3
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250129-WA0011.opus → /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250129-WA0011.mp3
✅ Converted /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20241210-WA0005.op

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 83.7MiB/s]



📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250322-WA0023.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250404-WA0003.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250328-WA0013.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250225-WA0003.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250316-WA0020.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250129-WA0011.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20241210-WA0005.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250324-WA0003.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250118-WA0010.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250316-WA0024.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250115-WA0014.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250212-WA0010.mp3 …

📝 Transcribing /tmp/Opus stuff_ebmi9b83/Opus stuff/PTT-20250320