In [1]:
!pip -q install -U transformers datasets accelerate librosa soundfile
!apt -yq install ffmpeg


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.[0m[31m
[0mReading package lists...
Building dependency tree

In [5]:
from google.colab import files
from IPython.display import Audio, display
import os

# Prompt for an upload and remember the name of the first file received.
print("⬆️ Choose an audio file (mp3/wav/m4a/mp4...)")
up = files.upload()  # pick a single file

# A cancelled upload leaves `up` empty; fail with a clear message instead of
# the bare StopIteration that next(iter(...)) would raise.
if not up:
    raise RuntimeError("No file was uploaded - re-run this cell and choose an audio file.")

# Only the first uploaded filename is used; any additional files are ignored.
audio_path = next(iter(up))

print("✅ Loaded:", audio_path)
display(Audio(audio_path))  # INLINE PLAYER


⬆️ Choose an audio file (mp3/wav/m4a/mp4...)


Saving Text to Speech.m4a to Text to Speech.m4a
✅ Loaded: Text to Speech.m4a


In [6]:
import torch
from transformers import pipeline

# Use GPU if available for speed (device index 0), otherwise fall back to CPU (-1).
device = 0 if torch.cuda.is_available() else -1

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",         # you can try "medium" or "large-v3" if you have VRAM
    chunk_length_s=30,                    # long-form chunking
    device=device,
    # Half precision is only requested when CUDA is present; otherwise the default dtype is kept.
    torch_dtype=torch.float16 if torch.cuda.is_available() else None,
    generate_kwargs={"task": "transcribe", "language": "en"},  # skip language detection
    ignore_warning=True,                  # silence chunking warning
)

# Transcribe ONCE. A timestamped run returns the full transcript under "text"
# in addition to the per-chunk segments under "chunks", so a separate plain
# pass over the same audio would only repeat the inference and could produce
# a TXT transcript that disagrees with the SRT cues.
res_chunks = pipe(audio_path, batch_size=8, return_timestamps=True)
res_text = res_chunks  # alias kept so any later cell reading `res_text` still works

# Plain text
text = res_chunks["text"]

# With per-chunk timestamps
chunks = res_chunks["chunks"]

print("=== TRANSCRIPT (plain) ===\n")
# Preview at most the first 1000 characters, marking truncation with an ellipsis.
print(text[:1000] + ("..." if len(text) > 1000 else ""))
print("\nTotal chars:", len(text))


Device set to use cpu


=== TRANSCRIPT (plain) ===

 Purdue University Fort Wayne II

Total chars: 32


In [7]:
from pathlib import Path
import pandas as pd

def srt_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset in seconds; ``None`` is treated as ``0.0``.

    Returns:
        The offset rounded to the nearest millisecond, zero-padded, with a
        comma separating seconds from milliseconds.
    """
    total_ms = int(round((0.0 if seconds is None else seconds) * 1000))
    # Peel off hours, then minutes, then seconds; what is left is milliseconds.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def save_txt(text: str, out_path: str):
    """Write ``text`` to ``out_path`` as UTF-8 and print a confirmation line.

    Args:
        text: Content to write.
        out_path: Destination file path; an existing file is overwritten.
    """
    destination = Path(out_path)
    destination.write_text(text, encoding="utf-8")
    print(f"💾 Saved TXT → {out_path}")

def save_srt(chunks, out_path: str):
    """Write transcript chunks to ``out_path`` as a SubRip (.srt) subtitle file.

    Args:
        chunks: Iterable of dicts, each holding a ``"timestamp"`` pair
            ``(start, end)`` in seconds and a ``"text"`` string. Either
            timestamp may be ``None``.
        out_path: Destination file path; written as UTF-8.

    A missing start is treated as 0.0. A missing end falls back to the next
    chunk's start time, or to the chunk's own start for the final chunk, so
    a cue is never written with an end of 00:00:00,000 that precedes its start.
    """
    chunks = list(chunks)  # materialise so the following chunk can be inspected
    lines = []
    for i, ch in enumerate(chunks, 1):
        t0, t1 = ch["timestamp"]
        if t0 is None:
            t0 = 0.0
        if t1 is None:
            # `i` is 1-based, so chunks[i] is the chunk AFTER the current one.
            following = chunks[i]["timestamp"][0] if i < len(chunks) else None
            t1 = t0 if following is None else max(t0, following)
        lines.append(str(i))
        lines.append(f"{srt_timestamp(t0)} --> {srt_timestamp(t1)}")
        lines.append(ch["text"].strip())
        lines.append("")  # blank line between cues
    Path(out_path).write_text("\n".join(lines), encoding="utf-8")
    print(f"💾 Saved SRT → {out_path}")

# Tabulate the transcript chunks (start/end rendered as SRT timestamps, text
# stripped of surrounding whitespace) and preview the first ten rows.
df = pd.DataFrame(
    [
        dict(
            start=srt_timestamp(chunk["timestamp"][0]),
            end=srt_timestamp(chunk["timestamp"][1]),
            text=chunk["text"].strip(),
        )
        for chunk in chunks
    ]
)
df.head(10)


Unnamed: 0,start,end,text
0,"00:00:00,000","00:00:02,000",Purdue University Fort Wayne II


In [8]:
# Name both outputs after the uploaded audio file, with its extension removed.
base = Path(audio_path).with_suffix("")
txt_path, srt_path = f"{base}.txt", f"{base}.srt"

# Write the plain transcript and the subtitle file.
save_txt(text, txt_path)
save_srt(chunks, srt_path)

# Hand both files to the Colab download helper.
from google.colab import files as colab_files
print("⬇️ Click to download:")
for out_file in (txt_path, srt_path):
    colab_files.download(out_file)


💾 Saved TXT → Text to Speech.txt
💾 Saved SRT → Text to Speech.srt
⬇️ Click to download:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>