# Setting up the tools (this takes a moment)

Before we start, we need to install some essential tools to process audio and video. Think of these as the software needed on your computer to run the next steps. This will take a few moments.

## Upload your audio or video file

Now, it's time to upload the file you want to transcribe. This could be an audio file (like an MP3 or WAV) or a video file (like an MP4 or MOV). Click the "Choose Files" button that appears below to select your file from your computer.

Once uploaded, the notebook will remember where your file is saved so we can use it in the next steps.

In [None]:

!apt -y install ffmpeg

# Python deps (local only, no paid API)
!pip install -q faster-whisper aksharamukha open-tamil pydub

# If you get GPU, this will be 🔥 fast. Otherwise it still runs on CPU.


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/

In [1]:
# 1) Verify GPU present
!nvidia-smi

Sun Sep 21 09:01:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.88                 Driver Version: 576.88         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   55C    P0             18W /   82W |       7MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from pathlib import Path
from google.colab import files

# Change this to your audio/video file (wav/mp3/m4a/mp4/mov etc. all fine)
# AUDIO_PATH = "/content/Solar_1.mp3"

# Upload a file
uploaded = files.upload()

# Get the path of the uploaded file
if uploaded:
  AUDIO_PATH = list(uploaded.keys())[0]
  print(f"Uploaded file: {AUDIO_PATH}")
else:
  AUDIO_PATH = None
  print("No file uploaded.")


assert Path(AUDIO_PATH).exists(), f"File not found: {AUDIO_PATH}"

ModuleNotFoundError: No module named 'google.colab'

Transcribe (offline) with faster-whisper

In [None]:


# 2) Install with CUDA-enabled ctranslate2 (safe to re-run)
!pip install -q --upgrade "faster-whisper>=1.0.0" "ctranslate2>=4.4,<5"

# 3) Run Whisper on GPU in float16
from faster_whisper import WhisperModel
import math, os

AUDIO_PATH = "/content/Solar_1.mp3"  # your file

# Pick a size that fits VRAM:
# "small" (~1.2GB), "medium" (~3GB), "large-v3" (~9GB)
MODEL_SIZE = "large-v3"

# Force GPU + FP16
model = WhisperModel(
    MODEL_SIZE,
    device="cuda",              # <-- THIS is the fix
    compute_type="float16",     # fast on T4/L4/A100
)

segments, info = model.transcribe(
    AUDIO_PATH,
    language="ta",                         # force Tamil
    vad_filter=True,                       # keeps segments clean
    vad_parameters=dict(min_silence_duration_ms=300),
    beam_size=1,                           # greedy = faster
    temperature=0.0,                       # faster/consistent
    condition_on_previous_text=False,      # speed boost
    word_timestamps=False,
)

print(f"Language: {info.language} | Duration: {info.duration:.1f}s")
seg_list = list(segments)
print("Segments:", len(seg_list))


Sat Sep 20 04:16:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P0             26W /   70W |    3206MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Helpers: time formatting + SRT writer

In [None]:
def srt_timestamp(t):
    # t in seconds → "HH:MM:SS,mmm"
    hours = int(t // 3600)
    minutes = int((t % 3600) // 60)
    seconds = int(t % 60)
    millis = int((t - math.floor(t)) * 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

def write_srt(segments, texts, path):
    with open(path, "w", encoding="utf-8") as f:
        for i, (seg, txt) in enumerate(zip(segments, texts), start=1):
            f.write(str(i) + "\n")
            f.write(srt_timestamp(seg.start) + " --> " + srt_timestamp(seg.end) + "\n")
            f.write(txt.strip() + "\n\n")


Tamil → Tanglish transliteration

In [None]:
# Option A: Aksharamukha (Tamil → Latin)
from aksharamukha import transliterate as ak_trans

def ta_to_tanglish_ak(text):
    # "ISO" gives clean Latin; "IAST" is also possible.
    # If you want looser "WhatsApp-style" spellings, see the open-tamil fallback below.
    return ak_trans.process("Tamil", "ISO", text)

# Option B (fallback): open-tamil (Azhagi flavor)
# NOTE: This creates ASCII-style phonetics many Tamil users type with.
try:
    from tamil.transliterate import azhagi
    def ta_to_tanglish_azhagi(text):
        # azhagi supports Unicode→ASCII style mapping
        return azhagi.Unicode2ASCII(text)
    HAVE_AZHAGI = True
except Exception:
    HAVE_AZHAGI = False

def ta_to_tanglish(text):
    try:
        out = ta_to_tanglish_ak(text)
        # Post-fix: lowercase for subtitle friendliness
        return out.lower()
    except Exception:
        if HAVE_AZHAGI:
            return ta_to_tanglish_azhagi(text).lower()
        else:
            # If everything fails, just return original
            return text


Build the outputs (Tanglish SRT + Tamil SRT + TXT)

In [None]:
tamil_lines = [s.text.strip() for s in seg_list]
tanglish_lines = [ta_to_tanglish(s.text) for s in seg_list]

# Files
TGL_SRT = "/content/output_tanglish.srt"
TA_SRT  = "/content/output_tamil.srt"
TGL_TXT = "/content/output_tanglish.txt"

# Write SRTs
write_srt(seg_list, tanglish_lines, TGL_SRT)
write_srt(seg_list, tamil_lines, TA_SRT)

# Plain Tanglish transcript
with open(TGL_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(tanglish_lines))

TGL_SRT, TA_SRT, TGL_TXT


('/content/output_tanglish.srt',
 '/content/output_tamil.srt',
 '/content/output_tanglish.txt')

(Optional) Clean-up rules for more “WhatsApp-Tanglish

In [None]:
import re

def casualize_tanglish(line):
    # vibe tweaks: aa → a, oo → u in some contexts, remove double spaces, etc.
    line = re.sub(r"\bnaa\b", "na", line)   # naa → na
    line = re.sub(r"oo", "u", line)         # food→fud style, careful but okay for captions
    line = re.sub(r"\s{2,}", " ", line)
    return line

tanglish_lines_relaxed = [casualize_tanglish(l) for l in tanglish_lines]
write_srt(seg_list, tanglish_lines_relaxed, "/content/output_tanglish_relaxed.srt")
"/content/output_tanglish_relaxed.srt"


More refined


In [None]:
# --- Install once per runtime ---
!apt -y install ffmpeg
!pip install -q aksharamukha

import re, math
from pathlib import Path
from aksharamukha import transliterate as ak_trans

# ==== Paths ====
IN_SRT_TA = Path("/content/output_tamil.srt")           # your Tamil SRT
OUT_SRT_TANGLISH = Path("/content/output_tanglish_v2.srt")
OUT_TXT_TANGLISH = Path("/content/output_tanglish_v2.txt")
OVERRIDES_PATH = Path("/content/tanglish_overrides.txt")

assert IN_SRT_TA.exists(), f"Missing: {IN_SRT_TA}"

# ==== Style toggles ====
PREFER_LA_FOR_ZH = True    # True: "la" (tamila) | False: "zh" (tamizh)

# ISO (with diacritics) -> ASCII social mapping
ISO_TO_ASCII = {
    # long vowels
    "ā": "aa", "ī": "ii", "ū": "uu",
    "ē": "e",  "ō": "o",
    # consonants w/ diacritics
    "ṭ": "t", "ḍ": "d", "ṇ": "n", "ṉ": "n", "ṅ": "ng", "ñ": "ny",
    "ḷ": "l", "ḻ": "zh", "ṟ": "r", "ś": "sh", "ṣ": "sh",
    "ṁ": "m", "ḥ": "h",
    # punctuation/oddities
    "’": "", "'": "",
}

def iso_to_ascii_social(s: str) -> str:
    # 1) diacritics -> ascii
    s = "".join(ISO_TO_ASCII.get(ch, ch) for ch in s)
    # 2) zh -> la if you prefer
    if PREFER_LA_FOR_ZH:
        s = re.sub(r"zh", "la", s)
    # 3) subtle colloquial passes (conservative)
    s = s.lower()
    s = re.sub(r"\s{2,}", " ", s) # Fixed: Added s as the third argument
    s = re.sub(r"\billai\b", "illa", s) # Fixed: Added s as the third argument
    s = re.sub(r"\beppadi\b", "epdi", s)   # vibe-y short form # Fixed: Added s as the third argument
    s = re.sub(r"\birukku\b", "irukku", s) # Fixed: Added s as the third argument
    s = re.sub(r"\bvaa?ng[a|o]?\b", "vaanga", s)  # broad catch; tweak later # Fixed: Added s as the third argument
    return s

# Default overrides (Tamil->Tanglish) applied BEFORE transliteration
DEFAULT_OVERRIDES = {
    "தமிழ்": "tamil",
    "வணக்கம்": "vanakkam",
    "பண்ணுங்க": "pannunga",
    "எப்படி": "epdi",
    "இல்லை": "illa",
    "சரி": "seri",
    "மன்னிக்கவும்": "mannikkavum",
    "நன்றி": "nandri",
    "உங்களுக்கு": "ungalukku",
    "என்ன": "enna",
    "வேண்டும்": "venum",
    "இருக்கு": "irukku"
}

# Create a template overrides file if it doesn’t exist
if not OVERRIDES_PATH.exists():
    OVERRIDES_PATH.write_text(
        "# Tamil\tTanglish (your preferred spelling)\n"
        "# Example lines:\n"
        "அண்ணன்\tannan\n"
        "எல்லாம்\tyellam\n",
        encoding="utf-8"
    )

# Load user overrides (tab-separated)
USER_OVERRIDES = {}
for line in OVERRIDES_PATH.read_text(encoding="utf-8").splitlines():
    line = line.strip()
    if not line or line.startswith("#") or "\t" not in line:
        continue
    ta, tg = line.split("\t", 1)
    USER_OVERRIDES[ta.strip()] = tg.strip()

TAMIL_RE = re.compile(r"[\u0B80-\u0BFF]")

def apply_overrides_tamil_stage(tamil_text: str) -> str:
    # replace longer strings first
    pairs = sorted({**DEFAULT_OVERRIDES, **USER_OVERRIDES}.items(),
                   key=lambda x: len(x[0]), reverse=True)
    out = tamil_text
    for ta, tg in pairs:
        out = re.sub(re.escape(ta), tg, out)
    return out

def tamil_to_tanglish(line: str) -> str:
    if not TAMIL_RE.search(line):
        return line
    pre = apply_overrides_tamil_stage(line)
    # Tamil -> ISO Latin
    iso = ak_trans.process("Tamil", "ISO", pre)
    # ISO -> ASCII Tanglish vibe
    return iso_to_ascii_social(iso)

def is_timestamp_line(line: str) -> bool:
    return bool(re.match(r"\d{2}:\d{2}:\d{2},\d{3}\s-->\s\d{2}:\d{2}:\d{2},\d{3}", line.strip()))

def convert_srt(in_path: Path, out_path: Path, out_txt: Path):
    tanglish_lines = []
    out_lines = []
    for raw in in_path.read_text(encoding="utf-8").splitlines():
        if not raw.strip():
            out_lines.append(raw)
            continue
        if raw.strip().isdigit() or is_timestamp_line(raw):
            out_lines.append(raw)
            continue
        new = tamil_to_tanglish(raw)
        out_lines.append(new)
        if new.strip() and not new.strip().isdigit() and not is_timestamp_line(new):
            tanglish_lines.append(new)

    out_path.write_text("\n".join(out_lines), encoding="utf-8")
    out_txt.write_text("\n".join(tanglish_lines), encoding="utf-8")

convert_srt(IN_SRT_TA, OUT_SRT_TANGLISH, OUT_TXT_TANGLISH)

print("Done ✅")
print("Tanglish SRT:", OUT_SRT_TANGLISH)
print("Tanglish TXT:", OUT_TXT_TANGLISH)
print("Overrides file:", OVERRIDES_PATH)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Done ✅
Tanglish SRT: /content/output_tanglish_v2.srt
Tanglish TXT: /content/output_tanglish_v2.txt
Overrides file: /content/tanglish_overrides.txt
