# Restore Punctuation — Metadata Batch

Restores missing punctuation in TTS/ASR transcripts using a multilingual transformer model.
Useful when working with ASR outputs or crowd-sourced datasets that lack proper punctuation.

Reads a pipe-separated `metadata.csv` with no header row (one `filename|text` record per line) and writes a punctuated copy to a separate output file.

---

> GPU recommended for speed. Uses `deepmultilingualpunctuation`.

In [None]:
import os

# ════════════════════════════════════════════
# ⚙️  CONFIGURATION
# ════════════════════════════════════════════
# Every setting may be overridden by defining the environment variable before
# this cell runs; otherwise the default listed here is stored in os.environ.
for _name, _default in (
    ("INPUT_CSV", "/data/metadata.csv"),          # Input metadata
    ("OUTPUT_CSV", "/data/metadata_punct.csv"),   # Output metadata
    ("CSV_SEP", "|"),                             # Column separator
    ("MODEL_NAME", "kredor/punctuate-all"),       # HF model
):
    os.environ.setdefault(_name, _default)

# Read the effective values back so the rest of the notebook uses plain names.
INPUT_CSV, OUTPUT_CSV, CSV_SEP, MODEL_NAME = (
    os.environ[_key]
    for _key in ("INPUT_CSV", "OUTPUT_CSV", "CSV_SEP", "MODEL_NAME")
)

# Echo the paths and model in use.
print(
    f"Input:  {INPUT_CSV}",
    f"Output: {OUTPUT_CSV}",
    f"Model:  {MODEL_NAME}",
    sep="\n",
)

In [None]:
!pip install --quiet --break-system-packages deepmultilingualpunctuation tf-keras pandas tqdm

In [None]:
import pandas as pd
from tqdm import tqdm
from deepmultilingualpunctuation import PunctuationModel

# Load metadata.
# Read every column as text: with dtype inference a numeric-looking file name
# such as "0001" would come back as the integer 1 and be written out corrupted.
# keep_default_na=False stops literal transcripts like "null", "nan" or "n/a"
# from being parsed as missing values (and therefore dropped from the output).
read_opts = {"dtype": str, "keep_default_na": False}
try:
    df = pd.read_csv(INPUT_CSV, sep=CSV_SEP,
                     names=["file_name", "transcript"], header=None,
                     **read_opts)
except (pd.errors.ParserError, ValueError) as exc:
    # Best-effort fallback kept from the original behaviour: retry with pandas'
    # defaults (comma-separated, first row is the header). Say so explicitly,
    # because the resulting frame has a different layout than the primary path.
    print(f"Could not parse {INPUT_CSV} with separator {CSV_SEP!r} ({exc}); "
          "retrying as a comma-separated file with a header row")
    df = pd.read_csv(INPUT_CSV, **read_opts)

# The fallback takes its column names from the file, so check for the column
# the rest of the notebook needs before spending time loading the model.
if "transcript" not in df.columns:
    raise ValueError(
        f"{INPUT_CSV} has no 'transcript' column (found: {list(df.columns)})"
    )

print(f"Loaded {len(df)} rows")

# Load punctuation model
print(f"Loading model: {MODEL_NAME}...")
model = PunctuationModel(MODEL_NAME)

def restore(text):
    """Return *text* with punctuation restored by the loaded model.

    Missing values (``None``/NaN) and empty or whitespace-only strings are
    returned unchanged without calling the model.

    If the model raises, the original text is returned so that one bad row
    does not abort the whole batch, and a ``RuntimeWarning`` is emitted so the
    failure is visible instead of being silently swallowed.
    """
    if pd.isna(text) or not str(text).strip():
        return text
    try:
        return model.restore_punctuation(str(text))
    except Exception as exc:
        # Imported here so the function does not depend on a file-level import.
        import warnings

        warnings.warn(
            f"Punctuation restore failed ({type(exc).__name__}: {exc}); "
            f"keeping original text: {str(text)[:80]!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return text

# Create the output directory before the slow inference pass, so a missing
# directory cannot make the final save fail after every row has been processed.
os.makedirs(os.path.dirname(os.path.abspath(OUTPUT_CSV)), exist_ok=True)

# Run restore() over every transcript, showing a tqdm progress bar.
tqdm.pandas(desc="Restoring punctuation")
df["transcript"] = df["transcript"].progress_apply(restore)

# Save with the configured separator and without index or header row.
df.to_csv(OUTPUT_CSV, sep=CSV_SEP, index=False, header=False)
print(f"\nDone! Saved to {OUTPUT_CSV}")