In [None]:
!pip install torchcodec
!pip install -q git+https://github.com/openai/whisper.git pydub spacy soundfile speechbrain transformers datasets accelerate
!python -m spacy download en_core_web_sm
!apt-get -qq install ffmpeg
!pip install -q gradio
import gradio as gr
from google.colab import files



Collecting torchcodec
  Using cached torchcodec-0.7.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Using cached torchcodec-0.7.0-cp312-cp312-manylinux_2_28_x86_64.whl (1.4 MB)
Installing collected packages: torchcodec
Successfully installed torchcodec-0.7.0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.1/754.1 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web

In [None]:
# ==========================================================================================
#  Definitive Working Code: Text-Focused PII Redaction & Re-Synthesis
#  This version resolves the CPU/GPU device error for the final audio generation.
# ==========================================================================================

import os
import sys
import subprocess
import re
import torch
import soundfile as sf
import numpy as np
from IPython.display import display, clear_output

#@title 🎙️ Run the Final, Error-Free Redaction Pipeline
#@markdown ### 1. Configure AI Model Settings
#@markdown `small.en` is a good balance of speed and accuracy for transcription.
# Checkpoint name handed to whisper.load_model() further down in this cell.
# The "#@param" annotation exposes the three listed choices as a Colab form field.
whisper_model_size = "small.en" #@param ["base.en", "small.en", "medium.en"]

def setup_environment():
    """Make sure the pipeline's third-party dependencies are importable.

    Tries to import whisper, spaCy (and load ``en_core_web_sm``), speechbrain
    and torchcodec. Only when that probe fails are the packages installed
    with pip and the spaCy model downloaded.

    Raises:
        subprocess.CalledProcessError: if the pip install or the spaCy model
            download exits with a non-zero status (both use ``check=True``).
    """
    # Announce a check, not an install: on a warm runtime nothing gets installed,
    # and the old unconditional "Installing..." banner was misleading.
    print("🔎 Checking for required libraries...")
    try:
        # Imports are used purely as an availability probe.
        import whisper
        import spacy
        spacy.load("en_core_web_sm")
        import speechbrain
        import torchcodec
        print("✅ Dependencies are already installed.")
    except (ImportError, OSError):
        # OSError is caught alongside ImportError, presumably because
        # spacy.load() signals a missing model that way - TODO confirm.
        print("⏳ Installing all required libraries... (This may take a moment on the first run)")
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/openai/whisper.git", "spacy", "soundfile", "speechbrain==0.5.16", "transformers", "datasets", "accelerate", "huggingface_hub", "torch", "torchcodec", "numpy"], check=True)
        subprocess.run([sys.executable, "-m", "spacy", "download", "-q", "en_core_web_sm"], check=True)
        # Drop the installer chatter from the cell output before reporting success.
        clear_output(wait=True)
        print("✅ Dependencies installed successfully.")

# ----------------------------
#  Start of Script Execution
# ----------------------------
# Run the dependency check/installation first; the imports below are deliberately
# placed after it because setup_environment() may be what installs these packages.
setup_environment()

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
import whisper
import spacy

# ----- 1. Load All Models -----
# Everything created here is a module-level global that the functions defined
# later in this cell (transcribe_audio, detect_and_redact_text, synthesize_speech)
# read directly.
print("🧠 Loading all AI models... (This may take several minutes)")
# Device string used for the TTS model, vocoder, speaker encoder and tensors below.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("   -> Loading Whisper for Speech-to-Text...")
whisper_model = whisper.load_model(whisper_model_size)
print("   -> Loading spaCy for PII Detection...")
nlp = spacy.load("en_core_web_sm")

print("   -> Loading SpeechT5 for Text-to-Speech...")
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# The TTS model and vocoder are moved to `device`; synthesize_speech() moves its
# input tensors to the same device before generating.
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

print("   -> Generating speaker embeddings for TTS voice...")
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
# Model files are stored under /tmp/<spk_model_name>.
speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device}, savedir=os.path.join("/tmp", spk_model_name))

print("   -> Creating a reference audio sample to generate a voice...")
# The reference voice is the first example streamed from LibriSpeech "clean"/train.100.
# NOTE: the synthesized audio therefore uses this sample's voice, not the voice in
# the file the user uploads.
dataset = load_dataset("librispeech_asr", "clean", split="train.100", streaming=True)
sample = next(iter(dataset))

with torch.no_grad():
    # Add a leading batch axis to the raw sample array before encoding.
    audio_array = torch.tensor(sample["audio"]["array"]).unsqueeze(0).to(device)
    speaker_embeddings = speaker_model.encode_batch(audio_array)
    # Normalising along dim=2 implies encode_batch returns a 3-D tensor
    # (presumably batch x 1 x embedding - TODO confirm).
    speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
    # Drop singleton axes and round-trip through NumPy on the CPU.
    speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
# Back to a (CPU) tensor with a leading batch axis; moved to `device` at synthesis time.
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)
print("✅ All models loaded successfully.")


# ----- 2. Define Core Functions -----
# Regex detectors keyed by PII type; detect_and_redact_text() uses the key as the
# placeholder label, e.g. "[EMAIL]".
PII_REGEX = {
    # Addresses containing a literal "@". An address transcribed in spoken form
    # ("name at example.com") does not match this pattern.
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # Ten digits grouped 3-3-4 with optional parentheses around the first group and
    # an optional "-", "." or whitespace separator. Because the separators are
    # optional, any run of ten consecutive digits also matches.
    "PHONE": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
}

def transcribe_audio(filepath):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        filepath: path passed straight to ``whisper_model.transcribe``.

    Returns:
        The value stored under ``'text'`` in the transcription result.
    """
    print("\n[Step 1/3] Converting audio to text...")
    return whisper_model.transcribe(filepath)['text']

def detect_and_redact_text(text):
    """Replace detected PII in ``text`` with bracketed type placeholders.

    Detection combines the patterns in ``PII_REGEX`` with spaCy ``PERSON``
    entities. Spans reported by different detectors may overlap (for example
    a PERSON entity that lies inside an e-mail address). Overlapping spans
    are merged into a single span before substitution, so every stretch of
    text is replaced exactly once and an already-inserted placeholder is
    never sliced apart by a later replacement.

    Args:
        text: transcript to scan.

    Returns:
        ``text`` with each merged PII span replaced by ``[TYPE]``. A merged
        span carries the type of its earliest-starting (then longest) member.
        The input is returned unchanged when nothing is detected.
    """
    print("[Step 2/3] Detecting PII and redacting text...")
    entities = []
    # Regex-based detection
    for pii_type, pattern in PII_REGEX.items():
        for match in re.finditer(pattern, text):
            entities.append({"text": match.group(0), "type": pii_type, "start": match.start(), "end": match.end()})
    # spaCy-based NER for names
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities.append({"text": ent.text, "type": "PERSON", "start": ent.start_char, "end": ent.end_char})

    if not entities:
        return text

    # Walk spans left to right (longest first when starts tie) and fold any span
    # that begins inside the previously kept one into it. The sort is stable, so
    # identical spans keep their detection order and the first detector's label wins.
    merged = []
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        if merged and entity["start"] < merged[-1]["end"]:
            merged[-1]["end"] = max(merged[-1]["end"], entity["end"])
        else:
            merged.append(dict(entity))

    # Substitute from the end of the string backwards so that the offsets of the
    # spans still to be processed remain valid.
    redacted_text = text
    for entity in reversed(merged):
        replacement = f"[{entity['type']}]"
        start, end = entity['start'], entity['end']
        redacted_text = redacted_text[:start] + replacement + redacted_text[end:]

    return redacted_text

def synthesize_speech(text, greeting="Here is the redacted audio. "):
    """Synthesize ``greeting + text`` with SpeechT5 and return the waveform.

    The spoken introduction used to be hard-coded and asserted that the audio
    was "generated on Monday evening" regardless of when the cell ran. It is
    now a parameter whose default makes no claim about the date or time.

    The voice is determined by the module-level ``speaker_embeddings``.

    Args:
        text: the (redacted) transcript to speak.
        greeting: text spoken before ``text``; pass ``""`` to omit it.

    Returns:
        The output of ``tts_model.generate_speech`` moved to the CPU and
        converted to a NumPy array.
    """
    full_text = greeting + text

    print("[Step 3/3] Converting redacted text to new audio...")
    inputs = tts_processor(text=full_text, return_tensors="pt")

    # Move all tensors to the correct device before generating speech
    input_ids = inputs["input_ids"].to(device)
    speaker_embeddings_device = speaker_embeddings.to(device)

    speech = tts_model.generate_speech(input_ids, speaker_embeddings_device, vocoder=tts_vocoder)
    return speech.cpu().numpy()

# ----- 3. Main Execution Block -----
# Upload -> transcribe -> redact -> re-synthesize, writing three artefacts to the
# current working directory. `files` is the google.colab helper imported in the
# notebook's first cell, so that cell must have been run beforehand.
print("-" * 50)
print("📎 Please upload your audio file (e.g., MP3, WAV, M4A)...")
try:
    uploaded_files = files.upload()
    if not uploaded_files:
        # Falls through to the generic handler below, which prints the message.
        raise Exception("No file was uploaded.")

    # Only the first uploaded file is processed; any others are ignored.
    input_filename = list(uploaded_files.keys())[0]

    print(f"\n✅ Successfully uploaded '{input_filename}'. Starting pipeline...")
    print("-" * 50)

    # --- Execute Pipeline ---
    original_text = transcribe_audio(input_filename)
    # NOTE: the unredacted transcript is both written to disk and echoed into the
    # cell output, so any PII it contains persists in the saved notebook.
    with open("original_transcript.txt", "w") as f: f.write(original_text)
    print(f"   -> Original Transcript: \"{original_text}\"")

    redacted_text = detect_and_redact_text(original_text)
    with open("redacted_transcript.txt", "w") as f: f.write(redacted_text)

    print("\n" + "="*50)
    print("✅ Redacted Text Sentence:")
    print(redacted_text)
    print("="*50 + "\n")

    final_speech = synthesize_speech(redacted_text)
    output_tts_filename = "resynthesized_redacted_audio.wav"
    # 16000 Hz is hard-coded; presumably the SpeechT5 vocoder's output rate - TODO confirm.
    sf.write(output_tts_filename, final_speech, samplerate=16000)
    print(f"   -> Saved new audio to '{output_tts_filename}'")

    print("-" * 50)
    print("\n✅ End-to-end process complete!")
    print("📁 You can now download the generated files from the file browser on the left.")

except Exception as e:
    # Every failure in the pipeline is reduced to a one-line message here; no
    # traceback is shown. Upload failures are recognised by exception type name.
    if "FileUploadError" in str(type(e)):
        print("\nUpload canceled or failed. Please run the cell again.")
    else:
        print(f"\nAn unexpected error occurred: {e}")

⏳ Installing all required libraries... (This may take a moment on the first run)


  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


✅ Dependencies are already installed.


  from speechbrain.pretrained import EncoderClassifier


🧠 Loading all AI models... (This may take several minutes)
   -> Loading Whisper for Speech-to-Text...


100%|███████████████████████████████████████| 461M/461M [00:18<00:00, 26.5MiB/s]


   -> Loading spaCy for PII Detection...
   -> Loading SpeechT5 for Text-to-Speech...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


   -> Generating speaker embeddings for TTS voice...


model.safetensors:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/hyperparams.yaml' -> '/tmp/speechbrain/spkrec-xvect-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (o

embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/embedding_model.ckpt' -> '/tmp/speechbrain/spkrec-xvect-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /tmp/speechbrain/spkrec-xvect-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/mean_var_norm_emb.ckpt' -> '/tmp/speechbrain/spkrec-xvect-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /tmp/speechbrain/spkrec-xvect-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/classifier.ckpt' -> '/tmp/speechbrain/spkrec-xvect-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /tmp/speechbrain/spkrec-xvect-voxceleb/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/label_encoder.txt' -> '/tmp/speechbrain/spkrec-xvect-voxceleb/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /tmp/speechbrain/spkrec-xvect-voxceleb/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /tmp/speechbrain/spkrec-xvect-voxceleb/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /tmp/speechbrain/spkrec-xvect-voxceleb/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /tmp/speechbra

   -> Creating a reference audio sample to generate a voice...


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

✅ All models loaded successfully.
--------------------------------------------------
📎 Please upload your audio file (e.g., MP3, WAV, M4A)...


Saving vardhuaudi.mp3 to vardhuaudi.mp3

✅ Successfully uploaded 'vardhuaudi.mp3'. Starting pipeline...
--------------------------------------------------

[Step 1/3] Converting audio to text...




   -> Original Transcript: " So, my name is Priya Sharma. Please send the documents to my email priya.sharma123 at example.go.in My account number is 8765432112. If you have any questions, my direct line is 212-555-0182. Thank you."
[Step 2/3] Detecting PII and redacting text...

✅ Redacted Text Sentence:
 So, my name is [PERSON]. Please send the documents to my email priya.sharma123 at example.go.in My account number is [PHONE]. If you have any questions, my direct line is [PHONE]. Thank you.

[Step 3/3] Converting redacted text to new audio...
   -> Saved new audio to 'resynthesized_redacted_audio.wav'
--------------------------------------------------

✅ End-to-end process complete!
📁 You can now download the generated files from the file browser on the left.


In [None]:
!pip install torch==2.1.0 torchaudio==2.1.0 speechbrain==0.5.10 --quiet
!pip install torchcodec==0.7.0 --quiet


[31mERROR: Could not find a version that satisfies the requirement torch==2.1.0 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.0[0m[31m
[0m

In [None]:
# ==========================================================================================
#  Stable Audio PII Redaction + Voice-Preserving Resynthesis Pipeline
#  Compatible with Colab using stable package versions
# ==========================================================================================

# ----------------------------
#  1️⃣ Install & Setup Environment
# ----------------------------
!pip install torch==2.1.0 torchaudio==2.1.0 speechbrain==0.5.10 torchcodec==0.7.0 transformers datasets soundfile spacy -q
!python -m spacy download en_core_web_sm

import os
import re
import torch
import soundfile as sf
import spacy
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
from IPython.display import clear_output

# Clear the installer output produced by the pip/spaCy commands above.
clear_output(wait=True)
print("✅ Environment setup complete!")

# ----------------------------
#  2️⃣ Configure Device & Models
# ----------------------------
# The objects created in this section are module-level globals read by the
# functions defined in section 3.
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model_size = "small.en"  # choose base.en, small.en, or medium.en

# Import whisper here after installing
import whisper
whisper_model = whisper.load_model(whisper_model_size)
nlp = spacy.load("en_core_web_sm")

# TTS Setup: processor, acoustic model and vocoder; the latter two live on `device`.
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Speaker encoder used to derive the embedding that conditions the TTS voice.
# Model files are stored under /tmp/<spk_model_name>.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name)
)

# Load a reference audio for speaker embedding: the first example streamed from
# LibriSpeech "clean"/train.100.
# NOTE: the embedding is computed from this sample, not from the uploaded
# recording, so the output voice is the sample speaker's, not the uploader's.
dataset = load_dataset("librispeech_asr", "clean", split="train.100", streaming=True)
sample = next(iter(dataset))

# Convert to tensor (adding a leading batch axis) and extract embeddings
audio_tensor = torch.tensor(sample["audio"]["array"]).unsqueeze(0).to(device)
with torch.no_grad():
    speaker_embeddings = speaker_model.encode_batch(audio_tensor)
    # dim=2 implies a 3-D result (presumably batch x 1 x embedding - TODO confirm).
    speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
    speaker_embeddings = speaker_embeddings.squeeze().cpu()
# Restore a leading batch axis; the tensor stays on the CPU until synthesis time.
speaker_embeddings = speaker_embeddings.unsqueeze(0)
print("✅ All models loaded successfully!")

# ----------------------------
#  3️⃣ Core Functions
# ----------------------------
# Regex detectors keyed by PII type; the key becomes the placeholder label
# (e.g. "[PHONE]") inserted by detect_and_redact_text().
PII_REGEX = {
    # Requires a literal "@"; an address transcribed as "name at example.com"
    # is not matched.
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # Ten digits grouped 3-3-4, optional parentheses and optional "-", "." or
    # whitespace separators; any run of ten consecutive digits also matches.
    "PHONE": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
}

def transcribe_audio(filepath):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        filepath: path passed straight to ``whisper_model.transcribe``.

    Returns:
        The value stored under ``'text'`` in the transcription result.
    """
    print("\n[Step 1/3] Transcribing audio...")
    transcription = whisper_model.transcribe(filepath)
    transcript = transcription['text']
    return transcript

def detect_and_redact_text(text):
    """Replace detected PII in ``text`` with bracketed type placeholders.

    Detection combines the patterns in ``PII_REGEX`` with spaCy ``PERSON``
    entities. Spans from different detectors may overlap; they are merged
    before substitution so that each stretch of text is replaced once and a
    placeholder inserted earlier is never cut apart by a later replacement.

    Args:
        text: transcript to scan.

    Returns:
        ``text`` with each merged PII span replaced by ``[TYPE]``. A merged
        span carries the type of its earliest-starting (then longest) member.
        The input is returned unchanged when nothing is detected.
    """
    print("[Step 2/3] Detecting PII and redacting text...")
    entities = []

    # Regex-based detection
    for pii_type, pattern in PII_REGEX.items():
        for match in re.finditer(pattern, text):
            entities.append({"text": match.group(0), "type": pii_type, "start": match.start(), "end": match.end()})

    # spaCy NER for PERSON
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities.append({"text": ent.text, "type": "PERSON", "start": ent.start_char, "end": ent.end_char})

    if not entities:
        return text

    # Merge overlapping spans: process left to right (longest first on equal
    # starts) and absorb any span that begins inside the previously kept one.
    # sorted() is stable, so identical spans keep their detection order.
    merged = []
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        if merged and entity["start"] < merged[-1]["end"]:
            merged[-1]["end"] = max(merged[-1]["end"], entity["end"])
        else:
            merged.append(dict(entity))

    # Redact from the end backwards so earlier offsets stay valid.
    redacted_text = text
    for entity in reversed(merged):
        redacted_text = redacted_text[:entity['start']] + f"[{entity['type']}]" + redacted_text[entity['end']:]

    return redacted_text

def synthesize_speech(text):
    """Speak a fixed greeting followed by ``text`` using the SpeechT5 models.

    The voice is determined by the module-level ``speaker_embeddings``.

    Args:
        text: the (redacted) transcript to speak.

    Returns:
        The output of ``tts_model.generate_speech`` moved to the CPU and
        converted to a NumPy array.
    """
    print("[Step 3/3] Generating voice-preserved audio...")
    spoken_text = "Here is the redacted audio. " + text

    # Tokenise, then place both model inputs on the configured device.
    token_ids = tts_processor(text=spoken_text, return_tensors="pt")["input_ids"].to(device)
    voice = speaker_embeddings.to(device)

    waveform = tts_model.generate_speech(token_ids, voice, vocoder=tts_vocoder)
    return waveform.cpu().numpy()

# ----------------------------
#  4️⃣ Run Full Pipeline
# ----------------------------
# Upload -> transcribe -> redact -> re-synthesize; three artefacts are written to
# the current working directory.
from google.colab import files
print("-"*50)
print("📎 Upload your audio file (MP3, WAV, M4A)...")

uploaded_files = files.upload()
# Only the first uploaded file is used.
# NOTE(review): if nothing was uploaded the mapping is empty and [0] raises IndexError.
input_filename = list(uploaded_files.keys())[0]
print(f"\n✅ Uploaded '{input_filename}'. Starting pipeline...")
print("-"*50)

# Transcribe
# NOTE: the unredacted transcript is written to disk and printed into the cell
# output, so any PII it contains persists in the saved notebook.
original_text = transcribe_audio(input_filename)
with open("original_transcript.txt", "w") as f: f.write(original_text)
print(f"Original Transcript: {original_text}")

# Redact
redacted_text = detect_and_redact_text(original_text)
with open("redacted_transcript.txt", "w") as f: f.write(redacted_text)
print(f"\nRedacted Transcript: {redacted_text}")

# Text-to-speech on the redacted transcript. The voice comes from the
# module-level speaker_embeddings, not from the uploaded recording.
final_audio = synthesize_speech(redacted_text)
output_filename = "voice_preserved_redacted_audio.wav"
# 16000 Hz is hard-coded; presumably the SpeechT5 vocoder's output rate - TODO confirm.
sf.write(output_filename, final_audio, samplerate=16000)
print(f"\n✅ Audio saved as '{output_filename}'")
print("You can now download the generated files from the left file browser.")


✅ Environment setup complete!


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/tmp/speechbrain/spkrec-xvect-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /tmp/speechbrain/spkrec-xvect-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/tmp/speechbrain/spkrec-xvect-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /tmp/speechbrain/spkrec-xvect-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/tmp/speechbrain/spkrec-xvect-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /tmp/speechbrain/spkrec-xvect-voxceleb/mean_var_norm_emb.ck

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

✅ All models loaded successfully!
--------------------------------------------------
📎 Upload your audio file (MP3, WAV, M4A)...


Saving vardhuaudi.mp3 to vardhuaudi (6).mp3

✅ Uploaded 'vardhuaudi (6).mp3'. Starting pipeline...
--------------------------------------------------

[Step 1/3] Transcribing audio...




Original Transcript:  So, my name is Priya Sharma. Please send the documents to my email priya.sharma123 at example.go.in My account number is 8765432112. If you have any questions, my direct line is 212-555-0182. Thank you.
[Step 2/3] Detecting PII and redacting text...

Redacted Transcript:  So, my name is [PERSON]. Please send the documents to my email priya.sharma123 at example.go.in My account number is [PHONE]. If you have any questions, my direct line is [PHONE]. Thank you.
[Step 3/3] Generating voice-preserved audio...

✅ Audio saved as 'voice_preserved_redacted_audio.wav'
You can now download the generated files from the left file browser.


Hindi PII Redaction


In [None]:
# ==========================================================================================
#  🇮🇳 Hindi Audio PII Redaction & Voice Resynthesis Pipeline
#  This system uses models optimized for the Hindi language.
# ==========================================================================================

# ----------------------------
#  1️⃣ Install & Setup Environment
# ----------------------------
print("⏳ Installing required libraries for the Hindi pipeline...")
# Note: MMS models require specific versions of some libraries
!pip install torch==2.1.0 torchaudio==2.1.0 -q
!pip install transformers accelerate sentencepiece protobuf spacy -q
!pip install git+https://github.com/openai/whisper.git -q

import os
import re
import torch
import soundfile as sf
import spacy
from transformers import pipeline, VitsModel, AutoTokenizer
from IPython.display import clear_output

# Clear the installer output produced by the pip commands above.
clear_output(wait=True)
print("✅ Environment setup complete!")

# ----------------------------
#  2️⃣ Configure Device & Models
# ----------------------------
# The objects created here are module-level globals read by the Hindi helper
# functions defined in section 3.
print("🧠 Loading all AI models... (This may take several minutes)")
device = "cuda" if torch.cuda.is_available() else "cpu"

# ASR Model (using a larger multilingual model for better accuracy)
import whisper
print("   -> Loading Whisper for Speech-to-Text...")
whisper_model = whisper.load_model("medium")

# PII Detection Model for Indian Languages (NER)
# The pipeline takes an integer device index: 0 when CUDA is available, -1 for CPU.
# NOTE(review): `grouped_entities` is believed to be deprecated in favour of
# `aggregation_strategy` in recent transformers releases - confirm before upgrading.
print("   -> Loading AI4Bharat NER model for PII Detection...")
ner_pipeline = pipeline("ner", model="ai4bharat/IndicNER", grouped_entities=True, device=0 if device=="cuda" else -1)

# TTS Model for Hindi (model moved to `device`; the tokenizer output is moved
# there at synthesis time).
print("   -> Loading MMS model for Text-to-Speech (Hindi)...")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-hin").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")

print("✅ All models loaded successfully!")

# ----------------------------
#  3️⃣ Core Functions for Hindi
# ----------------------------
# Regex detectors for common Indian PII formats, keyed by PII type. The key becomes
# the placeholder label (e.g. "[PAN]") inserted by detect_and_redact_text_hindi().
PII_REGEX_HI = {
    # Requires a literal "@".
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # Optional "+", a mandatory "91" prefix, one optional whitespace/"-" separator,
    # then ten digits. Numbers written without the 91 prefix are not matched.
    "PHONE": r"\+?91[\s-]?\d{10}",
    # Twelve digits in groups of four, each group optionally separated by a single
    # whitespace character, delimited by word boundaries. A "91" + ten-digit phone
    # number written without separators is also twelve digits, so it matches this
    # pattern as well as PHONE.
    "AADHAAR": r"\b\d{4}\s?\d{4}\s?\d{4}\b",
    # Five uppercase ASCII letters, four digits, one uppercase ASCII letter.
    "PAN": r"\b[A-Z]{5}[0-9]{4}[A-Z]{1}\b",
}

def transcribe_audio_hindi(filepath):
    """Transcribe an audio file with Whisper, forcing the language to Hindi.

    Args:
        filepath: path passed straight to ``whisper_model.transcribe``.

    Returns:
        The value stored under ``'text'`` in the transcription result.
    """
    print("\n[Step 1/3] Transcribing audio in Hindi...")
    # language='hi' selects Hindi for the transcription call.
    return whisper_model.transcribe(filepath, language='hi')['text']

def detect_and_redact_text_hindi(text):
    """Replace detected PII in Hindi ``text`` with bracketed type placeholders.

    Detection combines ``PER`` entities from ``ner_pipeline`` with the patterns
    in ``PII_REGEX_HI``. Two clean-up steps run before substitution:

    * Each NER span is trimmed of surrounding whitespace and then widened to
      cover the whole word it touches. A span that ends in the middle of a
      word would otherwise leave part of the name (for example trailing
      combining marks) visible next to the placeholder.
    * Overlapping spans are merged, so a stretch of text matched by more than
      one detector (e.g. twelve digits matching both PHONE and AADHAAR) is
      replaced once. Applying both replacements independently would let the
      second one cut into the first placeholder and the text after it.

    Args:
        text: transcript to scan.

    Returns:
        ``text`` with each merged PII span replaced by ``[TYPE]``. A merged
        span carries the type of its earliest-starting (then longest) member;
        identical spans keep the label of the detector that ran first.
        The input is returned unchanged when nothing is detected.
    """
    import unicodedata  # stdlib; local import keeps this block self-contained

    print("[Step 2/3] Detecting PII and redacting Hindi text...")

    def is_word_char(ch):
        # Letters (L*), combining marks such as matras and virama (M*) and
        # digits (N*) all count as part of a word.
        return unicodedata.category(ch)[0] in "LMN"

    entities = []

    # Run NER pipeline
    for entity in ner_pipeline(text):
        # ai4bharat/IndicNER labels are PER (Person), ORG (Organization), LOC (Location)
        if entity['entity_group'] != 'PER':
            continue
        start, end = entity['start'], entity['end']
        # Drop whitespace the span may include at either edge.
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1
        if start == end:
            continue
        # Widen to whole-word boundaries so no fragment of the name is left behind.
        while start > 0 and is_word_char(text[start - 1]):
            start -= 1
        while end < len(text) and is_word_char(text[end]):
            end += 1
        entities.append({"type": "PERSON", "start": start, "end": end})

    # Run Regex-based detection (matches are exact, so they are not widened)
    for pii_type, pattern in PII_REGEX_HI.items():
        for match in re.finditer(pattern, text):
            entities.append({"type": pii_type, "start": match.start(), "end": match.end()})

    if not entities:
        return text

    # Merge overlapping spans: left to right, longest first on equal starts.
    # sorted() is stable, so identical spans keep their detection order.
    merged = []
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        if merged and entity["start"] < merged[-1]["end"]:
            merged[-1]["end"] = max(merged[-1]["end"], entity["end"])
        else:
            merged.append(dict(entity))

    # Redact text from the end backwards so earlier offsets stay valid.
    redacted_text = text
    for entity in reversed(merged):
        redacted_text = redacted_text[:entity['start']] + f"[{entity['type']}]" + redacted_text[entity['end']:]

    return redacted_text

def synthesize_speech_hindi(text):
    """Synthesize ``text`` with the module-level MMS/VITS Hindi model.

    Args:
        text: the (redacted) Hindi transcript to speak.

    Returns:
        The model's ``waveform`` output moved to the CPU, converted to a
        NumPy array and squeezed to drop singleton axes.
    """
    print("[Step 3/3] Generating new Hindi audio...")
    encoded = tts_tokenizer(text, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = tts_model(**encoded).waveform

    return waveform.cpu().numpy().squeeze()

# ----------------------------
#  4️⃣ Run Full Pipeline
# ----------------------------
# Upload -> transcribe -> redact -> re-synthesize; three artefacts are written to
# the current working directory.
from google.colab import files
print("-"*50)
print("📎 Upload your Hindi audio file (MP3, WAV, M4A)...")

uploaded_files = files.upload()
# Only the first uploaded file is used.
# NOTE(review): if nothing was uploaded the mapping is empty and [0] raises IndexError.
input_filename = list(uploaded_files.keys())[0]
print(f"\n✅ Uploaded '{input_filename}'. Starting pipeline...")
print("-"*50)

# Transcribe
# NOTE: the unredacted transcript is written to disk and printed into the cell
# output, so any PII it contains persists in the saved notebook.
original_text = transcribe_audio_hindi(input_filename)
with open("original_transcript_hindi.txt", "w", encoding="utf-8") as f: f.write(original_text)
print(f"Original Transcript: {original_text}")

# Redact
redacted_text = detect_and_redact_text_hindi(original_text)
with open("redacted_transcript_hindi.txt", "w", encoding="utf-8") as f: f.write(redacted_text)
print(f"\nRedacted Transcript: {redacted_text}")

# Text-to-speech on the redacted transcript. synthesize_speech_hindi() takes only
# the text, so the output voice is the TTS model's own, not the uploaded speaker's.
final_audio = synthesize_speech_hindi(redacted_text)
output_filename = "resynthesized_redacted_audio_hindi.wav"
# The MMS model's output sample rate is available in its config
samplerate = tts_model.config.sampling_rate
sf.write(output_filename, final_audio, samplerate)
print(f"\n✅ Audio saved as '{output_filename}'")
print("You can now download the generated files from the left file browser.")

✅ Environment setup complete!
🧠 Loading all AI models... (This may take several minutes)
   -> Loading Whisper for Speech-to-Text...


100%|█████████████████████████████████████| 1.42G/1.42G [00:19<00:00, 77.1MiB/s]


   -> Loading AI4Bharat NER model for PII Detection...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/667M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


   -> Loading MMS model for Text-to-Speech (Hindi)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

✅ All models loaded successfully!
--------------------------------------------------
📎 Upload your Hindi audio file (MP3, WAV, M4A)...


Saving download.mp3 to download.mp3

✅ Uploaded 'download.mp3'. Starting pipeline...
--------------------------------------------------

[Step 1/3] Transcribing audio in Hindi...




Original Transcript:  नमस्ते, मेरा नाम प्रिया शर्मा है। क्रिप्या दस्तावेज मेरे email privaya.sharma1238exampla.com पर भीजें। मेरा फोन नम्बर प्लस 919876543210 है। और मेरा आधार नम्बर 1234567890012 है।
[Step 2/3] Detecting PII and redacting Hindi text...

Redacted Transcript:  नमस्ते, मेरा नाम [PERSON][PERSON]्मा है। क्रिप्या दस्तावेज मेरे email privaya.sharma1238exampla.com पर भीजें। मेरा फोन नम्बर प्लस [AADHAAR]और मेरा आधार नम्बर 1234567890012 है।
[Step 3/3] Generating new Hindi audio...

✅ Audio saved as 'resynthesized_redacted_audio_hindi.wav'
You can now download the generated files from the left file browser.
