## Install dependencies

In [1]:
!pip install -q demucs
!pip install -q transformers torchaudio librosa accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.1/87.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.7/249.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━

In [2]:
from demucs.apply import apply_model
from demucs.pretrained import get_model
from demucs.audio import AudioFile
import torchaudio
import torch
import os

## Noise reduction with demucs 

In [3]:
model = get_model(name='htdemucs')
# model.cpu()
model.eval()

Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to /root/.cache/torch/hub/checkpoints/955717e8-8726e21a.th
100%|██████████| 80.2M/80.2M [00:00<00:00, 109MB/s] 


BagOfModels(
  (models): ModuleList(
    (0): HTDemucs(
      (encoder): ModuleList(
        (0): HEncLayer(
          (conv): Conv2d(4, 48, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
          (norm1): Identity()
          (rewrite): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
          (norm2): Identity()
          (dconv): DConv(
            (layers): ModuleList(
              (0): Sequential(
                (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
                (1): GroupNorm(1, 6, eps=1e-05, affine=True)
                (2): GELU(approximate='none')
                (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
                (4): GroupNorm(1, 96, eps=1e-05, affine=True)
                (5): GLU(dim=1)
                (6): LayerScale()
              )
              (1): Sequential(
                (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
                (1): GroupNorm(1, 6, eps=1e-05, affine=True)
  

### Input path and output dir

In [4]:
input_audio_path = "/kaggle/input/whisper-test/whisper_test.m4a"
output_dir = "/kaggle/working/denoised"
os.makedirs(output_dir, exist_ok=True)

In [5]:
from IPython.display import Audio

Audio(input_audio_path)

### Read the audio file 

In [6]:
source = AudioFile(input_audio_path)
ref = source.read(streams=0, channels=1)
wav = ref[0]
sample_rate = source.samplerate()

In [7]:
print(wav.shape)
# Expected: (1, T) or (T,)

torch.Size([811008])


In [8]:
# Step 1: Ensure wav is a tensor with shape (1, T)
if not isinstance(wav, torch.Tensor):
    wav = torch.tensor(wav)

In [9]:
if wav.ndim == 1:
    wav = wav.unsqueeze(0)
print(wav.shape)

torch.Size([1, 811008])


In [10]:
if wav.shape[0] == 1:
    wav = torch.cat([wav, wav], dim=0)
print(wav.shape)

torch.Size([2, 811008])


In [11]:
wav = wav.unsqueeze(0).float()
print(wav.shape)

torch.Size([1, 2, 811008])


In [12]:
with torch.no_grad():
    sources = apply_model(model, wav)

In [13]:
vocals = sources[0][0]
vocals_path = os.path.join(output_dir, "vocals.wav")
torchaudio.save(vocals_path, vocals.cpu(), sample_rate)

print("✅ Denoising complete. Saved vocals to:", vocals_path)

✅ Denoising complete. Saved vocals to: /kaggle/working/denoised/vocals.wav


In [31]:
from IPython.display import Audio

Audio(vocals_path)

## Speech-to-text (model import)

In [14]:
!pip install -U "jax[cuda12]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
!pip install git+https://github.com/sanchit-gandhi/whisper-jax.git
!pip install -U transformers datasets torchaudio

Looking in links: https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
Collecting jax[cuda12]
  Downloading jax-0.6.2-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib<=0.6.2,>=0.6.2 (from jax[cuda12])
  Downloading jaxlib-0.6.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting ml_dtypes>=0.5.0 (from jax[cuda12])
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Collecting jax-cuda12-plugin<=0.6.2,>=0.6.2 (from jax-cuda12-plugin[with-cuda]<=0.6.2,>=0.6.2; extra == "cuda12"->jax[cuda12])
  Downloading jax_cuda12_plugin-0.6.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.7 kB)
Collecting jax-cuda12-pjrt==0.6.2 (from jax-cuda12-plugin<=0.6.2,>=0.6.2->jax-cuda12-plugin[with-cuda]<=0.6.2,>=0.6.2; extra == "cuda12"->jax[cuda12])
  Downloading jax_cuda12_pjrt-0.6.2-py3-none-manylinux2014_x86_64.whl.metadata (579 bytes)
Collecting nvidia-cuda-nvcc-cu12>=12.6.85 (from jax-cuda12-plugin[with-cuda]<=0.6.2,>

In [16]:
!pip uninstall torch torchaudio -y
!pip install torch==2.2.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121

Found existing installation: torch 2.7.1
Uninstalling torch-2.7.1:
  Successfully uninstalled torch-2.7.1
Found existing installation: torchaudio 2.7.1
Uninstalling torchaudio-2.7.1:
  Successfully uninstalled torchaudio-2.7.1
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.2.2
  Downloading https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp311-cp311-linux_x86_64.whl (757.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m757.3/757.3 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hCollecting torchaudio==2.2.2
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.2.2%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-no

In [18]:
!pip install -U transformers



In [20]:
!pip show transformers

Name: transformers
Version: 4.53.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: kaggle-environments, peft, sentence-transformers, whisper_jax


In [23]:
import torchaudio
import jax.numpy as jnp
import jax

In [27]:
!pip install --upgrade --force-reinstall transformers torchaudio --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m763.0/763.0 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.7/792.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━

In [30]:
!pip install --upgrade --no-cache-dir --force-reinstall transformers==4.39.3 --target=/kaggle/working/libs

Collecting transformers==4.39.3
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers==4.39.3)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.39.3)
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers==4.39.3)
  Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m266.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from transformers==4.39.3)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers==4.39.3)
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.met

In [33]:
!pip uninstall -y transformers
!pip install transformers==4.36.2 torchaudio --quiet

Found existing installation: transformers 4.53.0
Uninstalling transformers-4.53.0:
  Successfully uninstalled transformers-4.53.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
whisper-jax 0.0.1 requires transformers<4.35.0,>=4.27.4, but you have transformers 4.36.2 which is incompatible.
sentence-transformers 3.4.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.36.2 which is incompatible.[0m[31m
[0m

In [41]:
!pip install transformers==4.36.2 --target=/kaggle/working/custom_transformers

Collecting transformers==4.36.2
  Using cached transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
Collecting filelock (from transformers==4.36.2)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.36.2)
  Using cached huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers==4.36.2)
  Using cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting packaging>=20.0 (from transformers==4.36.2)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers==4.36.2)
  Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers==4.36.2)
  Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting requests (from transformers==4.36.2)
  Using cached requests

In [42]:
import sys
sys.path = ["/kaggle/working/custom_transformers"] + [p for p in sys.path if "transformers" not in p]

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v2")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2")


In [None]:
# 1. Load model and processor
model_id = "parthiv11/indic_whisper_nodcil"
processor = AutoProcessor.from_pretrained(model_id)
model = FlaxWhisperForConditionalGeneration.from_pretrained(model_id)

## Summarization using LLM

In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load model and tokenizer
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
pegasus = PegasusForConditionalGeneration.from_pretrained(model_name)

def summarize_text(text):
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = pegasus.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarize transcription
summary = summarize_text(transcription)
print("📌 Summary:\n", summary)
