In [1]:
import numpy as np
import torch
import librosa

In [2]:
# For using Hugging Face Transformers with Whisper
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("BlueRaccoon/whisper-small-en")
model = AutoModelForSpeechSeq2Seq.from_pretrained("BlueRaccoon/whisper-small-en")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [2]:
def load_audio_chunks(file_path, chunk_duration=30, sample_rate=16000, overlap=1.0):
    audio, sr = librosa.load(file_path, sr=sample_rate)
    chunk_samples = int(chunk_duration * sample_rate)
    step = int((chunk_duration - overlap) * sample_rate)

    chunks = []
    for start in range(0, len(audio), step):
        end = start + chunk_samples
        if end > len(audio):
            chunk = np.pad(audio[start:], (0, end - len(audio)))
        else:
            chunk = audio[start:end]
        chunks.append(chunk)
    return chunks

In [3]:
def transcribe_chunks(chunks):
    full_transcript = ""
    for i, chunk in enumerate(chunks):
        inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        predicted_ids = model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        print(f"Chunk {i+1}/{len(chunks)}: {transcription}")
        full_transcript += transcription.strip() + " "
    return full_transcript.strip()

In [7]:
transcript = "Iran Supreme Leader Ayatollah Ali Hami  has warned Washington of irreparable  consequences if the US becomes directly  involved in Israel's military campaign  against Tehran. It's understood that  President Trump is considering joining  the attacks on Iran's nuclear sites.  Here's our diplomatic correspondent  Caroline Hol.  For a sixth straight night, Israel kept  up its unprecedented assault on Iran.  These images captured by an Iranian  television station show air defenses in  action above Tehran trying to stop  incoming Israeli fire. This is a  building hit in the capital. We don't  know the exact target, but Israel says  that overnight it hit a university  linked to Iran's powerful revolutionary  guards as well as facilities linked to  its missile and nuclear program.  These satellite images show a military  base in Tabre in the northwest of the  country. here before and here after it  was struck.  Donald Trump, who left a meeting of G7  leaders early, has called for Iran's  unconditional surrender and is reported  to be considering joining Israel's  strikes on its nuclear facilities. He's  described the country's supreme leader  as an easy target.  Now, a defiant response from Ayatah Ali  Ki, his words read out by a presenter on  staterun TV. Iranians, he said, are not  people who surrender, and any form of US  military intervention would be met with  what he called irreparable harm.  For civilians in Tehran, these are  terrifying times. Shops shut, quiet  streets. Many people are holed up at  home, hiding, we're told, in basement  and garages. Many others have already  fled the capital. Among them, Nagis  Muhammadi, winner of the Nobel Peace  Prize, who spent many years in Iran's  jails.  I witnessed horrendous attacks. They're  hitting infrastructure, a lot of  civilian deaths. We have a misogynist  theocracy in Iran with the Supreme  Leader Ali K on the top who took us to  hell whilst promising heaven. And at the  same time, now Netanyahu is also taking  us to hell.  Iran is still firing at Israel, but not  at the same rate as before. Overnight,  missiles were intercepted over Tel Aviv.  Some limited damage reported, but no  more casualties.  In Cyprus, two repatriation flights were  laid on to bring Israelis stranded  abroad back home. Nobody knows how long  the war will go on for, how it might  escalate or end. And so many Iranians  and foreign nationals are escaping Iran  if they can. This the border with  Pakistan, but many others are trapped  where they are. Everyone afraid of what  could come next. Caroline Holy, BBC  News.  Well, in a moment we'll speak with our  Middle East correspondent, Hugo Bacha,  who's in Jerusalem this lunchtime. But  first to our North America  correspondents in Washington, Nomia  Ikbal. and uh Nomia, the war of words is  ramping up. Here we are on day six. But  what exactly do we know about America's  intentions? How far will they go?  Well, the world is waiting to see what  President Trump will do next. We've not  had any reaction from the White House so  far. There's been no flurry of social  media posts from him like we saw  yesterday, although uh that could happen  at any moment. Uh there are reports that  the US is leaning towards involvement  rather than diplomacy and that would  involve sending US troops to the region  to use those bunker buster bombs to  destroy Iran's nuclear capabilities deep  underground. But make no mistake, if  that is what the US does, it is a  profoundly defining moment of Trump's  presidency. Seismic. This is a man who  has long campaigned for no foreign wars,  attacked Barack Obama, accusing him of  trying to start a war with Iran. and  only back in May said in Saudi Arabia  that the era of foreign intervention was  over. So lots of his supporters will be  saying what happened to that man. Could  there be regime change? Well, Donald  Trump also said yesterday threatened to  potentially kill the Supreme Leader. So  we just don't know uh what could happen  next and bear in mind if the US does get  involved, how could Iran react to it?  Again, we don't know. And it could put  the US in a very very dangerous moment.  Nomia, thank you. Well, let's get the  picture then from Jerusalem. Our  correspondent, Hugo Bichega, is there on  the ground, Hugo, how will these words  have been received and what's the latest  on the conflict from that perspective?  Yeah, Dina. So, Israel has continued to  attack Iran. The military said more than  50 planes hit about 20 targets across  Tehran overnight, including weapons  production facilities. And on the ground  in Iran, we've seen those those huge  traffic jams of residents trying to flee  uh the city uh Thran uh because of fears  of an escalation of this conflict.  Shortages of petrol and food are being  felt. Also some cash machines are not  working. So this is the impact on the  population there. Now in Israel, as  Caroline said, there was a wave of  attacks, two waves actually of Iranian  missile attacks overnight. Uh the  Iranians said they had used for the  first time a hypersonic missile, which  is a missile that is more difficult to  be intercepted by air defenses. But  there have been no reports of  casualties. And we're seeing that the  recent attacks haven't been as intense,  which is probably an indication of the  impact of these Israeli air strikes on  the Iranian military. And just moments  ago, the Israeli government announced  that some of the restrictions that had  been imposed because of the war are now  being eased. Uh there will be you know  limited gatherings are now being allowed  but crucially schools and uh the  airspace remain closed but this is an  indication that the Israeli authorities  believe that the threat coming from Iran  has diminished. Okay, Hugo in Jerusalem.  Thank you."

In [10]:
import os
import subprocess

import os
import subprocess

def convert_to_wav(input_file, output_file, sample_rate=16000):
    """
    Convert an audio or video file to WAV format with specified sample rate and mono channel.
    
    Args:
        input_file (str): Path to the input audio or video file.
        output_file (str): Path to the output WAV file.
        sample_rate (int): Target sample rate (default: 16000 Hz).
    
    Raises:
        subprocess.CalledProcessError: If ffmpeg conversion fails.
        FileNotFoundError: If input file does not exist.
    """
    # Validate input file
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file {input_file} does not exist.")
    
    # Determine file type based on extension
    file_ext = os.path.splitext(input_file)[1].lower()
    audio_extensions = ['.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma']
    video_extensions = ['.mp4', '.avi', '.mov', '.mkv']
    
    if file_ext not in audio_extensions + video_extensions:
        raise ValueError(f"Unsupported file format: {file_ext}. Supported formats: {', '.join(audio_extensions + video_extensions)}")
    
    command = [
        "ffmpeg",
        "-y",  # Overwrite output file if it exists
        "-i", input_file,
        "-ar", str(sample_rate),  # Resample to 16kHz
        "-ac", "1",               # Convert to mono channel
        "-acodec", "pcm_s16le",   # WAV format
        output_file
    ]
    
    # For video files, add -vn to disable video stream
    if file_ext in video_extensions:
        command.insert(2, "-vn")  # Insert after "-i" to disable video
    
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        error_msg = f"ffmpeg conversion failed: {e.stderr.decode()}"
        raise RuntimeError(error_msg)
    except FileNotFoundError:
        raise RuntimeError("ffmpeg is not installed or not found in system PATH. Install ffmpeg and add it to PATH.")

# Example usage
try:
    # for audio
    # convert_to_wav("../data/raw/sample2.mp3", "../data/raw/sample2.wav")
    # Example with video
    convert_to_wav("../data/raw/nepali.mp4", "../data/raw/nepali.wav")
except Exception as e:
    print(f"Error: {e}")

In [11]:
import noisereduce as nr
import librosa
import soundfile as sf

def denoise_audio(input_path, output_path):
    # Load audio
    y, sr = librosa.load(input_path, sr=None)
    
    # Estimate noise from the first 0.5 second (adjust if needed)
    noise_sample = y[:int(sr * 0.5)]
    
    # Reduce noise
    reduced_noise = nr.reduce_noise(y=y, sr=sr, y_noise=noise_sample)
    
    # Save to new file
    sf.write(output_path, reduced_noise, sr)
    print(f"Saved denoised audio to: {output_path}")
    
# Example usage
denoise_audio("../data/raw/nepali.wav", "../data/cleaned/nepali_denoised.wav")

Saved denoised audio to: ../data/cleaned/nepali_denoised.wav


In [None]:
# Path to your audio file
file_path = "../data/cleaned/sample2_denoised.wav"  # update this path accordingly

chunks = load_audio_chunks(file_path, chunk_duration=30, overlap=1.5)

# Transcribe all chunks
final_text = transcribe_chunks(chunks)
print("\n📝 Full Transcription:\n", final_text)

`generation_config` default values have been modified to match model-specific defaults: {'begin_suppress_tokens': [220, 50256]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> to see related `.generate()` flags.


Chunk 1/14: Iran's supreme leader Ayatollah Ali Khamenei has warned Washington of "irreparable consequences" if the U.S. becomes directly involved in Israel's military campaign against Tehran. It is understood that President Trump is considering joining the attacks on Iran's nuclear sites. Here's our diplomatic correspondent Caroline Hawley.
Chunk 2/14: These satellite images show a military base in Tabriz in the northwest of the country, here before.
Chunk 3/14: Now a defiant response from Ayatollah Ali Khamenei (his words read out by a presenter on state-run TV). Iranian president
Chunk 4/14: For civilians in Tehran, these are "terrifying times", "shops shut", "quiet streets" many people are held up at home, hiding, we're told, in basements and garages many others have already fled the capital, among them Nagis Muhammadi, winner of the Nobel Peace Prize, who spent many years in prison for his death
Chunk 5/14: The Hiren is still firing at Israel but not at the same rate as before.
Ch

In [12]:
# calculate WER
import jiwer
def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)
# Example usage
reference_text = transcript  # The original text
hypothesis_text = final_text  # The transcribed text
wer = calculate_wer(reference_text, hypothesis_text)
print(f"Word Error Rate (WER): {wer:.4f}")

Word Error Rate (WER): 0.5864


In [6]:
# for nepali language transcription
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("amitpant7/Nepali-Automatic-Speech-Recognition")
model = AutoModelForSpeechSeq2Seq.from_pretrained("amitpant7/Nepali-Automatic-Speech-Recognition")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [12]:
# Path to your audio file
file_path = "../data/cleaned/nepali_denoised.wav"  # update this path accordingly

chunks = load_audio_chunks(file_path, chunk_duration=30, overlap=1.5)

# Transcribe all chunks
final_text = transcribe_chunks(chunks)
print("\n📝 Full Transcription:\n", final_text)

Chunk 1/1: आज कणाली गण्टकी बाग्मती मदेश र कुषि प्रदेशमा आम्शिक बद्नि रही बाकी बुवाव मौसम सावनीतय सकानी र हनी मौसम्बितको भनाइ राती बने कोषी मदेश बाग्मति र कणाली प्रदेश माम्शिक बद्नि रही कदानी प्रसाको सम्भादी तयसमाने र हनी मौसम्बितको भनाइ राती बने कोसी मदेश बाग्मति र कणाली प्रदेश माम्चिको आम्म्शिक बद्दिरही कदानी प्यो तलका बसाको सम्ब्भाग्ना

📝 Full Transcription:
 आज कणाली गण्टकी बाग्मती मदेश र कुषि प्रदेशमा आम्शिक बद्नि रही बाकी बुवाव मौसम सावनीतय सकानी र हनी मौसम्बितको भनाइ राती बने कोषी मदेश बाग्मति र कणाली प्रदेश माम्शिक बद्नि रही कदानी प्रसाको सम्भादी तयसमाने र हनी मौसम्बितको भनाइ राती बने कोसी मदेश बाग्मति र कणाली प्रदेश माम्चिको आम्म्शिक बद्दिरही कदानी प्यो तलका बसाको सम्ब्भाग्ना


In [13]:
nepali_transcript = " आज कर्णाली गण्डकी बागमती मदेश र कोशी प्रदेशमा आंशिक बदली रही बाकि भूभागमा मौसम सामान्यतः सफानै रहने मौसंबिद्को भनाइ राती भने कोशी मदेश बागमती र कर्णाली प्रदेशमा मौसम आंशिक बदली रही कर्णाली प्रदेशको पहाडी एक दुइ ठाउमा  मेघ गर्जन र चत्यंग सहित हल्का बर्सा को सम्भाभाना ।  "

In [15]:
# calculate WER for nepali
import jiwer
def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)
# Example usage
reference_text = nepali_transcript  # The original text
hypothesis_text = final_text  # The transcribed text
wer = calculate_wer(reference_text, hypothesis_text)
print(f"Word Error Rate (WER): {wer:.4f}")

Word Error Rate (WER): 0.9362
