In [1]:
# %%
# # Batch Speech-to-Text Transcription using Wav2Vec2
#
# This notebook demonstrates how to:
#   1. Load all .wav files from a specified folder.
#   2. Transcribe each file using the Wav2Vec2 model.
#   3. Save each file's transcription to a text file.
#
# Folder with WAV files:
# "C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK"

# %%
# ## 1. Install Dependencies (if not already installed)
# Uncomment the following line and run if needed.
# !pip install torch torchaudio transformers

# %%
# ## 2. Import Libraries and Load the Model

import os
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC



  from .autonotebook import tqdm as notebook_tqdm
  warn(





In [2]:
# Load pretrained Wav2Vec2 model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

model.eval()  # Set model to evaluation mode


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [3]:

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [4]:

# %%
# ## 3. Define Speech-to-Text Function Using torchaudio
def speech_to_text(audio_path):
    """
    Loads a WAV file using torchaudio, resamples to 16 kHz if needed,
    then transcribes it using the Wav2Vec2 model.
    
    Parameters:
      audio_path (str): Path to the WAV file.
    
    Returns:
      transcription (str): The predicted transcript.
    """
    # Load audio file (waveform and sample rate)
    waveform, sample_rate = torchaudio.load(audio_path)
    
    # If stereo, convert to mono by averaging channels
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0)
    else:
        waveform = waveform.squeeze()
    
    # Resample if sample rate is not 16 kHz
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    
    # Convert waveform tensor to a numpy array
    audio_array = waveform.numpy()
    
    # Preprocess the audio for the model
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values.to(device)
    
    # Run inference without computing gradients
    with torch.no_grad():
        logits = model(input_values).logits
    
    # Get predicted token IDs and decode them into text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    
    return transcription

# %%
# ## 4. Batch Transcription Function
def batch_transcribe_wav_files(input_folder, output_file):
    """
    Iterates over all .wav files in the specified folder,
    transcribes each one using the Wav2Vec2 model,
    and writes the filename and transcription to an output text file.
    
    Each line in the output file has the format:
      filename: transcription
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for file_name in os.listdir(input_folder):
            if file_name.lower().endswith('.wav'):
                wav_path = os.path.join(input_folder, file_name)
                print(f"[INFO] Transcribing: {wav_path}")
                transcript = speech_to_text(wav_path)
                print(transcript)
                f.write(f"{file_name}: {transcript}\n")
    print(f"[INFO] Finished transcribing. Transcriptions saved to: {output_file}")



In [5]:

# %%
# ## 5. Execute Batch Transcription

# Update the input_folder path if needed
input_folder = r"C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK"
output_file = "transcriptions.txt"

batch_transcribe_wav_files(input_folder, output_file)

[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a01.wav
SHE HAD YOUR DARK SUIT IN GREASY WASHWATER ALL YEAR
[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a02.wav
DON'T ASK ME TO CARRY AN OILY RAG LIKE THAT
[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a03.wav
WILL YOU TELL ME WHY
[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a04.wav
WHO AUTHORIZED THE UNLIMITED EXPENSE ACCOUNT
[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a05.wav
DESTROY EVERY FILE RELATED TO MY ORDITS
[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a06.wav
