In [11]:
!pip install ffmpeg-python librosa torch torchaudio transformers



In [12]:
# %%
# Title: MP3 to Text using Wav2Vec2 (with separate MP3→WAV conversion)

# %%
# 1. Install Dependencies (Uncomment and run if needed)

import os
import ffmpeg
import torch
import librosa

from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC
)


In [13]:
# %%
# 2. Separate Function to Convert MP3 → WAV
def convert_mp3_to_wav(mp3_path, wav_path):
    """
    Converts an MP3 file to a single-channel (mono) WAV at 16 kHz 
    using ffmpeg.
    """
    (
        ffmpeg
        .input(mp3_path)
        .output(
            wav_path,
            format='wav',       # output format
            acodec='pcm_s16le', # audio codec
            ac=1,               # number of channels
            ar='16000'          # audio sampling rate
        )
        .run(overwrite_output=True)
    )
    return wav_path


In [14]:
# %%
# 3. Load Wav2Vec2 Model & Processor
#    We'll use the facebook/wav2vec2-large-960h checkpoint
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Set model to evaluation mode
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [21]:
!pip install --upgrade numba


^C


In [15]:
# %%
# 4. Speech-to-Text Function
def speech_to_text(audio_path):
    """
    Loads a WAV audio file at 16 kHz using librosa,
    then transcribes it using a Wav2Vec2 model.
    """
    # 1. Load the audio with librosa
    speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

    # 2. Tokenize/preprocess with the processor
    inputs = processor(
        speech_array,
        sampling_rate=16000,
        return_tensors="pt"
    )
    input_values = inputs.input_values.to(device)

    # 3. Inference
    with torch.no_grad():
        logits = model(input_values).logits

    # 4. Decode predicted tokens
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

In [19]:
# %%
# 5. Demo Usage: Converting an MP3 file to WAV, then Transcribing
if __name__ == "__main__":
    # Provide the MP3 file path
    mp3_file = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/SpeechText/speech-text-conversion/assets/sample1.wav"        # Update with your file
    wav_file = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/SpeechText/speech-text-conversion/assets/sample1.wav"

    if os.path.exists(mp3_file):
        print(f"[INFO] Converting MP3 to WAV: {mp3_file} → {wav_file}")
        convert_mp3_to_wav(mp3_file, wav_file)
        
        print("[INFO] Transcribing the WAV file...")
        transcription = speech_to_text(wav_file)
        print("\n[RESULT] Transcription:")
        print(transcription)
        
        # (Optional) remove the WAV if you no longer need it
        # os.remove(wav_file)
    else:
        print(f"[WARNING] MP3 file not found: {mp3_file}")

[INFO] Converting MP3 to WAV: C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/SpeechText/speech-text-conversion/assets/sample1.wav → C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/SpeechText/speech-text-conversion/assets/sample1.wav


FileNotFoundError: [WinError 2] The system cannot find the file specified

In [20]:
import os
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# -------------------------------------------------------------------
# 1. Load Wav2Vec2 Model and Processor
# -------------------------------------------------------------------
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def speech_to_text(audio_path):
    """
    Loads a WAV file at 16 kHz using librosa,
    then transcribes it using the Wav2Vec2 model.
    """
    # Load audio at 16 kHz
    speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

    # Preprocess audio
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values.to(device)

    # Inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode predicted tokens
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

def batch_transcribe_wav_files(input_folder, output_file):
    """
    Iterates over all .wav files in `input_folder`,
    transcribes them, and saves each result to `output_file`.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for file_name in os.listdir(input_folder):
            if file_name.lower().endswith('.wav'):
                wav_path = os.path.join(input_folder, file_name)
                print(f"[INFO] Transcribing: {wav_path}")

                transcript = speech_to_text(wav_path)
                f.write(f"{file_name}: {transcript}\n")

    print(f"\n[INFO] Finished transcribing all WAV files.")
    print(f"[INFO] Results have been saved to: {output_file}")

if __name__ == "__main__":
    # Path where your WAV files are stored
    input_folder = r"C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK"
    
    # The file where transcriptions will be written
    output_file = "transcriptions.txt"
    
    # Perform batch transcription
    batch_transcribe_wav_files(input_folder, output_file)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Transcribing: C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK\a01.wav


AttributeError: module 'numba' has no attribute 'core'

In [23]:
import os
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# -------------------------------------------------------------------
# 1. Speech-to-text with Wav2Vec2
# -------------------------------------------------------------------
# Load the pretrained model and processor once
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

def speech_to_text(audio_path):
    """
    Loads a WAV file (or other supported audio format) at 16 kHz,
    then transcribes it using a Wav2Vec2 model.
    """
    # Load the audio at 16 kHz
    speech_array, _ = librosa.load(audio_path, sr=16000)

    # Preprocess the raw waveform
    input_values = processor(
        speech_array, 
        sampling_rate=16000, 
        return_tensors="pt"
    ).input_values

    # Inference (no gradient needed)
    with torch.no_grad():
        logits = model(input_values).logits

    # Argmax to get predicted token IDs
    predicted_ids = torch.argmax(logits, dim=-1)
    # Decode to text
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

# -------------------------------------------------------------------
# 2. Batch Transcription Function
# -------------------------------------------------------------------
def batch_transcribe_wav_files(input_folder, output_file):
    """
    Searches for all .wav files in `input_folder`, transcribes each file,
    and writes the results to `output_file`.
    
    Each line in the output file will have the format:
        filename: transcription
    """
    # Create or overwrite the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        # Iterate through everything in the folder
        for file_name in os.listdir(input_folder):
            if file_name.lower().endswith(".wav"):
                wav_path = os.path.join(input_folder, file_name)
                print(f"[INFO] Transcribing: {wav_path}")

                # Perform speech-to-text
                transcript = speech_to_text(wav_path)

                # Write to the .txt file in a simple "filename: transcript" format
                f.write(f"{file_name}: {transcript}\n")
        print(f"[INFO] Finished batch transcription. Results saved to {output_file}")

# -------------------------------------------------------------------
# 3. Example Usage (Uncomment to run)
# -------------------------------------------------------------------
if __name__ == "__main__":
    # Folder containing WAV files
    input_folder = r"C:\Users\DELL\Desktop\VOIP_Phishing_Attacks\Repos\SpeechText\speech-text-conversion\assets\AudioData\JK"
    
    # Where to save the transcriptions
    output_file = "transcriptions.txt"
    
    # Call the batch transcription function
    batch_transcribe_wav_files(input_folder, output_file)
