# Test Audio Analyse

## Version Whisper de base 

- Vitesse Rapide
- Précision moyenne
- Utile pour test générale

##### Code

In [None]:
import whisper
import os
import re


# Load the Whisper model
model = whisper.load_model("base")

# Path to the audio file
audio_file = "bee.mp4"

# Check if the file exists
if not os.path.exists(audio_file):
	raise FileNotFoundError(f"The file '{audio_file}' was not found. Please check the path.")

# Transcribe the audio file
result = model.transcribe(audio_file)

# Print the transcription
print(result["text"])

# Split the transcription into sentences based on punctuation

# Define a function to split text into sentences
def split_into_sentences(text):
	# Use regex to split by sentence-ending punctuation
	sentences = re.split(r'(?<=[.!?]) +', text)
	return sentences

# Get the segments from the result
segments = result.get("segments", [])

# Initialize a list to store formatted sentences
formatted_sentences = []

# Iterate through the segments to add line breaks based on silence duration
for segment in segments:
	text = segment["text"].strip()
	if segment["end"] - segment["start"] > 1:  # Check if silence duration is greater than 2 seconds
		formatted_sentences.append(text + "\n")
	else:
		formatted_sentences.append(text)

# Join the sentences with line breaks
formatted_text = "\n".join(formatted_sentences)

# Save the formatted transcription to a text file
output_file = os.path.splitext(audio_file)[0] + ".txt"
with open(output_file, "w", encoding="utf-8") as f:
	f.write(formatted_text)

print("Formatted transcription saved to:", output_file)

 Go ahead and introduce your sign name is Michael with a B and I've been afraid of insects mind Stop, stop, stop, where? Hmm? Where's the B? There's the B?
Formatted transcription saved to: bee.txt


## Version Whisper large V3 Turbo

- Vitesse lente
- Haute précision
- Utile pour longue vidéo

#### V1 fonctionnelle
- Ne marche pas sur les longues vidéo

#### Code

In [None]:
import os
import re
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

audio_file_turbo = "bee.mp4"
resultturbo = pipe(audio_file_turbo, return_timestamps=True)
print(resultturbo["text"])

# Split the transcription into sentences based on punctuation

# Define a function to split text into sentences
def split_into_sentences(text):
	# Use regex to split by sentence-ending punctuation
	sentences = re.split(r'(?<=[.!?]) +', text)
	return sentences

# Get the chunks from the result
chunks = resultturbo.get("chunks", [])


# Initialize a list to store formatted sentences
formatted_sentences = []

# Iterate through the chunks to add line breaks based on silence duration
for chunk in chunks:
	text = chunk["text"].strip()
	start, end = chunk["timestamp"]
	if end - start > 1:  # Check if silence duration is greater than 1 seconds
		formatted_sentences.append(text + "\n")
	else:
		formatted_sentences.append(text)

# Join the sentences with line breaks
formatted_text = "\n".join(formatted_sentences)

# Save the formatted transcription to a text file
output_file = os.path.splitext(audio_file_turbo)[0] + "turbo.txt"
if formatted_text.strip():
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(formatted_text)	
    print("Formatted transcription saved to:", output_file)
else:
    print("Warning: Formatted text is empty. Nothing was written.")

Device set to use cpu


 Go ahead and introduce yourself. My name is Michael with a B, and I've been afraid of insects. Stop, stop, stop. Where? Hm? Where's the B? There's a B?
Formatted transcription saved to: beeturbo.txt
Formatted transcription saved to: beeturbo.txt


#### V2 Amélioré
- Marche mieux pour les vidéo longues
- Met plus de temps que le modèle basique (40 min pour une vidéo de 25min)

#### Code

##### Version simple audio

In [12]:
import os
import re
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

input_dir = "audios"
audio_file_turbo = os.path.join(input_dir, "riendire.mp4")
resultturbo = pipe(audio_file_turbo, return_timestamps=True)
print(resultturbo["text"])

# Split the transcription into sentences based on punctuation

# Define a function to split text into sentences
def split_into_sentences(text):
	# Use regex to split by sentence-ending punctuation
	sentences = re.split(r'(?<=[.!?]) +', text)
	return sentences

# Get the chunks from the result
chunks = resultturbo.get("chunks", [])


# Initialize a list to store formatted sentences
formatted_sentences = []

# Iterate through the chunks to add line breaks based on silence duration
for chunk in chunks:
    text = chunk["text"].strip()
    start, end = chunk.get("timestamp", (None, None))
    if start is not None and end is not None and end - start > 1:  # Check if silence duration is greater than 1 second
        formatted_sentences.append(text + "\n")
    else:
        formatted_sentences.append(text)

# Join the sentences with line breaks
formatted_text = "\n".join(formatted_sentences)

# Save the formatted transcription to a text file
output_file = os.path.splitext(audio_file_turbo)[0] + "turbo.txt"
if formatted_text.strip():
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(formatted_text)	
    print("Formatted transcription saved to:", output_file)
else:
    print("Warning: Formatted text is empty. Nothing was written.")

Device set to use cpu


 Ça, en vrai, vraiment, vraiment ça. Je te le dis. Bref, mets-la en vas-y, c'est bon. Voilà, il y a 20 mots, t'as rien dit. Comment c'est possible d'écrire autant et de rien dire ? Voilà, c'est...
Formatted transcription saved to: audios\riendireturbo.txt


##### Version simple fichier

In [13]:
import os
import re
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

def transcribe_folder_with_whisper(input_dir="audios", extension_filter=(".mp3", ".wav", ".mp4", ".flac")):
    """
    Transcrit tous les fichiers audio dans un dossier donné à l'aide de Whisper v3 Turbo.
    
    Args:
        input_dir (str): Dossier contenant les fichiers audio.
        extension_filter (tuple): Extensions de fichiers à traiter.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3-turbo"

    print("Chargement du modèle...")
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    ).to(device)
    
    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )

    audio_files = [f for f in os.listdir(input_dir) if f.lower().endswith(extension_filter)]
    if not audio_files:
        print("Aucun fichier audio trouvé dans le dossier spécifié.")
        return

    for file in audio_files:
        file_path = os.path.join(input_dir, file)
        print(f"\n🗣️ Transcription de : {file} ...")

        result = pipe(file_path, return_timestamps=True)
        text = result.get("text", "").strip()
        chunks = result.get("chunks", [])

        # Formater avec sauts de ligne après pauses longues
        formatted_sentences = []
        for chunk in chunks:
            chunk_text = chunk["text"].strip()
            start, end = chunk.get("timestamp", (None, None))
            if start is not None and end is not None and end - start > 1:
                formatted_sentences.append(chunk_text + "\n")
            else:
                formatted_sentences.append(chunk_text)
        
        formatted_text = "\n".join(formatted_sentences)
        output_file = os.path.splitext(file_path)[0] + "_transcription.txt"
        
        if formatted_text:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(formatted_text)
            print(f"✅ Transcription enregistrée : {output_file}")
        else:
            print(f"⚠️ Aucun texte détecté pour : {file}")

# Exemple d'utilisation :
transcribe_folder_with_whisper("audios")


Chargement du modèle...


Device set to use cpu



🗣️ Transcription de : bee.mp4 ...
✅ Transcription enregistrée : audios\bee_transcription.txt

🗣️ Transcription de : Floor.mp4 ...
✅ Transcription enregistrée : audios\Floor_transcription.txt

🗣️ Transcription de : Perenoel.mp4 ...
✅ Transcription enregistrée : audios\Perenoel_transcription.txt

🗣️ Transcription de : plion_dream.wav ...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


✅ Transcription enregistrée : audios\plion_dream_transcription.txt

🗣️ Transcription de : riendire.mp4 ...
✅ Transcription enregistrée : audios\riendire_transcription.txt


## Version Nvidia Parakeet

- Audio en mono uniquement
- Pas très rapide
- Pas très performant


#### Code

In [9]:
# Import the ASR module from NeMo
import nemo.collections.asr as nemo_asr
import logging

logging.getLogger('nemo').setLevel(logging.WARNING)



# Load the ASR model
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")

from pydub import AudioSegment

# Path to the audio file
audio_file = "bee.mp4"

# Convert the audio file to mono and save it as a temporary file
audio = AudioSegment.from_file(audio_file)
audio = audio.set_channels(1)  # Convert to mono
temp_audio_file = "temp_mono_audio.wav"
audio.export(temp_audio_file, format="wav")

# Transcribe the audio file (ensure audio is in the correct format)
output = asr_model.transcribe([temp_audio_file], timestamps=True)

# Print transcribed text
print(output[0].text)  # Access the 'text' attribute of the Hypothesis object

# Extract segment timestamps (start and end times for each segment)
if hasattr(output[0], "word_timestamps"):
    word_timestamps = output[0].word_timestamps  # Word-level timestamps

    # Print word timestamps
    for word in word_timestamps:
        print(f"{word['start']}s - {word['end']}s : {word['word']}")  # Access attributes of the word dictionary
else:
    print("Segment timestamps are not available in the output.")


[NeMo I 2025-05-09 14:47:13 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-05-09 14:47:14 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_lhotse: true
    skip_missing_manifest_entries: true
    input_cfg: null
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    num_workers: 2
    pin_memory: true
    max_duration: 40.0
    min_duration: 0.1
    text_field: answer
    batch_duration: null
    use_bucketing: true
    bucket_duration_bins: null
    bucket_batch_size: null
    num_buckets: 30
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2025-05-09 14:47:14 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config :

[NeMo I 2025-05-09 14:47:14 nemo_logging:393] PADDING: 0
[NeMo I 2025-05-09 14:47:19 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-05-09 14:47:19 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-05-09 14:47:19 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-05-09 14:47:19 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-05-09 14:47:19 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-05-09 14:47:21 nemo_logging:393] Model EncDecRNNTBPEModel was successfully restored from C:\Users\pc\.cache\huggingface\hub\models--nvidia--parakeet-tdt-0.6b-v2\snapshots\50aec6a056e85b9f95b612df08a2bddc55b58714\parakeet-tdt-0.6b-v2.nemo.
[NeMo I 2025-05-09 14:47:22 nemo_logging:393] Timestamps requested, setting decoding timestamps to True. Capture them in Hypothesis object,                         with output[0][idx].timestep['word'/'segment'/'char']
[NeMo I 2025-05-09 14:47:22 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-05-09 14:47:22 nemo_logging:405] `include_duration` is not implemented for CUDA graphs
Transcribing: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it]

Go ahead and introduce yourself. My name is Michael with a bee and I've been afraid of insects mindset. Top stops out. Where? Hmm? Where's the bee? Where's the bee?
Segment timestamps are not available in the output.



