# Install Required Packages


In [10]:
pip install datasets transformers torch


Note: you may need to restart the kernel to use updated packages.


# Import Necessary Libraries


In [11]:
import pandas as pd
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset, Audio
from transformers import AutoProcessor, SeamlessM4TModel, Seq2SeqTrainingArguments, Seq2SeqTrainer


2024-05-19 04:29:44,245 INFO -- datasets: PyTorch version 2.2.2 available.


# Load and Prepare Data
We will load the CSV files into pandas dataframes, add full paths to the filenames, and convert the dataframes to Hugging Face datasets.


In [13]:
rescue_team_df = pd.read_csv('dataset_amazigh/annotations/rescue_team.csv')
small_talk_df = pd.read_csv('dataset_amazigh/annotations/small_talk.csv')

def preprocess_data(df):
    # Drop rows where 'filename' or 'translation_english' is NaN
    df = df.dropna(subset=['filename', 'translation_english'])
    return df

    
# Define the base directories for the audio files
rescue_base_dir = 'dataset_amazigh/wav/rescue_wav/'
small_talk_base_dir = 'dataset_amazigh/wav/conv_wav/'

rescue_team_df['filename'] = rescue_base_dir + rescue_team_df['filename']
small_talk_df['filename'] = small_talk_base_dir + small_talk_df['filename']


# Apply preprocessing
rescue_team_df = preprocess_data(rescue_team_df)
small_talk_df = preprocess_data(small_talk_df)

# Print first few rows to debug
print("Rescue Team DataFrame Sample:", rescue_team_df.head())
print("Small Talk DataFrame Sample:", small_talk_df.head())

Rescue Team DataFrame Sample:                                        filename    translation_arabic  \
0  dataset_amazigh/wav/rescue_wav/S1_resc_1.wav       هل الجميع بخير؟   
1  dataset_amazigh/wav/rescue_wav/S1_resc_2.wav  هل تحتاج إلى مساعدة؟   
2  dataset_amazigh/wav/rescue_wav/S1_resc_3.wav           أين إصابتك؟   
3  dataset_amazigh/wav/rescue_wav/S1_resc_4.wav     نحن هنا للمساعدة.   
4  dataset_amazigh/wav/rescue_wav/S1_resc_7.wav       نحتاج إلى حملك.   

      translation_english  
0       Is everyone okay?  
1       Do you need help?  
2  Where are you injured?  
3    We are here to help.  
4   We need to carry you.  
Small Talk DataFrame Sample:                                      filename        translation_arabic  \
0  dataset_amazigh/wav/conv_wav/S1_conv_1.wav         مرحبًا! كيف حالك؟   
1  dataset_amazigh/wav/conv_wav/S1_conv_2.wav    أنا بخير، شكرًا. وأنت؟   
2  dataset_amazigh/wav/conv_wav/S1_conv_3.wav                  ما اسمك؟   
3  dataset_amazigh/wav/conv_wav/S1

In [14]:

# Convert to Hugging Face datasets
def convert_to_dataset(df):
    return Dataset.from_pandas(df)

rescue_team_dataset = convert_to_dataset(rescue_team_df)
small_talk_dataset = convert_to_dataset(small_talk_df)

# Adding audio path to the datasets
rescue_team_dataset = rescue_team_dataset.cast_column("filename", Audio(sampling_rate=16000))
small_talk_dataset = small_talk_dataset.cast_column("filename", Audio(sampling_rate=16000))

# Combine datasets
combined_train_dataset = concatenate_datasets([rescue_team_dataset, small_talk_dataset])
combined_test_dataset = concatenate_datasets([rescue_team_dataset, small_talk_dataset])

# Check combined dataset sizes
print(f"Combined Train Dataset size: {len(combined_train_dataset)}")
print(f"Combined Test Dataset size: {len(combined_test_dataset)}")

Combined Train Dataset size: 130
Combined Test Dataset size: 130


In [15]:

# Create DatasetDict
combined_dataset = DatasetDict({
    "train": combined_train_dataset,
    "test": combined_test_dataset  
})

# Print dataset info to debug
print("Combined Dataset:", combined_dataset)


Combined Dataset: DatasetDict({
    train: Dataset({
        features: ['filename', 'translation_arabic', 'translation_english'],
        num_rows: 130
    })
    test: Dataset({
        features: ['filename', 'translation_arabic', 'translation_english'],
        num_rows: 130
    })
})


# Define Tokenizer and Model
We will define the tokenizer and model using the Seamless M4T model from Hugging Face.


In [16]:

# Load the Seamless M4T processor and model
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
pip install librosa soundfile


Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install transformers datasets sentencepiece


Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install datasets transformers sentencepiece


Note: you may need to restart the kernel to use updated packages.


# Preprocess Data
We need to define a preprocessing function to tokenize the inputs and labels.
before that we need to make the audios in the same length by adding padding


In [21]:

import pandas as pd
from datasets import Dataset, concatenate_datasets, Audio
from transformers import AutoProcessor, Wav2Vec2ForCTC, TrainingArguments, Seq2SeqTrainer
import librosa
import numpy as np
# Use a valid model identifier from the Hugging Face model hub
model_name_or_path = 'facebook/wav2vec2-base-960h'

# Initialize the processor
processor = AutoProcessor.from_pretrained(model_name_or_path)

# Initialize the model
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path)

# Calculate lengths of all audio files
lengths = []

def calculate_length(examples):
    print(examples["filename"]['path'])
    audio_path = examples["filename"]['path']
    audio_array, _ = librosa.load(audio_path, sr=16000)
    lengths.append(len(audio_array))
    return examples

combined_dataset.map(calculate_length, batched=False)

# Compute statistics
min_length = min(lengths)
max_length = max(lengths)
mean_length = int(np.mean(lengths))

print(f"Min length: {min_length}")
print(f"Max length: {max_length}")
print(f"Mean length: {mean_length}")

# Define the chosen length for padding (for example, mean length)
chosen_length = max_length


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

dataset_amazigh/wav/rescue_wav/S1_resc_1.wav
dataset_amazigh/wav/rescue_wav/S1_resc_2.wav
dataset_amazigh/wav/rescue_wav/S1_resc_3.wav
dataset_amazigh/wav/rescue_wav/S1_resc_4.wav
dataset_amazigh/wav/rescue_wav/S1_resc_7.wav
dataset_amazigh/wav/rescue_wav/S1_resc_8.wav
dataset_amazigh/wav/rescue_wav/S1_resc_9.wav
dataset_amazigh/wav/rescue_wav/S1_resc_10.wav
dataset_amazigh/wav/rescue_wav/S1_resc_11.wav
dataset_amazigh/wav/rescue_wav/S1_resc_12.wav
dataset_amazigh/wav/rescue_wav/S1_resc_13.wav
dataset_amazigh/wav/rescue_wav/S1_resc_14.wav
dataset_amazigh/wav/rescue_wav/S1_resc_15.wav
dataset_amazigh/wav/rescue_wav/S1_resc_16.wav
dataset_amazigh/wav/rescue_wav/S1_resc_17.wav
dataset_amazigh/wav/rescue_wav/S1_resc_18.wav
dataset_amazigh/wav/rescue_wav/S1_resc_19.wav
dataset_amazigh/wav/rescue_wav/S1_resc_20.wav
dataset_amazigh/wav/rescue_wav/S1_resc_21.wav
dataset_amazigh/wav/rescue_wav/S1_resc_22.wav
dataset_amazigh/wav/rescue_wav/S1_resc_23.wav
dataset_amazigh/wav/rescue_wav/S1_resc_24

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

dataset_amazigh/wav/rescue_wav/S1_resc_1.wav
dataset_amazigh/wav/rescue_wav/S1_resc_2.wav
dataset_amazigh/wav/rescue_wav/S1_resc_3.wav
dataset_amazigh/wav/rescue_wav/S1_resc_4.wav
dataset_amazigh/wav/rescue_wav/S1_resc_7.wav
dataset_amazigh/wav/rescue_wav/S1_resc_8.wav
dataset_amazigh/wav/rescue_wav/S1_resc_9.wav
dataset_amazigh/wav/rescue_wav/S1_resc_10.wav
dataset_amazigh/wav/rescue_wav/S1_resc_11.wav
dataset_amazigh/wav/rescue_wav/S1_resc_12.wav
dataset_amazigh/wav/rescue_wav/S1_resc_13.wav
dataset_amazigh/wav/rescue_wav/S1_resc_14.wav
dataset_amazigh/wav/rescue_wav/S1_resc_15.wav
dataset_amazigh/wav/rescue_wav/S1_resc_16.wav
dataset_amazigh/wav/rescue_wav/S1_resc_17.wav
dataset_amazigh/wav/rescue_wav/S1_resc_18.wav
dataset_amazigh/wav/rescue_wav/S1_resc_19.wav
dataset_amazigh/wav/rescue_wav/S1_resc_20.wav
dataset_amazigh/wav/rescue_wav/S1_resc_21.wav
dataset_amazigh/wav/rescue_wav/S1_resc_22.wav
dataset_amazigh/wav/rescue_wav/S1_resc_23.wav
dataset_amazigh/wav/rescue_wav/S1_resc_24

In [22]:
chosen_length

87576

In [23]:

# Preprocessing function with padding
def preprocess_function(examples):
    audio_paths = examples["filename"]
    inputs = {"input_values": [], "labels": []}
    for audio_path, text in zip(audio_paths, examples["translation_english"]):
        # Load and process the audio file
        
        audio_array, sampling_rate = librosa.load(audio_path['path'], sr=16000)
        
        # Pad the audio array to the chosen length
        if len(audio_array) > chosen_length:
            audio_array = audio_array[:chosen_length]
        else:
            audio_array = np.pad(audio_array, (0, chosen_length - len(audio_array)), 'constant')
        
        input_values = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt",padding= True).input_values[0]
        inputs["input_values"].append(input_values.numpy())
        
        # Ensure the text input is in the correct format (str or List[str])
        if isinstance(text, str):
            text = [text]  # Convert single string to list
        elif isinstance(text, list) and isinstance(text[0], list):
            text = [' '.join(t) for t in text]  # Flatten nested lists
        
        # Process text input to labels
        labels = processor(text=text, return_tensors="pt", padding=True).input_ids[0]
        inputs["labels"].append(labels.numpy())
    
    return inputs

print("Before mapping preprocess_function")

Before mapping preprocess_function


In [24]:
try:
    tokenized_train_dataset = combined_train_dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=combined_train_dataset.column_names
    )
    print("Training dataset preprocessing successful")
except Exception as e:
    print(f"Error during training dataset preprocessing: {e}")
    tokenized_train_dataset = None

try:
    tokenized_eval_dataset = combined_test_dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=combined_test_dataset.column_names
    )
    print("Evaluation dataset preprocessing successful")
except Exception as e:
    print(f"Error during evaluation dataset preprocessing: {e}")
    tokenized_eval_dataset = None

print("After mapping preprocess_function")


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Training dataset preprocessing successful


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Evaluation dataset preprocessing successful
After mapping preprocess_function


In [25]:
combined_train_dataset

Dataset({
    features: ['filename', 'translation_arabic', 'translation_english'],
    num_rows: 130
})

# Define Training Arguments and Metrics
We will set the training arguments for fine-tuning the model and define Word Error Rate (WER) as the evaluation metric.


In [29]:
from transformers import AutoProcessor, Wav2Vec2ForCTC, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate


# Define the evaluation metric
wer_metric = evaluate.load("wer", trust_remote_code=True)

def compute_metrics(pred):
    pred_ids = pred.predictions.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_ids = pred.label_ids
    # Replace -100 in the labels as we can't decode them
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}




In [30]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from transformers import Wav2Vec2Processor
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch


# Fine-Tune the Model
We will start the fine-tuning process using the trainer.


In [31]:

# Check if both datasets are successfully processed
if tokenized_train_dataset and tokenized_eval_dataset:
    # Initialize training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    print("before")
    # Initialize the data collator
    data_collator= DataCollatorCTCWithPadding(processor, padding= True)
    # Initialize the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=processor,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    
    print("after")


    # Start training
    trainer.train()

    # Evaluate the model
    results = trainer.evaluate()
    print(f"Word Error Rate: {results['eval_wer']:.2f}")
else:
    print("Dataset not available for preprocessing and training")



before
after




Epoch,Training Loss,Validation Loss,Wer
1,No log,3237.006348,1.007364
2,No log,2874.892334,1.0
3,No log,2657.45874,1.0


Word Error Rate: 1.00


# Save the Model
After fine-tuning, we will save the model and processor.


In [32]:
# Save the fine-tuned model
model.save_pretrained("./fine-tuned-seamless-m4t")
processor.save_pretrained("./fine-tuned-seamless-m4t")

print("Model and processor saved successfully.")


Model and processor saved successfully.


# Testing with some wav files

In [34]:
from transformers import Wav2Vec2ForCTC, AutoProcessor

# Load the fine-tuned model and processor
model_path = "./fine-tuned-seamless-m4t"
model = Wav2Vec2ForCTC.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)
import torch
import librosa

# Function to translate audio files
def translate_audio(audio_path):
    # Load the audio file
    audio_array, sampling_rate = librosa.load(audio_path, sr=16000)
    
    # Preprocess the audio file
    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt", padding="longest").input_values
    
    # Perform inference
    with torch.no_grad():
        logits = model(inputs).logits
    
    # Decode the predictions
    predicted_ids = torch.argmax(logits, dim=-1)
    translation = processor.batch_decode(predicted_ids)
    
    return translation[0]
# List of test audio files (paths from your dataset)
test_audio_files = [
    "dataset_amazigh/wav/conv_wav/S3_conv_1.wav",
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav"
]

# Translate and print the results
for audio_file in test_audio_files:
    translation = translate_audio(audio_file)
    print(f"Translation for {audio_file}: {translation}")


Translation for dataset_amazigh/wav/conv_wav/S3_conv_1.wav: SEDEMONICL ANCONTUGITE
Translation for dataset_amazigh/wav/rescue_wav/S1_resc_2.wav: IELESMAQITAN


In [35]:
%%capture
!pip install fairseq2
!pip install pydub sentencepiece
!pip install git+https://github.com/facebookresearch/seamless_communication.git

In [83]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy as np
import soundfile as sf
import torchaudio
import torch

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the fine-tuned model and processor from the specified local path
model_path = "./fine-tuned-seamless-m4t"
model = Wav2Vec2ForCTC.from_pretrained(model_path)
processor = Wav2Vec2Processor.from_pretrained(model_path)

# List of test audio files (paths from your dataset)
test_audio_files = [
    "dataset_amazigh/wav/conv_wav/S3_conv_1.wav",
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav"
]

# Function to resample audio to 16kHz if necessary
def resample_audio(audio_path, target_sr=16000):
    audio, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(audio)
    return audio, target_sr

# Function to perform direct Amazigh-to-Arabic translation using the loaded model and processor
def translate_audio(audio_path):
    # Load the audio file
    audio_array, sampling_rate = torchaudio.load(audio_path, normalize=True)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        audio_array = resampler(audio_array)
    
    # Preprocess the audio file
    inputs = processor(audio_array.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding="longest").input_values
    
    # Perform inference
    with torch.no_grad():
        logits = model(inputs).logits
    
    # Decode the predictions
    predicted_ids = torch.argmax(logits, dim=-1)
    translation = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    
    return translation[0]

# Translate and display the results
for audio_file in test_audio_files:
    print(f"Audio file: {audio_file}")
    display(Audio(audio_file, rate=16000, autoplay=False, normalize=True))

    # Perform direct translation from Amazigh to Arabic
    translation = translate_audio(audio_file)
    
    print(f"Translated text: {translation}")
    print()


Audio file: dataset_amazigh/wav/conv_wav/S3_conv_1.wav


Translated text: SEDAMONICO AN CONTUGIT

Audio file: dataset_amazigh/wav/rescue_wav/S1_resc_2.wav


Translated text: IEREMAQITON



In [44]:
from seamless_communication.inference import Translator
import inspect

# Inspect the __init__ method of the Translator class to see its parameters
print(inspect.signature(Translator.__init__))


(self, model_name_or_card: Union[str, fairseq2.assets.card.AssetCard], vocoder_name_or_card: Union[str, fairseq2.assets.card.AssetCard, NoneType], device: torch.device, text_tokenizer: Optional[fairseq2.data.text.text_tokenizer.TextTokenizer] = None, apply_mintox: bool = False, dtype: torch.dtype = torch.float16, input_modality: Optional[seamless_communication.inference.translator.Modality] = None, output_modality: Optional[seamless_communication.inference.translator.Modality] = None)




# Test/Demo on Inference Model


In [101]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy
import soundfile
import torchaudio
import torch

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover

# Initialize a Translator object with the correct parameters
model_name = "seamlessM4T_v2_large"
vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"

translator = Translator(
    model_name_or_card=model_name,
    vocoder_name_or_card=vocoder_name,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=torch.float16
)

# Define target languages
tgt_langs = ["arb"]

# List of test audio files (paths from your dataset)
test_audio_files = [
    "dataset_amazigh/wav/conv_wav/S1_conv_37.wav",
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav"
]

# Function to resample audio to 16kHz if necessary
def resample_audio(audio_path, target_sr=16000):
    audio, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(audio)
    return audio, target_sr

# Translate and display the results
for audio_file in test_audio_files:
    print(f"Audio file: {audio_file}")
    display(Audio(audio_file, rate=16000, autoplay=False, normalize=True))

    for tgt_lang in tgt_langs:
        # Resample audio if necessary
        audio, sr = resample_audio(audio_file)
        audio_file_resampled = audio_file.replace('.wav', '_16k.wav')
        torchaudio.save(audio_file_resampled, audio, sr)

        # Perform translation
        text_output, _ = translator.predict(
            input=audio_file_resampled,
            task_str="s2tt",
            tgt_lang=tgt_lang,
        )

        print(f"Translated text in {tgt_lang}: {text_output[0]}")
        print()

        # Optionally save the translated audio if needed
        # This part is a placeholder as it requires a TTS system to generate speech from text
        # out_file = f"/path/to/save/translated_{tgt_lang}_{audio_file.split('/')[-1]}"
        # torchaudio.save(out_file, translated_speech, sample_rate=16000)

        # Display the translated audio if available
        # audio_play = Audio(out_file, rate=16000, autoplay=False, normalize=True)
        # display(audio_play)
        print()


Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


Audio file: dataset_amazigh/wav/conv_wav/S1_conv_37.wav


Translated text in arb: استثنيت من المليئة لأنني لم أجد ثنيان


Audio file: dataset_amazigh/wav/rescue_wav/S1_resc_2.wav


Translated text in arb: لست بحاجة لتعاونك.




In [75]:
pip install rouge_score nltk


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24956 sha256=d19101657eee636e02a0ddb299a861f05bdbc4d492e962ada156c25dace89390
  Stored in directory: /home/user/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge_score
Installing collected packages: nltk, rouge_score
Successfully installed nltk-3.8.1 rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [94]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import torchaudio
import torch

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover
import evaluate

# Initialize the Translator object with the correct parameters
translator = Translator(
    model_name_or_card="seamlessM4T_v2_large",  # Use an appropriate model name if needed
    vocoder_name_or_card="vocoder_36langs",  # Assuming this is the correct vocoder for the model
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=torch.float16
)

# Define target languages
tgt_langs = ["arb"]

# Load evaluation metrics using the `evaluate` library
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")
ter_metric = evaluate.load("ter")

# List of test audio files 
test_audio_files = [
    "dataset_amazigh/wav/conv_wav/S3_conv_1.wav",
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav"
]

# Ground truth translations ( actual translations from CSV)
ground_truth_translations = {
    "dataset_amazigh/wav/conv_wav/S3_conv_1.wav": "هل تحتاج إلى مساعدة؟",  
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav": "نحن هنا للمساعدة"  
}

# Function to resample audio to 16kHz if necessary
def resample_audio(audio_path, target_sr=16000):
    audio, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(audio)
    return audio, target_sr

# Function to perform direct Amazigh-to-Arabic translation using the Translator
def translate_audio(audio_path, tgt_lang):
    # Resample audio if necessary
    audio, sr = resample_audio(audio_path)
    audio_file_resampled = audio_path.replace('.wav', '_16k.wav')
    torchaudio.save(audio_file_resampled, audio, sr)
    
    # Perform translation
    text_output, _ = translator.predict(
        input=audio_file_resampled,
        task_str="s2tt",  # speech-to-text-to-text translation
        tgt_lang=tgt_lang
    )
    return text_output[0]

# Translate and display the results
for audio_file in test_audio_files:
    for tgt_lang in tgt_langs:
        # Perform translation
        translation = translate_audio(audio_file, tgt_lang)
        
        # Ensure translation is a string and not CString
        translation = str(translation)

        # Compute and print evaluation metrics using ground truth translations
        label_str = ground_truth_translations[audio_file]  # Retrieve the actual reference text
        
        # Compute evaluation metrics
        wer = wer_metric.compute(predictions=[translation], references=[label_str])
        cer = cer_metric.compute(predictions=[translation], references=[label_str])
        bleu = bleu_metric.compute(predictions=[translation], references=[[label_str]])
        rouge = rouge_metric.compute(predictions=[translation], references=[label_str])
        meteor = meteor_metric.compute(predictions=[translation], references=[label_str])
        ter = ter_metric.compute(predictions=[translation], references=[label_str])
        
        # Print results
        print(f"Audio file: {audio_file}")
        print(f"Translated text in {tgt_lang}: {translation}")
        print(f"WER: {wer:.4f}")
        print(f"CER: {cer:.4f}")
        print(f"BLEU: {bleu['bleu']:.4f}")
       
        print()


Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_36langs. Set `force` to `True` to download again.
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
2024-05-19 06:54:28,117 INFO -- absl: Using default tokenizer.
2024-05-19 06:54:28,431 INFO -- absl: Using default tokenizer.


Audio file: dataset_amazigh/wav/conv_wav/S3_conv_1.wav
Translated text in arb: السلام عليكم من كان تيجيت؟
WER: 1.2500
CER: 1.0000
BLEU: 0.0000

Audio file: dataset_amazigh/wav/rescue_wav/S1_resc_2.wav
Translated text in arb: لست بحاجة لتعاونك.
WER: 1.0000
CER: 0.8750
BLEU: 0.0000



In [92]:
import torchaudio
import torch
from IPython.display import Audio, display
from seamless_communication.inference import Translator
import evaluate

# Initialize the Translator object with the correct parameters
translator = Translator(
    model_name_or_card="seamlessM4T_v2_large",  # Use an appropriate model name if needed
    vocoder_name_or_card="vocoder_36langs",  # Assuming this is the correct vocoder for the model
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=torch.float16
)

# Define target language
tgt_lang = "arb"

# Load WER evaluation metric using the `evaluate` library
wer_metric = evaluate.load("wer")

# List of test audio files (paths from your dataset)
test_audio_files = [
    "dataset_amazigh/wav/conv_wav/S3_conv_1.wav",
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav"
]

# Ground truth translations (replace these with actual translations from your CSV)
ground_truth_translations = {
    "dataset_amazigh/wav/conv_wav/S3_conv_1.wav": "هل تحتاج إلى مساعدة؟",  # Replace with actual Arabic translation
    "dataset_amazigh/wav/rescue_wav/S1_resc_2.wav": "نحن هنا للمساعدة"  # Replace with actual Arabic translation
}

# Function to resample audio to 16kHz if necessary
def resample_audio(audio_path, target_sr=16000):
    audio, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(audio)
    return audio, target_sr

# Function to perform direct Amazigh-to-Arabic translation using the Translator
def translate_audio(audio_path, tgt_lang):
    # Resample audio if necessary
    audio, sr = resample_audio(audio_path)
    audio_file_resampled = audio_path.replace('.wav', '_16k.wav')
    torchaudio.save(audio_file_resampled, audio, sr)
    
    # Perform translation
    text_output, _ = translator.predict(
        input=audio_file_resampled,
        task_str="s2tt",  # speech-to-text-to-text translation
        tgt_lang=tgt_lang
    )
    return str(text_output[0])  # Ensure translation is a string

# Translate and display the results
for audio_file in test_audio_files:
    # Perform translation
    translation = translate_audio(audio_file, tgt_lang)
    
    # Compute and print WER using ground truth translations
    label_str = ground_truth_translations[audio_file]  # Retrieve the actual reference text
    wer = wer_metric.compute(predictions=[translation], references=[label_str])
    
    # Print results
    print(f"Audio file: {audio_file}")
    print(f"Translated text in {tgt_lang}: {translation}")
    print(f"WER: {wer:.4f}")
    print()


Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_36langs. Set `force` to `True` to download again.


Audio file: dataset_amazigh/wav/conv_wav/S3_conv_1.wav
Translated text in arb: السلام عليكم من كان تيجيت؟
WER: 1.2500

Audio file: dataset_amazigh/wav/rescue_wav/S1_resc_2.wav
Translated text in arb: لست بحاجة لتعاونك.
WER: 1.0000

