In [1]:
import gc
import torch
# Collect unreachable Python objects first so any CUDA tensors they still hold
# are actually freed; only then can empty_cache() hand those (now unused)
# cached blocks back to the GPU driver.
gc.collect()
torch.cuda.empty_cache()

0

In [2]:
import os
import json
import torch
import numpy as np
import torchaudio
from datasets import Dataset, Audio
from transformers import (
    SeamlessM4TTokenizer,
    SeamlessM4TProcessor,
    SeamlessM4TModel,
    Trainer,
    TrainingArguments
)
import evaluate
import logging
from dataclasses import dataclass
from typing import Dict, List, Union, Any
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import logging

# Configure root logging for the notebook: INFO and above go to both the
# console and a file named 'results' (rename to 'results.txt' if desired).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),  # Console output
        # mode='w' starts a fresh file each time logging is (re)configured;
        # an explicit UTF-8 encoding keeps non-ASCII messages writable
        # regardless of the platform's default encoding.
        logging.FileHandler('results', mode='w', encoding='utf-8'),
    ],
    # Without force=True, basicConfig() is a silent no-op whenever the root
    # logger already has handlers - which is the case every time this cell is
    # re-run in a live kernel, so edits to the settings above would be ignored.
    force=True,
)

logger = logging.getLogger(__name__)

In [4]:
# Root logging (console + 'results' file) is configured once in the previous
# cell. A second logging.basicConfig() call here would be a silent no-op,
# because the root logger already has handlers by then, so only the logger
# binding is kept.
logger = logging.getLogger(__name__)

# Only show errors from the transformers library.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

# Constants
MODEL_ID = "facebook/hf-seamless-m4t-medium"  # Use medium for faster training or large for better quality
DATA_PATH = 's2t_manifest.json'  # Path to your JSONL file (one JSON object per line)
OUTPUT_DIR = "./seamless_m4t_finetuned"
BATCH_SIZE = 1  # Adjust based on your GPU memory
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3
GRADIENT_ACCUMULATION_STEPS = 8
MAX_OUTPUT_LENGTH = 1024  # Upper bound on newly generated tokens during evaluation
SRC_LANG = "eng"
TGT_LANG = "hin"
SAMPLING_RATE = 16000  # Audio is resampled to this rate (Hz) when loaded

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

2025-03-23 00:00:15,391 - __main__ - INFO - Using device: cuda


In [5]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Data collator for speech-to-text batches.

    Turns a list of dataset rows into one padded batch: the decoded audio is
    run through the processor's audio path, and the target transcripts are
    tokenized as *target-language* text to build the ``labels`` tensor.

    Attributes:
        processor: Callable used for the audio path; it must also expose a
            ``tokenizer`` attribute used for the labels.
    """
    processor: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into model inputs plus ``labels``.

        Args:
            features: Rows providing ``feature["audio"]["array"]`` (waveform)
                and ``feature["hindi_text"]`` (target transcript).

        Returns:
            The processor's audio batch with an added ``"labels"`` entry in
            which padded positions are set to -100.
        """
        # Extract audio and text features
        audio_arrays = [feature["audio"]["array"] for feature in features]
        target_texts = [feature["hindi_text"] for feature in features]

        # Process inputs - passing audio arrays directly
        batch = self.processor(
            audios=audio_arrays,
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt",
            padding=True,
            src_lang=SRC_LANG
        )

        # Tokenize the transcripts as TARGET text. Going through
        # processor(text=...) encodes them in source mode, i.e. prefixed with
        # the source-language token, so the decoder would be trained on a
        # different language prefix than the one used at generation time.
        tokenizer = self.processor.tokenizer
        labels_batch = tokenizer(
            text_target=target_texts,
            tgt_lang=TGT_LANG,
            return_tensors="pt",
            padding=True,
        )

        # Replace padding token id with -100 for loss calculation
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )

        # Target-mode encodings start with EOS. Drop it when every row has it.
        # NOTE(review): this assumes the model prepends its decoder-start token
        # itself when it derives decoder inputs from the labels (as
        # SeamlessM4T's shift_tokens_right does) - confirm against the
        # installed transformers version.
        eos_token_id = getattr(tokenizer, "eos_token_id", None)
        if (
            eos_token_id is not None
            and labels.size(1) > 0
            and bool((labels[:, 0] == eos_token_id).all())
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Load and prepare the dataset
def prepare_dataset(jsonl_file):
    """Load a JSONL manifest and return a train/test split with decoded audio.

    Each non-blank line must be a JSON object providing
    ``source.audio_local_path``, ``source.text``, ``source.id`` and
    ``target.text``.

    Args:
        jsonl_file: Path to the manifest (one JSON object per line).

    Returns:
        The result of ``train_test_split(test_size=0.1, seed=42)`` on a dataset
        with columns ``audio``, ``hindi_text``, ``english_text`` and ``id``;
        ``audio`` is cast to ``Audio(sampling_rate=SAMPLING_RATE)``.

    Raises:
        ValueError: If the manifest contains no records.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected fields.
    """
    logger.info(f"Loading dataset from {jsonl_file}")

    # Load JSONL data. Blank lines (e.g. a trailing empty line) are skipped
    # because json.loads() would otherwise raise on them.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    if not data:
        # Fail early with a clear message instead of an opaque error from the
        # split further down.
        raise ValueError(f"No records found in {jsonl_file}")

    # Convert to the column-oriented format expected by the datasets library
    dataset_dict = {
        "audio": [item["source"]["audio_local_path"] for item in data],
        "hindi_text": [item["target"]["text"] for item in data],
        "english_text": [item["source"]["text"] for item in data],
        "id": [item["source"]["id"] for item in data]
    }

    # Create dataset
    dataset = Dataset.from_dict(dataset_dict)

    # Treat the path column as audio at the shared sampling rate
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

    # Split dataset (fixed seed so the split is reproducible)
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    return dataset

In [6]:
# Set up decoder parameters for the model
def setup_decoder_params(model, tokenizer):
    """Build the decoder prompt ids for the target language.

    Args:
        model: Passed through untouched; nothing is written to its config.
        tokenizer: Object whose ``convert_tokens_to_ids`` maps the language
            token ``"__<TGT_LANG>__"`` to its id.

    Returns:
        A ``(model, forced_decoder_ids)`` tuple, where ``forced_decoder_ids``
        is ``[[0, <target language token id>]]``.
    """
    lang_id = tokenizer.convert_tokens_to_ids("__{}__".format(TGT_LANG))

    # The prompt is handed back to the caller instead of being stored on the
    # model, so it can be supplied explicitly wherever it is needed.
    decoder_prompt = [[0, lang_id]]
    return model, decoder_prompt

# Custom training class
class CustomSeq2SeqTrainer(Trainer):
    """Trainer that adds a generation-based sacreBLEU score to evaluation.

    After the regular ``Trainer.evaluate`` pass, the evaluation data is
    iterated a second time, text is generated for every batch, and the decoded
    hypotheses are scored against the decoded labels.
    """

    def __init__(self, forced_decoder_ids=None, processor=None, *args, **kwargs):
        """Store generation helpers and forward everything else to ``Trainer``.

        Args:
            forced_decoder_ids: Decoder prompt ids passed to ``generate`` as
                ``decoder_input_ids``.
            processor: Used to decode token ids; its ``tokenizer.pad_token_id``
                replaces the -100 placeholders in the labels.
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.forced_decoder_ids = forced_decoder_ids
        self.processor = processor  # Store the processor

    def evaluate(self, *args, **kwargs):
        """Run the standard evaluation and add ``<prefix>_bleu`` to the metrics.

        Accepts the same arguments as ``Trainer.evaluate`` and forwards them
        unchanged.

        Returns:
            The metrics dict from ``Trainer.evaluate`` with an extra
            ``"<metric_key_prefix>_bleu"`` entry (``"eval_bleu"`` by default).
        """
        # Run regular evaluation
        metrics = super().evaluate(*args, **kwargs)

        # Mirror Trainer.evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        # so BLEU is computed on the same data and reported under the same
        # prefix as the loss, instead of always using self.eval_dataset/"eval".
        eval_dataset = args[0] if args else kwargs.get("eval_dataset")
        metric_key_prefix = args[2] if len(args) > 2 else kwargs.get("metric_key_prefix", "eval")

        if isinstance(eval_dataset, dict):
            # NOTE(review): for a dict of datasets, Trainer.evaluate is expected
            # to have already invoked this override once per dataset - confirm
            # for the installed transformers version.
            return metrics

        # Add BLEU score calculation
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        all_preds = []
        all_labels = []

        self.model.eval()
        for batch in tqdm(eval_dataloader, desc="Computing sacrebleu score"):
            batch = {k: v.to(self.args.device) for k, v in batch.items()}

            # Pass the attention mask along with the features so padded audio
            # frames are ignored when the eval batch size is larger than 1.
            model_inputs = {"input_features": batch["input_features"]}
            if batch.get("attention_mask") is not None:
                model_inputs["attention_mask"] = batch["attention_mask"]

            with torch.no_grad():
                # Generate predictions
                # NOTE(review): SeamlessM4T's generate() appears to give
                # tgt_lang priority over decoder_input_ids - confirm whether
                # forced_decoder_ids has any effect here.
                generated = self.model.generate(
                    **model_inputs,
                    tgt_lang=TGT_LANG,
                    max_new_tokens=MAX_OUTPUT_LENGTH,
                    num_beams=1,
                    decoder_input_ids=self.forced_decoder_ids,
                    generate_speech=False,
                    return_dict_in_generate=True
                )

            # Accept both a structured output (with .sequences) and a plain
            # tensor of token ids.
            sequences = generated.sequences if hasattr(generated, "sequences") else generated

            # Add predictions and labels to our lists
            all_preds.extend(sequences.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

        # Replace -100 in labels with pad token id so they can be decoded
        pad_token_id = self.processor.tokenizer.pad_token_id
        all_labels_processed = [
            np.where(seq != -100, seq, pad_token_id).astype(np.int64)
            for seq in all_labels
        ]

        # Decode using processor
        decoded_preds = self.processor.batch_decode(all_preds, skip_special_tokens=True)
        decoded_labels = self.processor.batch_decode(all_labels_processed, skip_special_tokens=True)

        # Compute BLEU score (one reference per prediction)
        metric = evaluate.load("sacrebleu")
        bleu_result = metric.compute(predictions=decoded_preds,
                                     references=[[label] for label in decoded_labels])

        # Add BLEU score to metrics
        metrics.update({f"{metric_key_prefix}_bleu": bleu_result["score"]})
        logger.info(f"sacrebleu score: {bleu_result['score']}")

        return metrics

In [7]:
# Main function
def main():
    """Fine-tune the SeamlessM4T model on the speech-to-text manifest.

    Loads the tokenizer, processor and model from ``MODEL_ID``, builds the
    train/test split from ``DATA_PATH``, trains with ``CustomSeq2SeqTrainer``
    (evaluating and checkpointing once per epoch, keeping the checkpoint with
    the best ``eval_bleu``), and saves the final model to ``OUTPUT_DIR``.

    Raises:
        Exception: Any error raised along the way is logged together with its
            traceback and then re-raised.
    """
    try:
        # Create output directory
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # Load model and processor
        logger.info(f"Loading model and processor from {MODEL_ID}")
        tokenizer = SeamlessM4TTokenizer.from_pretrained(MODEL_ID)
        processor = SeamlessM4TProcessor.from_pretrained(
            MODEL_ID,
            src_lang=SRC_LANG,
            tgt_lang=TGT_LANG
        )
        model = SeamlessM4TModel.from_pretrained(MODEL_ID)

        # Set up decoder parameters
        model, forced_decoder_ids = setup_decoder_params(model, tokenizer)

        # Load dataset
        dataset = prepare_dataset(DATA_PATH)
        logger.info(f"Dataset loaded: {len(dataset['train'])} training samples, {len(dataset['test'])} test samples")

        # Create data collator
        data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

        # Configure training arguments
        logger.info("Setting up training arguments")
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            learning_rate=LEARNING_RATE,
            num_train_epochs=NUM_EPOCHS,
            fp16=(device == "cuda"),  # mixed precision only when running on GPU
            # NOTE(review): newer transformers releases rename this argument to
            # `eval_strategy` - confirm against the installed version.
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f"{OUTPUT_DIR}/logs",
            logging_steps=100,
            save_total_limit=2,
            # The collator reads the raw "audio"/"hindi_text" columns, so they
            # must not be dropped before collation.
            remove_unused_columns=False,
            load_best_model_at_end=True,
            metric_for_best_model="eval_bleu",
            greater_is_better=True,
            report_to="tensorboard",
            gradient_checkpointing=True
        )

        # Create custom trainer with generation capabilities
        logger.info("Creating trainer")
        trainer = CustomSeq2SeqTrainer(
            forced_decoder_ids=forced_decoder_ids,
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=data_collator,
            processor=processor,  # Pass the processor to the trainer
        )

        # Train model
        logger.info("Starting training")
        trainer.train()

        # Save model
        logger.info(f"Saving model to {OUTPUT_DIR}")
        trainer.save_model(OUTPUT_DIR)

        logger.info("Training complete!")

    except Exception as e:
        # logger.exception records the traceback as well, so the log file
        # shows where the failure happened and not just its message.
        logger.exception(f"Error during training: {str(e)}")
        raise

if __name__ == "__main__":
    main()


2025-03-23 00:00:15,429 - __main__ - INFO - Loading model and processor from facebook/hf-seamless-m4t-medium
2025-03-23 00:00:24,666 - __main__ - INFO - Loading dataset from s2t_manifest.json
2025-03-23 00:00:27,755 - __main__ - INFO - Dataset loaded: 63250 training samples, 7028 test samples
2025-03-23 00:00:27,756 - __main__ - INFO - Setting up training arguments
2025-03-23 00:00:27,854 - __main__ - INFO - Creating trainer
2025-03-23 00:00:29,499 - __main__ - INFO - Starting training
2025-03-23 00:00:59,085 - __main__ - ERROR - Error during training: CUDA out of memory. Tried to allocate 4.37 GiB. GPU 0 has a total capacity of 39.39 GiB of which 506.19 MiB is free. Process 10937 has 1.90 GiB memory in use. Process 11923 has 1.89 GiB memory in use. Process 57423 has 13.31 GiB memory in use. Including non-PyTorch memory, this process has 21.76 GiB memory in use. Of the allocated memory 20.01 GiB is allocated by PyTorch, and 1.24 GiB is reserved by PyTorch but unallocated. If reserved b

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.37 GiB. GPU 0 has a total capacity of 39.39 GiB of which 506.19 MiB is free. Process 10937 has 1.90 GiB memory in use. Process 11923 has 1.89 GiB memory in use. Process 57423 has 13.31 GiB memory in use. Including non-PyTorch memory, this process has 21.76 GiB memory in use. Of the allocated memory 20.01 GiB is allocated by PyTorch, and 1.24 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)