In [7]:
import torch
import torchaudio
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import logging
import os

In [2]:

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
logger.info("Loading pre-trained Wav2Vec2 processor and model...")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

2024-10-30 01:04:21,528 - INFO - Loading pre-trained Wav2Vec2 processor and model...
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
logger.info(f"Using device: {device}")


2024-10-30 01:04:28,319 - INFO - Using device: cuda


In [6]:
data = df = pd.read_csv("/blue/ufdatastudios/ahmed.waseem/ctc/meta_speaker.csv")

In [None]:
data = df = pd.read_csv("/blue/ufdatastudios/ahmed.waseem/ctc/meta_speaker.csv")
data = data[data['duration'].apply(lambda x: x >= 5)]
audio_folder = "/blue/ufdatastudios/ahmed.waseem/processed_audio"
data["audio_filepath"] = data["audio_filepath"].apply(lambda x: os.path.join(audio_folder, x))
data.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["audio_filepath"] = data["audio_filepath"].apply(lambda x: os.path.join(audio_folder, x))


In [9]:
def segment_audio_ctc(row):
    try:
        logger.info(f"Processing file: {row['audio_filepath']}")
        waveform, sample_rate = torchaudio.load(row['audio_filepath'])

        if sample_rate != 16000:
            logger.info(f"Resampling audio from {sample_rate}Hz to 16000Hz")
            waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
            sample_rate = 16000

        input_values = processor(waveform.squeeze(), sampling_rate=sample_rate, return_tensors="pt").input_values
        input_values = input_values.to(device)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        words = transcription.split()
        total_duration = row['duration']
        word_duration = total_duration / len(words)

        word_segments = []
        for idx, word in enumerate(words):
            start_time = idx * word_duration
            end_time = start_time + word_duration
            word_segments.append((word, start_time, end_time))

        return word_segments
    except Exception as e:
        logger.error(f"Error processing file {row['audio_filepath']}: {e}")
        return None


In [10]:
logger.info("Applying segmentation to each row in the dataset...")
data['word_segments'] = data.apply(segment_audio_ctc, axis=1)

2024-10-30 01:06:41,992 - INFO - Applying segmentation to each row in the dataset...
2024-10-30 01:06:41,994 - INFO - Processing file: /blue/ufdatastudios/ahmed.waseem/processed_audio/audio/281474976886709_f3213_chunk_5.wav
2024-10-30 01:06:42,975 - INFO - Processing file: /blue/ufdatastudios/ahmed.waseem/processed_audio/audio/281474976886709_f3213_chunk_13.wav
2024-10-30 01:06:42,992 - INFO - Processing file: /blue/ufdatastudios/ahmed.waseem/processed_audio/audio/281474976884126_f3274_chunk_3.wav
2024-10-30 01:06:43,008 - INFO - Processing file: /blue/ufdatastudios/ahmed.waseem/processed_audio/audio/281474976887171_f3278_chunk_6.wav
2024-10-30 01:06:43,024 - INFO - Processing file: /blue/ufdatastudios/ahmed.waseem/processed_audio/audio/281474976884085_f3239_chunk_0.wav
2024-10-30 01:06:43,045 - INFO - Processing file: /blue/ufdatastudios/ahmed.waseem/processed_audio/audio/281474976894273_f3279_chunk_12.wav
2024-10-30 01:06:43,061 - INFO - Processing file: /blue/ufdatastudios/ahmed.was

In [11]:
logger.info("Segmentation completed. Saving the result to CSV...")
data[['audio_filepath', 'text', 'word_segments']].to_csv('segmented_audio_results.csv', index=False)
logger.info("Results saved to segmented_audio_results.csv")


2024-10-30 01:07:40,270 - INFO - Segmentation completed. Saving the result to CSV...
2024-10-30 01:07:40,417 - INFO - Results saved to segmented_audio_results.csv
