In [None]:
# @title Install Dependencies
%%capture
# !pip install git+https://github.com/huggingface/transformers
!pip install transformers
!pip install pyctcdecode
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install accelerate
!pip install datasets
!pip install pyannote.audio

In [None]:
import os
import glob
import pandas as pd

import torch
import librosa

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import AutoProcessor, AutoModelForCTC, AutoFeatureExtractor
from transformers import Wav2Vec2ProcessorWithLM, pipeline

from datasets import load_dataset, Audio, Dataset

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from getpass import getpass

huggingface_dir = os.path.expanduser("~/.huggingface/")
os.makedirs(huggingface_dir, exist_ok=True)

# SECURITY: never hardcode an access token in the notebook. The token that was
# previously committed here must be treated as leaked and revoked.
# Read it from the HF_TOKEN environment variable, or prompt for it interactively.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

token_path = os.path.join(huggingface_dir, "token")
with open(token_path, "w") as f:
    f.write(hf_token)
# Keep the stored credential readable by the current user only.
os.chmod(token_path, 0o600)


# **With Adapters**

In [None]:
import time
def transcribe_audio(input_file,
                     target_lang,
                     device,
                     model_id="Sunbird/sunbird-mms",
                     chunk_length_s=10,
                     stride_length_s=(4, 2),
                     return_timestamps="word"):
    """
    Transcribes audio from the input file using the Sunbird ASR model.

    A new pipeline is built on every call, then the tokenizer language and the
    matching model adapter are switched to `target_lang` before transcribing.

    Args:
        input_file (str): Path to the audio file for transcription.
        target_lang (str): Target language for transcription.
            'ach' - Acholi
            'lug' - Luganda
            'teo' - Ateso
            'lgg' - Lugbara
        device (str or torch.device): Device for running the model (e.g., 'cpu', 'cuda').
        model_id (str, optional): ID of the asr model. Defaults to "Sunbird/sunbird-mms".
        chunk_length_s (int, optional): Length of audio chunks in seconds. Defaults to 10.
        stride_length_s (tuple, optional): Stride passed to the pipeline together with
            `chunk_length_s`. Defaults to (4, 2).
        return_timestamps (optional): Forwarded unchanged to the pipeline call.
            Defaults to "word".

    Returns:
        dict: The pipeline output, containing the transcription under 'text'.
            Example: {'text': 'Transcribed text here.'}
            NOTE(review): with timestamps enabled the pipeline is expected to also
            return a 'chunks' list (see the transformers ASR pipeline docs).
    """
    # NOTE: the pipeline (and therefore the model) is re-created on each call.
    pipe = pipeline(model=model_id, device=device)
    pipe.tokenizer.set_target_lang(target_lang)
    pipe.model.load_adapter(target_lang)

    # Forward the caller's `return_timestamps` instead of a hardcoded "word".
    return pipe(
        input_file,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_length_s,
        return_timestamps=return_timestamps,
    )

In [None]:
# Transcribe the sample recording with the Luganda ('lug') adapter.
# `transcription_result` is read again by later cells in this notebook.
input_file_path = "/content/poll9_30_sec.wav"
target_language = "lug"
transcription_result = transcribe_audio(input_file_path, target_language, device)
# Bare last expression so the notebook displays the result.
transcription_result

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

{'text': 'a kukaako ane amanya neddoboozi gimi ya musasenkanga nga nze kyapesoni wa disitulict mumens cansel e ruweero zisanyisa okulaba mu kyala gimeya webale kugamikwano gyaffe abatruk effe mbagenda maaso nokunoonyereza ku zimu ku nsonga ezikosa obulamu bwaffe ebyenfuna baffe nebirala ebisinga kw ebyo omulungi ogwo tubadde nekibuuzo ekibadde kitambula ngekibuuzo kiri nti twagala okumanya eyo mu kitundu kyo kiki kyolowooza ekivuddeko abaana bano abali wakati wemyaka kumi esatu ne kumi emwenda okufuna embuto ensonga ezo ziruddemu nnyo ate era ozitegeerera ddala bulungi nnyo nkakasa bwe tutandika okwogera ku baana okufuna embuto mu kitundu kyo gwe nga akola ku nsonga zabakyala oteeka okuba nga ozirabye oziwulidde ozikozeeko awuliriza yali agadde okumanyaa nga bwe mbagambye envudde luweero era omulama gwe tulina olwaleeroyinza okuba sikinnyonnyola bulungi ngomukugueeza omwana atanetuuka nolubutoemyaka gye tusinze okulaba okuviira ddala ku myaka ogambye kumi nesatukmeskdnabakkumi nogumu k

# **Diarization**

In [None]:
import numpy as np
import pandas as pd
from pyannote.audio import Pipeline
from typing import Optional, Union
import torch
import numpy as np

# from .audio import load_audio, SAMPLE_RATE

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


In [None]:
device

device(type='cuda')

In [None]:
# Sample rate (Hz) that audio is resampled to by default.
SAMPLE_RATE = 16000


def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Read an audio file as a mono float32 waveform at the requested sample rate.

    Parameters
    ----------
    file: str
        Path of the audio file to read.

    sr: int
        Sample rate passed to librosa as the target rate.

    Returns
    -------
    The waveform returned by ``librosa.load`` (requested as mono, float32).

    Raises
    ------
    RuntimeError
        If librosa fails for any reason; the original exception is chained.
    """
    try:
        # librosa does the resampling and the down-mix to mono for us.
        waveform, _loaded_sr = librosa.load(file, sr=sr, mono=True, dtype=np.float32)
    except Exception as err:
        raise RuntimeError(f"Failed to load audio with librosa: {err}") from err
    else:
        return waveform

In [None]:
class DiarizationPipeline:
    """
    A pipeline for performing speaker diarization on audio data.

    This class initializes with a pretrained diarization model and can be called
    with an audio file or waveform to perform diarization, returning a DataFrame
    with the start and end times for each speaker segment.

    Attributes:
        model (Pipeline): The loaded diarization model ready for inference.
    """

    def __init__(
        self,
        model_name="pyannote/speaker-diarization-3.0",
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        """
        Initializes the DiarizationPipeline with a pretrained model.

        Args:
            model_name (str): The name of the pretrained diarization model to load.
            use_auth_token (str, optional): Token to use for authentication if the model
                                            is from a private repository. Defaults to None.
            device (str or torch.device, optional): The device on which to run the model,
                                                    either "cpu" or "cuda". Defaults to "cpu".
                                                    None leaves the model where it was loaded.

        Raises:
            RuntimeError: If the pretrained pipeline could not be loaded.
        """
        if isinstance(device, str):
            device = torch.device(device)

        model = Pipeline.from_pretrained(model_name, use_auth_token=use_auth_token)
        # pyannote may hand back None instead of raising when the checkpoint
        # cannot be fetched (e.g. gated model, missing or invalid token).
        # Fail here with a clear message rather than with an AttributeError on `.to`.
        if model is None:
            raise RuntimeError(
                f"Could not load diarization pipeline '{model_name}'. Check that the "
                "access token is valid and that the model's user conditions have "
                "been accepted on the Hugging Face Hub."
            )
        if device is not None:
            model = model.to(device)
        self.model = model

    def __call__(
        self,
        audio: Union[str, np.ndarray],
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Perform diarization on the provided audio.

        Args:
            audio (str or np.ndarray): The path to the audio file or a numpy array of the waveform.
                                       An array is passed on as-is with a leading axis added and is
                                       tagged with SAMPLE_RATE, so it is assumed to be a 1-D
                                       waveform already sampled at SAMPLE_RATE.
            min_speakers (int, optional): The minimum number of speakers to assume in the diarization
                                          process. Defaults to None.
            max_speakers (int, optional): The maximum number of speakers to assume in the diarization
                                          process. Defaults to None.

        Returns:
            DataFrame: A pandas DataFrame with columns for the segment, label, speaker,
                       start time, and end time of each speaker segment.
        """
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        annotation = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)

        # Build all five columns in one pass over the tracks.
        rows = [
            (segment, label, speaker, segment.start, segment.end)
            for segment, label, speaker in annotation.itertracks(yield_label=True)
        ]
        return pd.DataFrame(rows, columns=['segment', 'label', 'speaker', 'start', 'end'])

In [None]:

def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
    """
    Assign speakers to segments of a transcript based on the results of a diarization data frame.

    For every transcript segment the overlap with each diarization segment is computed.
    The segment is labelled with the speaker whose overlapping diarization segments cover
    it the most. If nothing overlaps and `fill_nearest` is True, the speaker of the
    diarization segment closest in time is used instead; otherwise the segment is left
    without a 'speaker' key.

    Parameters:
    - diarize_df (DataFrame): A pandas DataFrame containing the diarization information
                              with columns 'start', 'end', and 'speaker'. It is only read,
                              never modified.
    - transcript_result (dict): A dictionary with a key 'chunks' that contains a list of
                                transcript segments, where each segment is a dictionary
                                with keys 'text' and 'timestamp' (a tuple with start and end times).
                                Segments whose timestamp is missing or contains None are skipped.
    - fill_nearest (bool, optional): A flag to determine whether to assign speakers to all segments
                                     based on the nearest speaker data if no direct overlap is found.
                                     Defaults to False.

    Returns:
    - dict: The same transcript_result object, updated in place with a 'speaker' entry
            on each segment that could be assigned.

    Examples of diarize_df and transcript_result structures:

    diarize_df example:
        speaker  start   end
        0        0.0     1.5
        1        1.5     3.0

    transcript_result example:
        {'chunks': [{'text': 'Hello', 'timestamp': (0.5, 1.0)},
                    {'text': 'world', 'timestamp': (1.5, 2.0)}]}

    Example usage:
    >>> diarize_df = pd.DataFrame({'speaker': [0, 1], 'start': [0.0, 1.5], 'end': [1.5, 3.0]})
    >>> transcript_result = {'chunks': [{'text': 'Hello', 'timestamp': (0.5, 1.0)},
                                         {'text': 'world', 'timestamp': (1.5, 2.0)}]}
    >>> assign_word_speakers(diarize_df, transcript_result)
    {'chunks': [{'text': 'Hello', 'timestamp': (0.5, 1.0), 'speaker': 0},
                {'text': 'world', 'timestamp': (1.5, 2.0), 'speaker': 1}]}
    """
    # Work on plain arrays so the caller's DataFrame is not given scratch columns.
    dia_starts = diarize_df['start'].to_numpy()
    dia_ends = diarize_df['end'].to_numpy()
    dia_speakers = diarize_df['speaker'].to_numpy()

    for seg in transcript_result["chunks"]:
        timestamp = seg.get("timestamp")
        # Skip segments that carry no usable (start, end) pair.
        if not timestamp or timestamp[0] is None or timestamp[1] is None:
            continue
        seg_start, seg_end = timestamp[0], timestamp[1]

        # Positive value: length of the overlap. Negative value: size of the gap.
        intersection = np.minimum(dia_ends, seg_end) - np.maximum(dia_starts, seg_start)
        overlapping = intersection > 0

        if overlapping.any():
            # Speaker with the greatest total overlap across its diarization segments.
            totals = (
                pd.DataFrame({
                    "speaker": dia_speakers[overlapping],
                    "intersection": intersection[overlapping],
                })
                .groupby("speaker")["intersection"]
                .sum()
            )
            seg["speaker"] = totals.sort_values(ascending=False).index[0]
        elif fill_nearest and intersection.size:
            # No overlap at all: the largest (least negative) value marks the
            # diarization segment with the smallest gap to this transcript segment.
            seg["speaker"] = dia_speakers[int(np.argmax(intersection))]

    return transcript_result


In [None]:
class Segment:
    """
    One stretch of audio, described by its start time, end time and an optional speaker label.

    Intended as a small container for the pieces produced by speaker diarization:
    when a stretch begins, when it ends, and who is talking.

    Attributes:
        start (float): Start of the stretch, in seconds.
        end (float): End of the stretch, in seconds.
        speaker (str, optional): Speaker label for the stretch; None when not given.
    """

    def __init__(self, start, end, speaker=None):
        """
        Store the boundaries and (optionally) the speaker of the segment.

        Args:
            start (float): Start of the stretch, in seconds.
            end (float): End of the stretch, in seconds.
            speaker (str, optional): Speaker label. Defaults to None.
        """
        self.start, self.end, self.speaker = start, end, speaker

In [None]:
from getpass import getpass

# SECURITY: never hardcode an access token in the notebook. The token that was
# previously committed here must be treated as leaked and revoked.
# Read it from the HF_TOKEN environment variable, or prompt for it interactively.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# NOTE(review): `results` and `tmp_results` are not used by any cell visible in
# this notebook; kept in case something outside this view still reads them.
results = []
tmp_results = transcription_result

# Load the pretrained diarization pipeline on the same device as the ASR model.
diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)

In [None]:
transcription_result["text"]

'a kukaako ane amanya neddoboozi gimi ya musasenkanga nga nze kyapesoni wa disitulict mumens cansel e ruweero zisanyisa okulaba mu kyala gimeya webale kugamikwano gyaffe abatruk effe mbagenda maaso nokunoonyereza ku zimu ku nsonga ezikosa obulamu bwaffe ebyenfuna baffe nebirala ebisinga kw ebyo omulungi ogwo tubadde nekibuuzo ekibadde kitambula ngekibuuzo kiri nti twagala okumanya eyo mu kitundu kyo kiki kyolowooza ekivuddeko abaana bano abali wakati wemyaka kumi esatu ne kumi emwenda okufuna embuto ensonga ezo ziruddemu nnyo ate era ozitegeerera ddala bulungi nnyo nkakasa bwe tutandika okwogera ku baana okufuna embuto mu kitundu kyo gwe nga akola ku nsonga zabakyala oteeka okuba nga ozirabye oziwulidde ozikozeeko awuliriza yali agadde okumanyaa nga bwe mbagambye envudde luweero era omulama gwe tulina olwaleeroyinza okuba sikinnyonnyola bulungi ngomukugueeza omwana atanetuuka nolubutoemyaka gye tusinze okulaba okuviira ddala ku myaka ogambye kumi nesatukmeskdnabakkumi nogumu kkumi nebi

In [None]:
# Diarize the same recording that was transcribed above; with both speaker
# bounds left as None no speaker-count constraint is passed to the model.
diarize_segments = diarize_model("/content/poll9_30_sec.wav", min_speakers=None, max_speakers=None)

In [None]:
diarize_segments

Unnamed: 0,segment,label,speaker,start,end
0,[ 00:00:00.008 --> 00:00:05.441],1,SPEAKER_01,0.008489,5.441426
1,[ 00:00:00.534 --> 00:00:01.146],0,SPEAKER_00,0.534805,1.14601
2,[ 00:00:02.758 --> 00:00:03.132],0,SPEAKER_00,2.758913,3.132428
3,[ 00:00:03.641 --> 00:00:03.794],0,SPEAKER_00,3.641766,3.794567
4,[ 00:00:03.930 --> 00:00:03.998],0,SPEAKER_00,3.93039,3.998302
5,[ 00:00:05.882 --> 00:00:07.886],1,SPEAKER_01,5.882852,7.886248
6,[ 00:00:08.327 --> 00:00:12.470],1,SPEAKER_01,8.327674,12.470289
7,[ 00:00:13.132 --> 00:00:16.290],0,SPEAKER_00,13.132428,16.290323
8,[ 00:00:16.663 --> 00:00:19.142],0,SPEAKER_00,16.663837,19.142615
9,[ 00:00:19.601 --> 00:00:35.271],0,SPEAKER_00,19.601019,35.271647


In [None]:
# Label each transcript chunk with a speaker. The function updates
# `transcription_result` in place and returns that same dict, so `output`
# and `transcription_result` refer to one object.
output = assign_word_speakers(diarize_segments, transcription_result)

In [None]:
output

{'text': 'a kukaako ane amanya neddoboozi gimi ya musasenkanga nga nze kyapesoni wa disitulict mumens cansel e ruweero zisanyisa okulaba mu kyala gimeya webale kugamikwano gyaffe abatruk effe mbagenda maaso nokunoonyereza ku zimu ku nsonga ezikosa obulamu bwaffe ebyenfuna baffe nebirala ebisinga kw ebyo omulungi ogwo tubadde nekibuuzo ekibadde kitambula ngekibuuzo kiri nti twagala okumanya eyo mu kitundu kyo kiki kyolowooza ekivuddeko abaana bano abali wakati wemyaka kumi esatu ne kumi emwenda okufuna embuto ensonga ezo ziruddemu nnyo ate era ozitegeerera ddala bulungi nnyo nkakasa bwe tutandika okwogera ku baana okufuna embuto mu kitundu kyo gwe nga akola ku nsonga zabakyala oteeka okuba nga ozirabye oziwulidde ozikozeeko awuliriza yali agadde okumanyaa nga bwe mbagambye envudde luweero era omulama gwe tulina olwaleeroyinza okuba sikinnyonnyola bulungi ngomukugueeza omwana atanetuuka nolubutoemyaka gye tusinze okulaba okuviira ddala ku myaka ogambye kumi nesatukmeskdnabakkumi nogumu k

In [None]:
# todo
# - Measure accuracy
# - M

In [None]:
# Transcribe Acholi
# NOTE(review): the saved output of this cell is "TypeError: ignored"; the cause
# is not visible in the notebook and needs investigating before relying on it.
# This cell also rebinds `input_file_path`, `target_language` and
# `transcription_result` used by earlier cells.
input_file_path = "/content/acholi_10_d2a408c7-0133-4924-acd0-bb9c568294b7 (1).ogg"
target_language = "ach"
transcription_result = transcribe_audio(input_file_path, target_language, device)
print(transcription_result)

TypeError: ignored

In [None]:
# Transcribe Ateso
# Rebinds `input_file_path`, `target_language` and `transcription_result`.
input_file_path = "/content/ateso_1045_18a4b777-ad48-4761-b193-ca0996698ef0.ogg"
target_language = "teo"
transcription_result = transcribe_audio(input_file_path, target_language, device)
print(transcription_result)

{'text': 'abu ngesi oyanga apapulai nu mam adolitos nuda apotu ijaikisi egirio alipan ke na elouni'}


In [None]:
# Transcribe Luganda
# Rebinds `input_file_path`, `target_language` and `transcription_result`.
input_file_path = "/content/luganda_1005_7.ogg"
target_language = "lug"
transcription_result = transcribe_audio(input_file_path, target_language, device)
print(transcription_result)

Downloading (…)pter.lug.safetensors:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

{'text': 'kati kifuuse kya bulijjo bannabyabufuzi okukuubagana ku bifo'}


---