In [1]:
from pyannote.audio import Pipeline
import  torch

In [15]:
from pyannote.audio.pipelines.speaker_diarization import *

In [16]:
class SpeakerDiarization(SpeakerDiarizationMixin, Pipeline):
    """Speaker diarization pipeline whose stages can be run one at a time.

    The stages are exposed as separate methods: segmentation, embedding
    extraction, clustering, reconstruction of the clustered segmentation and
    conversion to an annotation. Each stage also has a thin ``apply_*``
    wrapper so that a caller can drive the stages step by step.
    """

    def __init__(
        self,
        segmentation: PipelineModel = "pyannote/segmentation@2022.07",
        segmentation_step: float = 0.1,
        embedding: PipelineModel = "speechbrain/spkrec-ecapa-voxceleb@5c0be3875fda05e81f3c004ed8c7c06be308de1e",
        embedding_exclude_overlap: bool = False,
        clustering: str = "AgglomerativeClustering",
        embedding_batch_size: int = 1,
        segmentation_batch_size: int = 1,
        der_variant: Optional[dict] = None,
        use_auth_token: Union[Text, None] = None,
    ):
        """Load the models and declare the tunable hyper-parameters.

        Parameters
        ----------
        segmentation
            Segmentation model, resolved through ``get_model``.
        segmentation_step
            Step of the segmentation sliding window, expressed as a ratio of
            the model's chunk duration.
        embedding
            Speaker embedding model, loaded with ``PretrainedSpeakerEmbedding``
            (not loaded when ``clustering`` is ``"OracleClustering"``).
        embedding_exclude_overlap
            Value forwarded as ``exclude_overlap`` by ``apply_embedding``.
        clustering
            Name of a member of the ``Clustering`` enumeration.
        embedding_batch_size
            Number of (waveform, mask) pairs embedded per batch.
        segmentation_batch_size
            Batch size handed to the segmentation ``Inference``.
        der_variant
            Stored as ``self.der_variant``; defaults to
            ``{"collar": 0.0, "skip_overlap": False}``.
        use_auth_token
            Forwarded to ``get_model`` and ``PretrainedSpeakerEmbedding``.

        Raises
        ------
        ValueError
            If ``clustering`` is not a member of ``Clustering``.
        """
        super().__init__()

        self.segmentation_model = segmentation
        model: Model = get_model(segmentation, use_auth_token=use_auth_token)

        self.segmentation_step = segmentation_step

        self.embedding = embedding
        self.embedding_batch_size = embedding_batch_size
        self.embedding_exclude_overlap = embedding_exclude_overlap

        # Keep the clustering *name* under a different attribute: `self.clustering`
        # is assigned the clustering object at the end of this method.
        self.klustering = clustering

        self.der_variant = der_variant or {"collar": 0.0, "skip_overlap": False}

        segmentation_duration = model.specifications.duration
        self._segmentation = Inference(
            model,
            duration=segmentation_duration,
            # the step is given as a ratio, Inference takes it in the unit of `duration`
            step=self.segmentation_step * segmentation_duration,
            skip_aggregation=True,
            batch_size=segmentation_batch_size,
        )

        # Powerset models do not get a `threshold` hyper-parameter.
        if self._segmentation.model.specifications.powerset:
            self.segmentation = ParamDict(
                min_duration_off=Uniform(0.0, 1.0),
            )

        else:
            self.segmentation = ParamDict(
                threshold=Uniform(0.1, 0.9),
                min_duration_off=Uniform(0.0, 1.0),
            )

        if self.klustering == "OracleClustering":
            # no embedding model is loaded in this mode
            metric = "not_applicable"

        else:
            self._embedding = PretrainedSpeakerEmbedding(
                self.embedding, use_auth_token=use_auth_token
            )
            # audio is read at the sample rate of the embedding model
            self._audio = Audio(sample_rate=self._embedding.sample_rate, mono="downmix")
            metric = self._embedding.metric

        try:
            Klustering = Clustering[clustering]
        except KeyError:
            raise ValueError(
                f'clustering must be one of [{", ".join(list(Clustering.__members__))}]'
            )
        self.clustering = Klustering.value(metric=metric)

    @property
    def segmentation_batch_size(self) -> int:
        """Batch size currently used by the segmentation inference object."""
        return self._segmentation.batch_size

    @segmentation_batch_size.setter
    def segmentation_batch_size(self, batch_size: int):
        """Update the batch size of the segmentation inference object."""
        self._segmentation.batch_size = batch_size

    def get_segmentations(self, file, hook=None) -> SlidingWindowFeature:
        """Apply the segmentation model to `file`.

        When `hook` is given it is wrapped so that every call made by the
        inference object is prefixed with ``("segmentation", None)``.
        """
        if hook is not None:
            hook = functools.partial(hook, "segmentation", None)

        segmentations: SlidingWindowFeature = self._segmentation(file, hook=hook)
        return segmentations

    def get_embeddings(
        self,
        file,
        binary_segmentations: SlidingWindowFeature,
        exclude_overlap: bool = False,
        hook: Optional[Callable] = None,
    ):
        """Extract one embedding for each (chunk, speaker) pair.

        Parameters
        ----------
        file
            Audio file the chunks are cropped from.
        binary_segmentations
            Binarized segmentation whose data is
            (num_chunks, num_frames, num_speakers)-shaped.
        exclude_overlap
            When True, frames where two or more speakers are active are masked
            out, unless too few frames would remain for that speaker.
        hook
            Optional progress callback, called as
            ``hook("embeddings", batch_or_None, total=..., completed=...)``.

        Returns
        -------
        np.ndarray
            Embeddings rearranged to (num_chunks, num_speakers, dimension).
        """
        duration = binary_segmentations.sliding_window.duration
        num_chunks, num_frames, num_speakers = binary_segmentations.data.shape

        if exclude_overlap:
            # Convert the embedding model's minimum number of samples into a
            # minimum number of segmentation frames.
            min_num_samples = self._embedding.min_num_samples
            num_samples = duration * self._embedding.sample_rate
            min_num_frames = math.ceil(num_frames * min_num_samples / num_samples)

            # 1.0 where fewer than two speakers are active, 0.0 otherwise
            clean_frames = 1.0 * (
                np.sum(binary_segmentations.data, axis=2, keepdims=True) < 2
            )
            clean_segmentations = SlidingWindowFeature(
                binary_segmentations.data * clean_frames,
                binary_segmentations.sliding_window,
            )
        else:
            # -1 makes the `> min_num_frames` test below always succeed, so the
            # (unmodified) clean mask is always the one used.
            min_num_frames = -1
            clean_segmentations = SlidingWindowFeature(
                binary_segmentations.data, binary_segmentations.sliding_window
            )

        def iter_waveform_and_mask():
            """Yield a (waveform, mask) pair for every speaker of every chunk."""
            for (chunk, masks), (_, clean_masks) in zip(
                binary_segmentations, clean_segmentations
            ):
                waveform, _ = self._audio.crop(
                    file,
                    chunk,
                    duration=duration,
                    mode="pad",
                )
                masks = np.nan_to_num(masks, nan=0.0).astype(np.float32)
                clean_masks = np.nan_to_num(clean_masks, nan=0.0).astype(np.float32)

                # transposing iterates over speakers rather than frames
                for mask, clean_mask in zip(masks.T, clean_masks.T):
                    # fall back to the full mask when the clean one is too short
                    if np.sum(clean_mask) > min_num_frames:
                        used_mask = clean_mask
                    else:
                        used_mask = mask
                    yield waveform[None], torch.from_numpy(used_mask)[None]

        batches = batchify(
            iter_waveform_and_mask(),
            batch_size=self.embedding_batch_size,
            fillvalue=(None, None),
        )

        batch_count = math.ceil(num_chunks * num_speakers / self.embedding_batch_size)
        embedding_batches = []

        if hook is not None:
            hook("embeddings", None, total=batch_count, completed=0)

        for i, batch in enumerate(batches, 1):
            # drop the (None, None) padding added by batchify to the last batch
            waveforms, masks = zip(*filter(lambda b: b[0] is not None, batch))
            waveform_batch = torch.vstack(waveforms)
            mask_batch = torch.vstack(masks)

            embedding_batch: np.ndarray = self._embedding(
                waveform_batch, masks=mask_batch
            )
            embedding_batches.append(embedding_batch)

            if hook is not None:
                hook("embeddings", embedding_batch, total=batch_count, completed=i)

        embedding_batches = np.vstack(embedding_batches)
        embeddings = rearrange(embedding_batches, "(c s) d -> c s d", c=num_chunks)
        return embeddings

    def cluster_embeddings(
        self,
        embeddings: np.ndarray,
        binarized_segmentations: SlidingWindowFeature,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        file: Optional[AudioFile] = None,
    ):
        """Cluster the embeddings with the configured clustering object.

        The speaker bounds are forwarded as ``num_clusters``, ``min_clusters``
        and ``max_clusters``.

        Returns
        -------
        tuple
            ``(hard_clusters, centroids)``; the second value returned by the
            clustering object is discarded.
        """
        hard_clusters, _, centroids = self.clustering(
            embeddings=embeddings,
            segmentations=binarized_segmentations,
            num_clusters=num_speakers,
            min_clusters=min_speakers,
            max_clusters=max_speakers,
            file=file,
            frames=self._segmentation.model.receptive_field,
        )
        return hard_clusters, centroids

    def reconstruct_diarization(
        self,
        segmentations: SlidingWindowFeature,
        hard_clusters: np.ndarray,
        count: SlidingWindowFeature,
    ) -> SlidingWindowFeature:
        """Reconstruct diarization from clustered segmentation.

        For every chunk, the activations of all local speakers assigned to the
        same cluster are merged with a frame-wise maximum. Clusters that do not
        appear in a chunk stay NaN for that chunk. The result is passed to
        ``to_diarization`` together with `count`.
        """
        num_chunks, num_frames, _ = segmentations.data.shape
        num_clusters = np.max(hard_clusters) + 1

        # `np.nan` is the canonical spelling and works across NumPy 1.x and 2.x.
        clustered_segmentations = np.full(
            (num_chunks, num_frames, num_clusters), np.nan
        )

        for c, (cluster, (chunk, segmentation)) in enumerate(
            zip(hard_clusters, segmentations)
        ):
            for k in np.unique(cluster):
                # -2 is skipped: it is treated as "not assigned to any cluster"
                if k == -2:
                    continue
                clustered_segmentations[c, :, k] = np.max(
                    segmentation[:, cluster == k], axis=1
                )

        clustered_segmentations = SlidingWindowFeature(
            clustered_segmentations, segmentations.sliding_window
        )
        return self.to_diarization(clustered_segmentations, count)

    def to_annotation(
        self,
        discrete_diarization: SlidingWindowFeature,
        file: AudioFile,
    ) -> Annotation:
        """Convert discrete diarization to an annotation carrying the file's uri.

        Parameters
        ----------
        discrete_diarization
            Output of ``reconstruct_diarization``.
        file
            Must support ``file["uri"]``; the value is copied to the result.
        """
        # This method overrides the inherited `to_annotation` with a different
        # signature, so the conversion itself has to go through `super()`:
        # calling `self.to_annotation(...)` here would re-enter this override
        # and fail on the `min_duration_*` keywords.
        diarization = super().to_annotation(
            discrete_diarization,
            min_duration_on=0.0,
            min_duration_off=self.segmentation.min_duration_off,
        )
        diarization.uri = file["uri"]
        return diarization

    def apply_segmentation(self, file: AudioFile, hook: Optional[Callable] = None):
        """Stage wrapper around ``get_segmentations``."""
        return self.get_segmentations(file, hook)

    def apply_embedding(
        self,
        file: AudioFile,
        binarized_segmentations: SlidingWindowFeature,
        hook: Optional[Callable] = None,
    ):
        """Stage wrapper around ``get_embeddings``.

        Uses ``self.embedding_exclude_overlap`` as the ``exclude_overlap`` flag.
        """
        return self.get_embeddings(
            file,
            binarized_segmentations,
            exclude_overlap=self.embedding_exclude_overlap,
            hook=hook,
        )

    def apply_clustering(
        self,
        embeddings: np.ndarray,
        binarized_segmentations: SlidingWindowFeature,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        file: Optional[AudioFile] = None,
    ):
        """Stage wrapper around ``cluster_embeddings``."""
        return self.cluster_embeddings(
            embeddings,
            binarized_segmentations,
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers,
            file=file,
        )

    def apply_reconstruction(
        self,
        segmentations: SlidingWindowFeature,
        hard_clusters: np.ndarray,
        count: SlidingWindowFeature,
    ):
        """Stage wrapper around ``reconstruct_diarization``."""
        return self.reconstruct_diarization(segmentations, hard_clusters, count)

    def apply_annotation(
        self,
        discrete_diarization: SlidingWindowFeature,
        file: AudioFile,
    ):
        """Stage wrapper around ``to_annotation``."""
        return self.to_annotation(discrete_diarization, file)


In [17]:
from pathlib import Path
import random

def ls(self):
    """
    List the contents of the directory in a deterministic (sorted) order.

    Returns:
        List[Path]: The Paths found directly inside the directory, sorted.

    Raises:
        NotADirectoryError: If the path is not a directory.
    """
    if not self.is_dir():
        raise NotADirectoryError(f"{self} is not a directory")

    # iterdir() yields entries in arbitrary order; sorting keeps positional
    # indexing of the result reproducible from one run to the next.
    return sorted(self.iterdir())
# Attach the helper to Path so it can be called as `some_path.ls()`.
Path.ls = ls

from app.services.audio.audio import AudioSlicer
import io


In [18]:
# Directory holding the audio files used in this notebook (absolute, machine-specific path).
audio_path = Path('/home/dima/ssd/1/audio')
# Pick the entry at index 7 of the listing returned by Path.ls();
# which file that is depends on the order of that listing.
path = audio_path.ls()[7]
# Alternative input, kept for reference:
#path = audio_path/'test.webm'


In [19]:
# Second positional argument of AudioSlicer.from_ffmpeg_slice in the next cell
# (presumably the index or offset of the slice to cut; confirm against AudioSlicer).
n = 0

In [20]:

# Cut a slice out of `path`; the call receives (path, n, 2).
# NOTE(review): presumably `n` selects where the slice starts and 2 is its length
# in seconds; confirm against AudioSlicer.from_ffmpeg_slice.
audio_slicer = await AudioSlicer.from_ffmpeg_slice(path, n, 2)
# Duration of the slice, read from the slicer's audio object.
slice_duration = audio_slicer.audio.duration_seconds
# Export the slice; the result is what the diarization cell below consumes.
# NOTE(review): the type returned by export_data() is not visible from this notebook.
audio_data = await audio_slicer.export_data()

In [21]:
# Display the slicer's audio object as the cell output.
audio_slicer.audio

In [22]:
# Display the duration of the slice computed above.
slice_duration

2.04

In [25]:
pipeline = SpeakerDiarization()
# NOTE(review): the hyper-parameters declared in SpeakerDiarization.__init__
# (`segmentation`, `clustering`) are never given values in this cell; presumably
# `pipeline.instantiate(...)` is needed before steps 5 and 7 can run. Confirm.

# Normalise the exported slice once so that every step below receives the same
# file object. `file_chunk` was previously used without ever being defined.
# validate_file also provides the "uri" key that step 7 reads.
file_chunk = Audio.validate_file(audio_data)

# Step 1: Apply segmentation to a chunk of audio
segmentations = pipeline.apply_segmentation(file_chunk)

# Step 2: Binarize segmentations
binarized_segmentations = binarize(segmentations, onset=0.5, initial_state=False)

# Step 3: Estimate the number of speakers
count = pipeline.speaker_count(binarized_segmentations, pipeline._segmentation.model.receptive_field)

# Step 4: Extract embeddings from the binarized segmentations
embeddings = pipeline.apply_embedding(file_chunk, binarized_segmentations)

# Step 5: Apply clustering to the embeddings
hard_clusters, centroids = pipeline.apply_clustering(embeddings, binarized_segmentations, num_speakers=2)

# Step 6: Reconstruct the diarization from the clustered segmentations
discrete_diarization = pipeline.apply_reconstruction(segmentations, hard_clusters, count)

# Step 7: Convert the discrete diarization to an annotation
annotation = pipeline.apply_annotation(discrete_diarization, file_chunk)

# The `annotation` now contains the speaker diarization result for the audio chunk


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.2.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.


ImportError: 'speechbrain' must be installed to use 'speechbrain/spkrec-ecapa-voxceleb@5c0be3875fda05e81f3c004ed8c7c06be308de1e' embeddings. Visit https://speechbrain.github.io for installation instructions.