<a href="https://colab.research.google.com/github/Shimmer0523/voice-diarizer/blob/main/voice_diarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pyannote.audio, pydub and noisereduce, which the import cell below relies on.
# NOTE(review): moviepy, soundfile, transformers, torch and torchaudio are imported too
# but not installed here - presumably they are preinstalled on the runtime; confirm.
!pip install -q pyannote.audio pydub noisereduce

In [None]:
import os
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook
from pyannote.core import Segment, Annotation
from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
import numpy as np
from pydub import AudioSegment
import noisereduce as nr
import soundfile as sf
from moviepy.editor import VideoFileClip
import torch
import torchaudio
from google.colab import userdata

In [None]:
def mp4_to_audio(mp4_file):
    """Extract the audio track of an mp4 file into a 16-bit PCM wav file.

    The wav file is written to the current working directory and is named
    ``audio_<stem>.wav``, where ``<stem>`` is the base name of ``mp4_file``
    without its extension.

    Args:
        mp4_file (string): file path of mp4 file

    Returns:
        string: file name of the exported wav file

    Raises:
        ValueError: if the video has no audio track
    """
    video = VideoFileClip(mp4_file)
    try:
        audio = video.audio
        if audio is None:
            # Without this check the failure would be an opaque AttributeError.
            raise ValueError("no audio track found in: " + mp4_file)

        file_body = os.path.splitext(os.path.basename(mp4_file))[0]
        audio_file = "audio_" + file_body + ".wav"
        print("export: " + audio_file)
        try:
            audio.write_audiofile(audio_file, codec="pcm_s16le")
        finally:
            audio.close()
    finally:
        # Release the clip even when the export fails part-way.
        video.close()

    return audio_file


class Section:
    """A span of the recording to denoise, paired with the span sampled as noise.

    All four values are positions in seconds; ``reduce_noise`` multiplies them
    by the sample rate to obtain sample offsets.
    """

    def __init__(self, section_start, section_end, noise_start, noise_end) -> None:
        # Span of the recording that gets denoised.
        self.section_start, self.section_end = section_start, section_end
        # Span whose content is handed to the denoiser as the noise sample.
        self.noise_start, self.noise_end = noise_start, noise_end


def reduce_noise(waveform: np.ndarray, sample_rate, sections: "list[Section]") -> torch.Tensor:
    """Denoise the given sections of a waveform and return the result as a tensor.

    Each section is denoised with stationary noise reduction, using the span
    ``noise_start``..``noise_end`` of the *original* waveform as the noise
    sample. Samples outside every section are passed through unchanged.

    Args:
        waveform: audio samples, indexed by sample along the first axis.
        sample_rate: samples per second, used to convert seconds to offsets.
        sections: spans to denoise. Positions are in seconds and follow Python
            slice semantics: a negative value counts back from the end of the
            waveform and ``None`` leaves that side of the span open.

    Returns:
        torch.Tensor: the waveform with every section replaced by its
        denoised version.
    """
    total = len(waveform)

    def to_samples(seconds):
        # Slice bounds must be integers; seconds * sample_rate is usually a float.
        return None if seconds is None else int(round(seconds * sample_rate))

    output = waveform
    for s in sections:
        # Resolve negative / open bounds into concrete offsets so the head,
        # the denoised body and the tail are cut at exactly the same places.
        start, stop, _ = slice(
            to_samples(s.section_start), to_samples(s.section_end)
        ).indices(total)
        if stop <= start:
            # Nothing to denoise in an empty span.
            continue

        y = waveform[start:stop]
        y_noise = waveform[to_samples(s.noise_start) : to_samples(s.noise_end)]
        denoised = nr.reduce_noise(y=y, y_noise=y_noise, sr=sample_rate, stationary=True)

        # Splice into the accumulated output (not the untouched input) so that
        # the work done for earlier sections is kept.
        output = np.concatenate((output[:start], denoised, output[stop:]))
    return torch.from_numpy(output)


In [None]:
# Input selection: "AUDIO" reads AUDIO_FILE as is, "MP4" first extracts the
# audio track of MP4_FILE.
MODE = "AUDIO"
AUDIO_FILE = "g22.wav"
# Spans to denoise and the spans sampled as noise, all in seconds.
SECTIONS = [
    Section(
        section_start=0,
        section_end=-0.1,
        noise_start=0,
        noise_end=0.1,
    ),
]

In [None]:
# Example configuration for MODE = "MP4" (uncomment and adjust):
# MP4_FILE = "bb02.mp4"
# SECTIONS = [
#         Section(section_start=0, section_end=35, noise_start=1, noise_end=3),
#         Section(section_start=35, section_end=-1, noise_start=69, noise_end=70),
#     ]


In [None]:
# Load the pretrained diarization pipeline; the Hugging Face token is read
# from the Colab user data store.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1", use_auth_token=userdata.get('HUGGINGFACE_TOKEN')
)
# Fall back to the CPU so the notebook still runs on a runtime without a GPU.
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

if MODE == "MP4":
    audio_file = mp4_to_audio(MP4_FILE)
else:
    audio_file = AUDIO_FILE

# Work on a mono down-mix of the recording.
audio = AudioSegment.from_wav(audio_file).set_channels(1)
waveform = np.array(audio.get_array_of_samples())
sampling_rate = audio.frame_rate
# Magnitude of a full-scale integer PCM sample, used to rescale to [-1, 1].
full_scale = float(1 << (8 * audio.sample_width - 1))

denoised_waveform = reduce_noise(
    waveform,
    sampling_rate,
    sections=SECTIONS,
)

# pyannote takes in-memory audio as a float (channel, time) tensor, while the
# denoised signal is a flat array of integer-range PCM values.
pipeline_waveform = denoised_waveform.to(torch.float32) / full_scale
if pipeline_waveform.ndim == 1:
    pipeline_waveform = pipeline_waveform.unsqueeze(0)

with ProgressHook() as hook:
    diarization: Annotation = pipeline(
        {"waveform": pipeline_waveform, "sample_rate": sampling_rate}, num_speakers=2, hook=hook
    )

for segment, track_name, label in diarization.itertracks(yield_label=True):
    print(f"{segment.start=:.1f}, {segment.end=:.1f}, {track_name=}, {label=}")

remaining_speakers = ["SPEAKER_00", "SPEAKER_01"]

# Export one file per speaker in which every turn of the other speaker(s)
# is replaced by silence.
for speaker in remaining_speakers:
    print("open: " + audio_file)
    audio = AudioSegment.from_wav(audio_file)
    audio_length = len(audio)  # pydub lengths and slice positions are in milliseconds

    for segment, track_name, label in diarization.itertracks(yield_label=True):
        if label == speaker:
            continue

        # Whole milliseconds clamped to the recording, so that a turn reported
        # past the end of the file cannot lengthen the export.
        mute_start = min(max(int(round(segment.start * 1000)), 0), audio_length)
        mute_end = min(max(int(round(segment.end * 1000)), 0), audio_length)
        if mute_end <= mute_start:
            continue

        muted_section = AudioSegment.silent(duration=(mute_end - mute_start), frame_rate=sampling_rate)
        audio = audio[:mute_start] + muted_section + audio[mute_end:]
        print(f"mute: {mute_start} - {mute_end}")

    export_file = "muted_" + speaker + "_" + os.path.basename(audio_file)

    print("export: " + export_file)
    audio.export(export_file, format="wav")
