In [None]:
pip install transformers bitsandbytes accelerate librosa pydub pyannote.audio whisperx jiwer

In [None]:
!bash

In [None]:
cp /content/drive/MyDrive/whisper/alignment.py /usr/local/lib/python3.10/dist-packages/whisperx/alignment.py

In [None]:
pip install --target='/content/drive/MyDrive/whisper_model/env' transformers whisperx bitsandbytes pyannote.audio jiwer pydub accelerate

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/whisper_model/env')

In [8]:
import os
from pydub import AudioSegment
from faster_whisper import WhisperModel
import whisperx
from pyannote.audio import Pipeline
import torch
import numpy as np

def perform_vad_on_full_audio(input_audio, vad_pipeline, target_sampling_rate=16000):
    print("Performing VAD on full audio")
    audio = AudioSegment.from_wav(input_audio)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    vad_pipeline = vad_pipeline.to(device)

    # 오디오 세그맨트를 넘파이로 바꿔준다.
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)

    # 형태를 (채널, 시간) 형식으로 만든다.
    if audio.channels == 2:
        samples = samples.reshape(-1, 2).T
    else:
        samples = samples.reshape(1, -1)

    # 들어가는 datatype이 tensor가 되어야 되기 때문에 바꿔준다.
    waveform = torch.from_numpy(samples)

    try:
        vad = vad_pipeline({"waveform": waveform, "sample_rate": audio.frame_rate})
    except Exception as e:
        print(f"Error performing VAD: {e}")
        return None

    speech_segments = vad.get_timeline().support()
    print(f"Detected {len(speech_segments)} speech segments in full audio.")

    speech_audio = AudioSegment.empty()
    for segment in speech_segments:
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)
        speech_audio += audio[start_ms:end_ms]

    speech_audio = speech_audio.set_frame_rate(target_sampling_rate).set_channels(1).set_sample_width(2)

    return speech_audio

def split_audio(audio, chunk_duration=600000):  # 600000 ms = 10 minutes
    print("Splitting audio into 10-minute chunks")
    duration = len(audio)
    chunks = [audio[i:i+chunk_duration] for i in range(0, duration, chunk_duration)]
    print(f"Audio split into {len(chunks)} chunks")
    return chunks

def perform_vad_on_chunk(audio_chunk, vad_pipeline):
    print("Performing VAD on chunk")
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    vad_pipeline = vad_pipeline.to(device)

    # 오디오 세그맨트를 넘파이로 바꿔준다
    samples = np.array(audio_chunk.get_array_of_samples()).astype(np.float32)

   # 형태를 (채널, 시간) 형식으로 만든다.
    if audio_chunk.channels == 2:
        samples = samples.reshape(-1, 2).T
    else:
        samples = samples.reshape(1, -1)

   # 들어가는 datatype이 tensor가 되어야 되기 때문에 바꿔준다.
    waveform = torch.from_numpy(samples)

    try:
        vad = vad_pipeline({"waveform": waveform, "sample_rate": audio_chunk.frame_rate})
    except Exception as e:
        print(f"Error performing VAD on chunk: {e}")
        return None

    speech_segments = vad.get_timeline().support()
    print(f"Detected {len(speech_segments)} speech segments in chunk.")

    speech_audio = AudioSegment.empty()
    for segment in speech_segments:
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)
        speech_audio += audio_chunk[start_ms:end_ms]

    return speech_audio

def transcribe_audio(model, audio_segment):
    print("Starting ASR transcription")
    try:
        # 오디오 세그맨트를 넘파이로 바꿔준다
        audio_array = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / 32768.0
        segments, _ = model.transcribe(audio_array, beam_size=5)
        segments = list(segments)
        print("ASR transcription completed")
        transcription_segments = [{"start": seg.start, "end": seg.end, "text": seg.text} for seg in segments]
        return transcription_segments
    except Exception as e:
        print(f"Error during ASR transcription: {e}")
        return None

def post_process_whisperx(transcription_segments, audio_segment, align_model, metadata, device='cuda'):
    print("Starting WhisperX post-processing")
    try:
        # 오디오 세그맨트를 넘파이로 바꿔준다
        audio_array = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / 32768.0
        result_aligned = whisperx.align(transcription_segments, align_model, metadata, audio_array, device=device)
        if isinstance(result_aligned, dict):
            aligned_segments = result_aligned.get("segments", [])
        else:
            print("Unknown return format from WhisperX")
            aligned_segments = []

        print(f"Aligned {len(aligned_segments)} segments.")
        print("WhisperX post-processing completed")
        return aligned_segments
    except Exception as e:
        print(f"Error during WhisperX post-processing: {e}")
        return None

def perform_diarization(vad_audio, pipeline, num_speakers):
    print("Starting speaker diarization")
    try:
        # 오디오 세그맨트를 넘파이로 바꿔준다
        audio_array = np.array(vad_audio.get_array_of_samples()).astype(np.float32) / 32768.0

        # 형태를 (채널, 시간) 형식으로 만든다.
        if vad_audio.channels == 2:
            audio_array = audio_array.reshape(-1, 2).T
        else:
            audio_array = audio_array.reshape(1, -1)

        diarization = pipeline({"waveform": torch.from_numpy(audio_array), "sample_rate": vad_audio.frame_rate}, num_speakers=num_speakers)
        print("Speaker diarization completed")
        return diarization
    except Exception as e:
        print(f"Error during speaker diarization: {e}")
        return None

def match_speaker_to_segments(diarization, transcription_segments):
    print("Matching speakers to transcription segments")
    matched_segments = []

    for segment in transcription_segments:
        midpoint = (segment['start'] + segment['end']) / 2
        speaker_found = False
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            if turn.start <= midpoint <= turn.end:
                matched_segments.append((segment['start'], segment['end'], f"참여자{speaker[7:]}", segment['text']))
                speaker_found = True
                break
        if not speaker_found:
            matched_segments.append((segment['start'], segment['end'], "알 수 없음", segment['text']))

    matched_segments.sort(key=lambda x: x[0])
    print("Speaker matching completed")
    return matched_segments

def save_transcriptions(matched_segments, output_file):
    print(f"Saving transcriptions to {output_file}")
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for segment in matched_segments:
                start_time, end_time, speaker, text = segment
                f.write(f"{speaker}: {text}\n")
        print(f"Transcriptions saved to '{output_file}'")
    except Exception as e:
        print(f"Error saving transcriptions: {e}")

def process_chunk(chunk, whisper_model, align_model, metadata, vad_pipeline):
    print(f"Processing chunk")

    # VAD로 오디오 파일 전처리를 한다.( 빈 음성 부분을 제거한다. )
    vad_audio = perform_vad_on_chunk(chunk, vad_pipeline)
    if vad_audio is None:
        return None

    # ASR 전사처리를 한다.
    transcription_segments = transcribe_audio(whisper_model, vad_audio)
    if transcription_segments is None:
        return None

    # WhisperX로 후처리한다.
    aligned_segments = post_process_whisperx(transcription_segments, vad_audio, align_model, metadata)
    if aligned_segments is None:
        return None

    return aligned_segments

def main(input_audio, num_speakers, output_file, device='cuda'):
    print("Loading models")
    compute_type = "float16" if device == 'cuda' else "float32"
    whisper_model = WhisperModel("large-v2", device=device, compute_type=compute_type)

    align_model, metadata = whisperx.load_align_model(language_code='ko', device=device)

    vad_pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token='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'
    )

    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token='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'
    )
    diarization_pipeline.to(torch.device(device))

    # 전체 오디오 파일을 VAD한 파일을 이용해서 화자분리를 진행한다.
    vad_audio = perform_vad_on_full_audio(input_audio, vad_pipeline)
    if vad_audio is None:
        print("VAD on full audio failed. Exiting.")
        return


    diarization = perform_diarization(vad_audio, diarization_pipeline, num_speakers)
    if diarization is None:
        print("Diarization failed. Exiting.")
        return

    # 청크로 나눈다.
    chunks = split_audio(vad_audio)

    all_aligned_segments = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}")
        chunk_segments = process_chunk(chunk, whisper_model, align_model, metadata, vad_pipeline)
        if chunk_segments:
            chunk_duration = len(chunk) / 1000  # chunk duration in seconds
            for seg in chunk_segments:
                seg['start'] += i * chunk_duration
                seg['end'] += i * chunk_duration
            all_aligned_segments.extend(chunk_segments)

    matched_segments = match_speaker_to_segments(diarization, all_aligned_segments)

    save_transcriptions(matched_segments, output_file)

    print("Processing completed")

    def Sample(output_text_file):
      data = open()

if __name__ == "__main__":
    input_audio_file = "/content/drive/MyDrive/whisper/20240925-회의록.wav"
    num_speakers = 4
    output_text_file = "/content/drive/MyDrive/whisper/20240925-회의록노트1.txt"

    main(
        input_audio=input_audio_file,
        num_speakers=num_speakers,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        output_file=output_text_file
    )
#저장된 회의록 텍스트 파일을 눈으로 결과물 확인을 위해 읽어온다.
#read를 통해서 문자열로 불러온다.
data = open('/content/drive/MyDrive/whisper/20240925-회의록노트1.txt', 'r', encoding="UTF8")
contents = data.read()
print(contents)
data.close()

Loading models


Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.4.1+cu121. Bad things might happen unless you revert torch to 1.x.
Performing VAD on full audio
Detected 198 speech segments in full audio.
Starting speaker diarization


  std = sequences.std(dim=-1, correction=1)


Speaker diarization completed
Splitting audio into 10-minute chunks
Audio split into 2 chunks
Processing chunk 1/2
Processing chunk
Performing VAD on chunk
Detected 13 speech segments in chunk.
Starting ASR transcription
ASR transcription completed
Starting WhisperX post-processing




Aligned 226 segments.
WhisperX post-processing completed
Processing chunk 2/2
Processing chunk
Performing VAD on chunk
Detected 8 speech segments in chunk.
Starting ASR transcription
ASR transcription completed
Starting WhisperX post-processing




Aligned 135 segments.
WhisperX post-processing completed
Matching speakers to transcription segments
Speaker matching completed
Saving transcriptions to /content/drive/MyDrive/whisper/20240925-회의록노트1.txt
Transcriptions saved to '/content/drive/MyDrive/whisper/20240925-회의록노트1.txt'
Processing completed
알 수 없음:  근데 아무리 봐도 멘토님이 정해주신 거 그렇게 생각이 안 나
참여자_02:  뭐 정해주셨지?
참여자_02:  회의록?
참여자_03:  회의록 그거 막 그냥 그냥
알 수 없음:  아 회의록
참여자_02:  쓸만한 게
참여자_02:  아 회의록을 정해주셨나 근데?
참여자_03:  의견을 내주시긴 하지
참여자_01:  1안부터 볼까?
참여자_01:  응 처음부터 보자
참여자_01:  해계도우미
참여자_03:  해계도우미가 부처화를 시킬 수는 있거든
참여자_01:  근데 돈에 관련된 거 하지 말자
참여자_01:  돈에 관련된 거 어쨌든 사람이 확인을 한 번 해야 되고
참여자_03:  이게 뭐 멘토님이 말해주기로는
참여자_03:  뭔가 이걸로 해서 뭔가
참여자_03:  코스트를 뭔가 획기적으로 줄일 수는 없다
참여자_03:  약간 이런 말씀을 해주셨기 때문에
참여자_03:  최종후보의 후보지 않나
알 수 없음:  넘어가야 될 거 같아
참여자_03:  뭐 이거는 일단 핸드폰 거치는 게 좀 힘드니까 아예 제기고
참여자_03:  3번은 내 생각에는 그냥
참여자_03:  이 기업 내부 자료
참여자_03:  그니까 기업의 내부 코드를 가지고 와야 되는 문제가 좀 있어서
참여자_03:  좀 조금 그런 데에서 좀 어려움이 있지 않을까?
참여자_03:  하는 생각이 들어요
참여자_03:  그냥 양을 늘리려고 넣었어
알 수 없음:  아 오