# Automatic Speech Recognition with Speaker Diarization

In [None]:
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR

# import numpy as np
from IPython.display import Audio, display
import librosa
import os
import wget

# import nemo
import glob

import pprint
pp = pprint.PrettyPrinter(indent=4)

from audio_extract import extract_audio
import os, errno
from pydub import AudioSegment
import numpy as np 
from tqdm import tqdm
import math
from omegaconf import OmegaConf
import json

import logging
logging.getLogger('nemo_logger').setLevel(logging.ERROR)

In [2]:
# Working-directory layout: every artefact lives under <cwd>/data.
ROOT = os.getcwd()
data_dir = os.path.join(ROOT, 'data')
videos_dir, audios_dir, text_dir = (
    os.path.join(data_dir, name) for name in ('videos', 'audios', 'text')
)

# Create the whole tree up front; directories that already exist are kept.
for directory in (data_dir, videos_dir, audios_dir, text_dir):
    os.makedirs(directory, exist_ok=True)

# Snapshot of the entries currently present in the videos directory.
list_videos = os.listdir(videos_dir)

### Data preprocessing

In [None]:
# Extract a 16 kHz mono WAV track for every entry in the videos directory,
# then collect the paths of all WAV files found in the audios directory.
for video_file in list_videos:
    # splitext drops only the final extension, so "talk.part1.mp4" becomes
    # "talk.part1.wav" rather than colliding with every other "talk.*" file.
    wav_path = os.path.join(audios_dir, os.path.splitext(video_file)[0] + ".wav")

    # An existing WAV is treated as already converted and is left untouched.
    if os.path.exists(wav_path):
        print("файл уже существует")
        continue

    try:
        extract_audio(input_path=os.path.join(videos_dir, video_file),
                      output_path=wav_path,
                      output_format='wav')

        # Re-encode the extracted track in place as 16 kHz mono.
        audio = AudioSegment.from_wav(wav_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(wav_path, format="wav")
    except Exception as exc:
        # Report the real failure (instead of claiming the file already
        # exists) and carry on with the remaining videos.
        print(f"не удалось обработать {video_file}: {exc}")

# NOTE: the "/" separator is kept on purpose; later code splits these paths on "/".
audio_file_list = glob.glob(f"{audios_dir}/*.wav")

### Parameter setting for ASR and diarization

In [5]:
# Directories for the diarization config file and the transcript output.
config_dir = os.path.join(data_dir, 'config')
text_dir = os.path.join(data_dir, 'text')
for directory in (config_dir, text_dir):
    os.makedirs(directory, exist_ok=True)

# Domain of the audio; selects which inference config is used
# ("meeting" or "telephonic").
DOMAIN_TYPE = "meeting"
CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"

# Reuse a config that is already on disk; otherwise download it into config_dir.
if os.path.exists(os.path.join(config_dir, CONFIG_FILE_NAME)):
    CONFIG = os.path.join(config_dir, CONFIG_FILE_NAME)
else:
    CONFIG = wget.download(CONFIG_URL, config_dir)

cfg = OmegaConf.load(CONFIG)
pretrained_speaker_model = 'titanet_large'

# All overrides below live under the `diarizer` node of the loaded config.
diarizer_cfg = cfg.diarizer
diarizer_cfg.out_dir = data_dir  # intermediate files and prediction outputs go here
diarizer_cfg.speaker_embeddings.model_path = pretrained_speaker_model
diarizer_cfg.clustering.parameters.oracle_num_speakers = False

# Neural VAD (multilingual MarbleNet) together with the Russian QuartzNet ASR
# model; oracle VAD and ASR-based VAD are both switched off.
diarizer_cfg.vad.model_path = 'vad_multilingual_marblenet'
diarizer_cfg.asr.model_path = 'stt_ru_quartznet15x5'
diarizer_cfg.oracle_vad = False
diarizer_cfg.asr.parameters.asr_based_vad = False

### Run model

In [None]:
# For every WAV file: run ASR-assisted speaker diarization, transcribe each
# diarized segment, and write a "speaker_N <text>" transcript into text_dir.
ROOT = os.getcwd()
data_dir_local = os.path.join(data_dir, 'cache_samples_data')
os.makedirs(data_dir_local, exist_ok=True)

# The manifest and the scratch segment file are reused for every input file.
manifest_path = os.path.join(config_dir, 'input_manifest.json')
path_to_audio_sample = os.path.join(data_dir_local, 'sample.wav')

# Load the transcription model once instead of once per diarized segment.
# map_location is left at its default so NeMo selects a GPU when one is
# available and falls back to CPU otherwise.
transcription_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="nvidia/stt_ru_fastconformer_hybrid_large_pc"
)

for audio_filename in audio_file_list:
    # File stem used both as the key into diar_hyp and in the output name.
    audio_id = os.path.splitext(os.path.basename(audio_filename))[0]

    # Single-entry manifest describing the file to diarize.
    meta = {
        'audio_filepath': audio_filename,
        'offset': 0,
        'duration': None,
        'label': 'infer',
        'text': '-',
        'num_speakers': None,
        'rttm_filepath': None,
        'uem_filepath': None,
    }

    with open(manifest_path, 'w') as fp:
        json.dump(meta, fp)
        fp.write('\n')

    cfg.diarizer.manifest_filepath = manifest_path

    # Word-level timestamps from the ASR model configured in cfg.diarizer.asr.
    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
    asr_model = asr_decoder_ts.set_asr_model()
    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)

    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset

    diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)

    # Decode the source audio once per file rather than once per segment.
    audio = AudioSegment.from_wav(audio_filename)

    transcript_lines = []
    for data in diar_hyp[audio_id]:
        # Each entry is parsed as "<start_sec> <end_sec> speaker_<id>".
        speaker = data.split('speaker_')[1]
        fields = data.split(' ')
        # Seconds -> milliseconds, the unit used for slicing the AudioSegment.
        t1, t2 = float(fields[0]) * 1000, float(fields[1]) * 1000

        segment = audio[t1:t2].split_to_mono()[0]
        segment.export(path_to_audio_sample, format="wav")

        try:
            text = transcription_model.transcribe([path_to_audio_sample], batch_size=8)[0]
            if isinstance(text, list):
                text = text[0]
            # Newer NeMo releases return Hypothesis objects instead of plain
            # strings; use their .text attribute when it is present.
            text = getattr(text, 'text', text)
            transcript_lines.append('speaker_' + speaker + ' ' + text + '\n')
        except Exception as exc:
            # Best effort: skip the segment, but say so instead of silently
            # producing a transcript with holes.
            print(f"не удалось распознать сегмент {data!r} файла {audio_filename}: {exc}")

    sum_text = ''.join(transcript_lines)

    # Explicit UTF-8: the transcript holds Cyrillic text and the platform
    # default encoding may not be able to represent it.
    with open(os.path.join(text_dir, f"{audio_id}текст.txt"), "w", encoding="utf-8") as file:
        file.write(sum_text)