# Automatic Speech Recognition with Speaker Diarization

In [None]:
# Dependencies needed before NeMo itself is installed.
# wget is used further down to download the diarization inference config.
!pip install wget
# Audio tooling installed system-wide through apt.
!apt-get -y install sox libsndfile1 ffmpeg
!pip install text-unidecode
# -f (--find-links) adds the PyTorch wheel index as an additional source for torchaudio.
!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html
# pydub is used further down to crop audio segments.
!pip install pydub

In [None]:
# Git ref of the NeMo fork to install; IPython substitutes $BRANCH into the shell command below.
BRANCH = 'main'
# Installs nemo_toolkit with the [asr] extra directly from the fork's repository.
# NOTE(review): later cells fetch configs/scripts from NVIDIA/NeMo main — confirm both stay compatible.
!python -m pip install git+https://github.com/motawie0/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] 

In [None]:
import numpy as np
from IPython.display import Audio, display
import librosa
import os
import wget
import matplotlib.pyplot as plt
import glob
import pandas as pd
import pprint
from omegaconf import OmegaConf
import shutil
import os
import csv
import argparse
import nemo.collections.asr as nemo_asr
import json
import nemo
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
import os
from IPython.display import clear_output
import torch
# Module-level pretty-printer (4-space indent); no use of it is visible in the cells shown here.
pp = pprint.PrettyPrinter(indent=4)

# Diarizer

In [None]:
# Working directory that receives the downloaded config and the diarization outputs.
data_dir = "/kaggle/working"

In [None]:
# Which inference preset to use: "meeting" or "telephonic", depending on the audio domain.
DOMAIN_TYPE = "telephonic"
CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"

CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"

# Reuse the copy already present in data_dir; download it only when it is missing.
local_config_path = os.path.join(data_dir, CONFIG_FILE_NAME)
CONFIG = local_config_path if os.path.exists(local_config_path) else wget.download(CONFIG_URL, data_dir)

# Load the YAML into an OmegaConf object and show the effective settings.
cfg = OmegaConf.load(CONFIG)
print(OmegaConf.to_yaml(cfg))

In [None]:
# Settings for splitting the combined manifest into smaller manifest files.
lines_per_part = 1  # number of manifest lines written to each part file
input_path = "/kaggle/input/diarization-manifest/input_manifest.json"
output_path = "/kaggle/working/split_manifest"

# The part files are written into this directory, so create it up front.
os.makedirs(output_path, exist_ok=True)

def split_json_file(input_path, output_path, lines_per_part):
    """Split a line-delimited JSON manifest into numbered part files.

    Every non-blank line of ``input_path`` is treated as one manifest entry.
    Entries are written in their original order to ``part_1.json``,
    ``part_2.json``, ... inside ``output_path``, with at most
    ``lines_per_part`` entries per file.

    Args:
        input_path: Path of the manifest to split (one JSON object per line).
        output_path: Directory that receives the part files; created if missing.
        lines_per_part: Maximum number of entries per part file (must be >= 1).

    Returns:
        List of the part file paths that were written, in order.

    Raises:
        ValueError: If ``lines_per_part`` is smaller than 1.
    """
    if lines_per_part < 1:
        raise ValueError(f"lines_per_part must be at least 1, got {lines_per_part}")

    with open(input_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing empty line) are not manifest entries.
        # Keeping them would create part files that cannot be parsed as JSON.
        lines = [line for line in file if line.strip()]

    os.makedirs(output_path, exist_ok=True)

    total_lines = len(lines)
    # Ceiling division so a final, shorter part is still written.
    num_parts = (total_lines + lines_per_part - 1) // lines_per_part
    print(f"Total lines: {total_lines}, Lines per part: {lines_per_part}, Total parts: {num_parts}")

    written_paths = []
    for part in range(num_parts):
        start = part * lines_per_part
        part_file_path = f"{output_path}/part_{part + 1}.json"

        # Slicing past the end of the list is safe, so no explicit clamp is needed.
        with open(part_file_path, 'w', encoding='utf-8') as part_file:
            part_file.writelines(lines[start:start + lines_per_part])
        written_paths.append(part_file_path)
        print(f"Part {part + 1} written to {part_file_path}")

    return written_paths

# Split the input manifest using the settings defined at the top of this cell.
split_json_file(input_path=input_path, output_path=output_path, lines_per_part=lines_per_part)

In [None]:
# --- Output location -------------------------------------------------------
# Directory to store intermediate files and prediction outputs.
cfg.diarizer.out_dir = data_dir

# --- Models ----------------------------------------------------------------
# Neural VAD and a Conformer ASR checkpoint are used; speaker embeddings come from TitaNet.
pretrained_speaker_model = 'titanet_large'
cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
cfg.diarizer.asr.model_path = "/kaggle/input/the-best-results/results/Some name of our experiment/checkpoints/conformer.nemo"
# Optional language model for CTC decoding (disabled):
# cfg.diarizer.asr.ctc_decoder_parameters.pretrained_language_model = '/kaggle/working/5gram.bin'

# --- VAD / clustering behaviour ---------------------------------------------
cfg.diarizer.oracle_vad = False  # not using oracle VAD
cfg.diarizer.asr.parameters.asr_based_vad = False
cfg.diarizer.clustering.parameters.oracle_num_speakers = False
cfg.diarizer.ignore_overlap = False

# --- Batch sizes -------------------------------------------------------------
# Every stage is run with a batch size of 1.
cfg.batch_size = 1
cfg.diarizer.msdd_model.parameters.infer_batch_size = 1
cfg.diarizer.asr.parameters.asr_batch_size = 1

In [None]:
# NOTE: the original author assumes a CUDA-capable runtime for this cell.
import json

# Manifests whose audio is longer than this many seconds are not processed in
# this cell; they are collected in `too_big` and handled by the long-audio cell
# further down. (Presumably a memory safeguard — TODO confirm the threshold.)
MAX_DIRECT_DURATION_SEC = 200
SPLIT_MANIFEST_DIR = "/kaggle/working/split_manifest"

too_big = []
# Sorted for a reproducible processing order; anything that is not a .json
# part file (stray files or folders) is ignored instead of crashing json.load.
for manifest_file in sorted(os.listdir(SPLIT_MANIFEST_DIR)):
    if not manifest_file.endswith('.json'):
        continue
    file_path = f'{SPLIT_MANIFEST_DIR}/{manifest_file}'
    print(file_path)
    with open(file_path, 'r') as manifest_fp:
        dur = json.load(manifest_fp).get('duration')

    # A missing/null duration cannot be compared against the threshold, so such
    # entries are routed to the long-audio path, which does not rely on it.
    if dur is None or dur > MAX_DIRECT_DURATION_SEC:
        too_big.append(file_path)
        continue

    # Point the diarizer at this single-entry manifest, then run ASR to obtain
    # the word hypotheses and their timestamps.
    cfg.diarizer.manifest_filepath = file_path
    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
    asr_model = asr_decoder_ts.set_asr_model()
    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)

    # Diarize using the ASR word timestamps and attach speaker labels to the words.
    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
    diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
    trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)

    # Drop the verbose per-file log output before moving on to the next manifest.
    clear_output()

In [None]:
def transcribe_audio(checkpoint_path, data_dir, output_csv='transcriptions.csv', batch_size=4):
    """Transcribe every ``.wav`` file in a directory and save the results as CSV.

    The CSV has a header row ``audio,transcript`` followed by one row per file,
    where ``audio`` is the file name without its extension.

    Args:
        checkpoint_path: Path of the ``.nemo`` checkpoint restored as an
            ``EncDecCTCModel``.
        data_dir: Directory that is scanned (non-recursively) for ``.wav`` files.
        output_csv: Destination CSV path.
        batch_size: Number of files passed to the model per ``transcribe`` call
            (must be >= 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the batching loop silently do nothing.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Restore the ASR model from the checkpoint.
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(checkpoint_path)

    # Sorted so the CSV row order does not depend on the directory listing order.
    wav_files = sorted(name for name in os.listdir(data_dir) if name.endswith('.wav'))
    audio_paths = [os.path.join(data_dir, wav) for wav in wav_files]

    # Transcribe in batches; the last batch may be shorter than batch_size.
    transcriptions = []
    for i in range(0, len(audio_paths), batch_size):
        batch_paths = audio_paths[i:i + batch_size]
        transcripts = asr_model.transcribe(audio=batch_paths, batch_size=len(batch_paths))
        # Plain strings are kept as-is. If the installed NeMo version returns
        # hypothesis objects instead (assumed to expose `.text` — TODO confirm),
        # store their text rather than the object's repr.
        transcriptions.extend(getattr(item, 'text', item) for item in transcripts)
    print(transcriptions)

    # UTF-8 is set explicitly so non-ASCII transcripts do not depend on the locale.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['audio', 'transcript'])
        writer.writerows(
            [os.path.splitext(wav)[0], transcript]
            for wav, transcript in zip(wav_files, transcriptions)
        )

    print(f"Transcriptions saved to {output_csv}")

In [None]:
from omegaconf import OmegaConf, DictConfig
from omegaconf import OmegaConf
import wget
# msdd_model_path ='/kaggle/input/msdd-10-epochs-on-3k3h-2-4/nemo_experiments/MultiscaleDiarDecoder/2024-07-13_12-33-15/checkpoints/MultiscaleDiarDecoder--val_loss=0.8137-epoch=1.ckpt' 
! git clone https://github.com/NVIDIA/NeMo.git
!rm /kaggle/working/diar_infer_telephonic.yaml
MODEL_CONFIG = os.path.join("/kaggle/working/",'diar_infer_telephonic.yaml')
if not os.path.exists(MODEL_CONFIG):
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml"
    MODEL_CONFIG = wget.download(config_url,"/kaggle/working/")

output_dir = "/kaggle/working/output_inference"
config = OmegaConf.load(MODEL_CONFIG)
config.diarizer.out_dir="/kaggle/working/output_inference"
config.diarizer.manifest_filepath='/kaggle/working/input_manifest.json'
config.diarizer.oracle_vad=False
config.diarizer.clustering.parameters.oracle_num_speakers = False
OmegaConf.save(config, "/kaggle/working/diar_infer_telephonic.yaml")


In [None]:
from pydub import AudioSegment

def crop_audio(input_wav, start_ms, end_ms, output_path="/kaggle/working/temp_wav_output/croped_file.wav"):
    """Cut a time window out of a WAV file and save it as a 16 kHz WAV clip.

    NOTE: despite their names, ``start_ms`` and ``end_ms`` are expected in
    **seconds**. They are multiplied by 1000 below because pydub slices audio
    by milliseconds. The names are kept for backward compatibility.

    Args:
        input_wav: Path of the source WAV file.
        start_ms: Start of the window, in seconds.
        end_ms: End of the window, in seconds.
        output_path: Where the clip is written. Defaults to the fixed scratch
            file used by the long-audio cell; its directory is created if needed.

    Returns:
        The path the clip was written to.
    """
    audio = AudioSegment.from_wav(input_wav).set_frame_rate(16000)
    cropped_audio = audio[start_ms * 1000:end_ms * 1000]

    # Make sure the destination directory exists so export() does not fail on a fresh run.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    cropped_audio.export(output_path, format="wav")
    return output_path

In [None]:
!mkdir /kaggle/working/temp_wav_output
!mkdir /kaggle/working/temp_wav
!mkdir /kaggle/working/long_audio_json
asr_model = nemo_asr.models.EncDecCTCModel.restore_from("/kaggle/input/the-best-results/results/Some name of our experiment/checkpoints/conformer.nemo")

for file in too_big:
    !rm -rf /kaggle/working/output_inference/pred_rttms
    with open(file, 'r') as file:
        audio_path = json.load(file)['audio_filepath']
    meta = {
    'audio_filepath': audio_path, 
    'offset': 0, 
    'duration':None, 
    'label': 'infer', 
    'text': '-', 
    'num_speakers': None, 
    'rttm_filepath': None, 
    'uem_filepath' : None
    }
    with open('input_manifest.json','w') as fp:
        json.dump(meta,fp)
        fp.write('\n')
    
    !HYDRA_FULL_ERROR=1 python /kaggle/working/NeMo/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py --config-path /kaggle/working --config-name diar_infer_telephonic.yaml
    rttm_file_path = os.listdir("/kaggle/working/output_inference/pred_rttms")
    data = []
    with open("/kaggle/working/output_inference/pred_rttms/"+rttm_file_path[0], 'r') as file_pred_rttm:
        for line in file_pred_rttm:
            parts = line.strip().split()
            if parts[0] == "SPEAKER":
                # Extract the start time and duration
                start_time = float(parts[3])
                end = start_time +  float(parts[4])
                speaker = parts[7]
                crop_audio(audio_path,start_time,end)
                transcripts = asr_model.transcribe(audio="/kaggle/working/temp_wav_output/croped_file.wav", batch_size=1)[0]
                data.append((start_time,end,speaker,transcripts))
    segments = [{"start": start, "end": end, "speaker": speaker, "text": text} for start, end, speaker, text in data]
    audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
    output_json_path = f'/kaggle/working/long_audio_json/{audio_filename}.json'
    print(f"dum in {output_json_path}")
    # Write the JSON data to a file
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(segments, f, ensure_ascii=False, indent=4)

In [None]:
import os
import json
import re

def time_to_seconds(minutes, seconds):
    """Convert the two parts of an "MM:SS.SS" timestamp into seconds.

    Args:
        minutes: Minutes part; anything ``int()`` accepts (e.g. ``"01"``).
        seconds: Seconds part; anything ``float()`` accepts (e.g. ``"02.50"``).

    Returns:
        Total number of seconds as a float.
    """
    whole_minutes = int(minutes)
    fractional_seconds = float(seconds)
    return whole_minutes * 60 + fractional_seconds

def process_file(input_file_path, output_file_path):
    """Convert a speaker-labelled transcript text file into a JSON segment list.

    Lines of the form ``[MM:SS.ss - MM:SS.ss] speaker_N: text`` are parsed. An
    optional leading hours field (``[HH:MM:SS.ss - HH:MM:SS.ss]``) is accepted
    as well, so such lines are no longer silently dropped. Anything that does
    not match is ignored.

    Args:
        input_file_path: UTF-8 transcript file to read.
        output_file_path: Destination JSON file; it receives a list of
            ``{"start", "end", "speaker", "text"}`` objects with times in seconds.

    Returns:
        The list of parsed segments that was written to ``output_file_path``.
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = file.read()

    # One timestamp: optional "HH:" prefix, then "MM:SS.ss".
    timestamp = r'(?:(\d{2}):)?(\d{2}):(\d{2}\.\d{2})'
    pattern = re.compile(r'\[' + timestamp + r' - ' + timestamp + r'\] (speaker_\d+): (.+)')

    segments = []
    for match in pattern.finditer(data):
        (start_hours, start_minutes, start_seconds,
         end_hours, end_minutes, end_seconds, speaker, text) = match.groups()
        # The hours group is None for plain MM:SS.ss timestamps.
        start_time = int(start_hours or 0) * 3600 + time_to_seconds(start_minutes, start_seconds)
        end_time = int(end_hours or 0) * 3600 + time_to_seconds(end_minutes, end_seconds)
        segments.append({
            "start": start_time,
            "end": end_time,
            "speaker": speaker,
            "text": text
        })

    # ensure_ascii=False keeps non-ASCII transcript text readable in the output file.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(segments, f, ensure_ascii=False, indent=4)

    return segments

# Transcript .txt files are read from input_directory; one JSON file per
# transcript is written to output_directory.
input_directory = '/kaggle/working/pred_rttms'
output_directory = '/kaggle/working/convert_to_jason_small_audio'

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

if os.path.isdir(input_directory):
    # Sorted so the processing order (and the log below) is reproducible.
    transcript_names = sorted(name for name in os.listdir(input_directory) if name.endswith('.txt'))
else:
    # The folder is missing when no manifest went through the short-audio path;
    # that should not stop the rest of the notebook from running.
    transcript_names = []
    print(f"{input_directory} not found; nothing to convert")

# Convert each transcript, keeping the base name and swapping the extension to .json
for filename in transcript_names:
    input_file_path = os.path.join(input_directory, filename)
    output_file_name = os.path.splitext(filename)[0] + '.json'
    output_file_path = os.path.join(output_directory, output_file_name)

    # Process the file and convert to JSON
    process_file(input_file_path, output_file_path)

    print(f"Processed {input_file_path} -> {output_file_path}")


In [None]:
# List the JSON results of both paths: converted short-audio transcripts and long-audio segments.
!ls /kaggle/working/convert_to_jason_small_audio
!ls /kaggle/working/long_audio_json