In [25]:
from nemo.utils import model_utils
from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.models.ctc_models import EncDecCTCModel
import os
from glob import glob
import torch
import nemo.collections.asr as nemo_asr
import scipy.io.wavfile as wav
import numpy as np
import ctc_segmentation as cs
from typing import List
from tqdm import tqdm
import math
from pydub import AudioSegment

In [3]:
model_cfg = ASRModel.restore_from(restore_path='../models/nemo/hindi/Conformer-CTC-BPE-Large.nemo',
                                  return_config=True)
classpath = model_cfg.target
imported_class = model_utils.import_class_by_path(classpath)
asr_model = imported_class.restore_from(restore_path='../models/nemo/hindi/Conformer-CTC-BPE-Large.nemo')

      return torch._C._cuda_getDeviceCount() > 0
    


[NeMo I 2022-10-21 02:00:04 mixins:170] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2022-10-21 02:00:04 modelPT:142] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: ../../data/hindi_normalized/filtered_hindi_v1_manifest.json
    sample_rate: 16000
    batch_size: 24
    shuffle: true
    num_workers: 16
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 30
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: 8
    
[NeMo W 2022-10-21 02:00:04 modelPT:149] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /root/ekstep/nemo_exp/vakyansh-nemo-exp

[NeMo I 2022-10-21 02:00:04 features:225] PADDING: 0
[NeMo I 2022-10-21 02:00:06 save_restore_connector:243] Model EncDecCTCModelBPE was successfully restored from /home/anirudh/Desktop/forced-alignment/models/nemo/hindi/Conformer-CTC-BPE-Large.nemo.


In [4]:
#True if model is BPE
bpe_model = isinstance(asr_model, nemo_asr.models.EncDecCTCModelBPE)
if bpe_model:
    tokenizer = asr_model.tokenizer
else:
    tokenizer = None

In [5]:
vocabulary = ["ε"] + list(asr_model.cfg.decoder.vocabulary)
vocabulary

['ε',
 '<unk>',
 'ा',
 'र',
 'ी',
 'े',
 'न',
 'ि',
 'क',
 '▁',
 'त',
 '्',
 '▁स',
 'ल',
 'ं',
 'स',
 'म',
 '▁है',
 '▁क',
 'ु',
 'ह',
 'ग',
 'य',
 'ो',
 '▁ब',
 'द',
 'व',
 '▁म',
 '▁के',
 '्य',
 '▁प',
 '▁अ',
 '▁में',
 'प',
 '▁ज',
 'ू',
 '▁द',
 'ज',
 'ब',
 '▁कर',
 '▁व',
 'श',
 'ने',
 'च',
 '▁आ',
 '▁ह',
 '▁को',
 'ट',
 'ता',
 'ों',
 '▁और',
 '▁का',
 'ध',
 '▁की',
 '▁हो',
 '्र',
 'ए',
 '▁से',
 '▁कि',
 'थ',
 '▁हैं',
 '▁न',
 'ते',
 '▁हम',
 'ना',
 '▁प्र',
 '▁ल',
 '▁ग',
 '▁उ',
 'ें',
 '▁भ',
 'ै',
 'ई',
 '▁त',
 '▁च',
 '▁इस',
 'भ',
 '▁भी',
 '▁पर',
 '▁तो',
 '▁र',
 'ण',
 '▁उस',
 'ड',
 '▁रह',
 '▁जा',
 '▁जो',
 '▁नहीं',
 '▁आप',
 '▁श',
 'ड़',
 'ष',
 '▁एक',
 'िया',
 'ख',
 '्व',
 '्ट',
 '▁यह',
 '▁वि',
 'के',
 'ित',
 '▁इ',
 'छ',
 'फ',
 '़',
 'ँ',
 'ौ',
 'ठ',
 'झ',
 'ॉ',
 'इ',
 'ओ',
 'ऐ',
 'ढ',
 'घ',
 'आ',
 'ञ',
 'ऊ',
 'ऑ',
 'उ',
 'अ',
 'ः',
 'औ',
 'ॅ',
 'ऱ',
 'ऩ',
 'ऋ',
 'ङ',
 'ृ']

In [6]:
segment_file = 'sample.txt'
sample_rate, signal = wav.read('sample.wav')
original_duration = len(signal) / sample_rate

In [7]:
log_probs = asr_model.transcribe(paths2audio_files=['sample.wav'], batch_size=1, logprobs=True)[0]
blank_col = log_probs[:, -1].reshape((log_probs.shape[0], 1))
log_probs = np.concatenate((blank_col, log_probs[:, :-1]), axis=1)
index_duration = len(signal) / log_probs.shape[0] / sample_rate

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

    


In [8]:
with open(segment_file, "r") as f:
    text = f.read().splitlines()
text

['क्या सेंट मैरीस की एयर क्वालिटी घातक है ']

In [61]:
text = ['क्वालिटी']

In [62]:
config = cs.CtcSegmentationParameters()
config.char_list = vocabulary
config.min_window_size = 4000
config.index_duration = index_duration

In [63]:
def _prepare_tokenized_text_for_bpe_model(text: List[str], tokenizer, vocabulary: List[str], blank_idx: int = 0):
    """ Creates a transition matrix for BPE-based models"""
    space_idx = vocabulary.index("▁")
    ground_truth_mat = [[-1, -1]]
    utt_begin_indices = []
    for uttr in text:
        ground_truth_mat += [[blank_idx, space_idx]]
        utt_begin_indices.append(len(ground_truth_mat))
        token_ids = tokenizer.text_to_ids(uttr)
        # blank token is moved from the last to the first (0) position in the vocabulary
        token_ids = [idx + 1 for idx in token_ids]
        ground_truth_mat += [[t, -1] for t in token_ids]

    utt_begin_indices.append(len(ground_truth_mat))
    ground_truth_mat += [[blank_idx, space_idx]]
    ground_truth_mat = np.array(ground_truth_mat, np.int64)
    return ground_truth_mat, utt_begin_indices

def _print(ground_truth_mat, vocabulary, limit=20):
    """Prints transition matrix"""
    chars = []
    for row in ground_truth_mat:
        chars.append([])
        for ch_id in row:
            if ch_id != -1:
                chars[-1].append(vocabulary[int(ch_id)])

    for x in chars[:limit]:
        print("unknown")
        #logging.debug(x)

def determine_utterance_segments(config, utt_begin_indices, char_probs, timings, text, char_list):
    """Utterance-wise alignments from char-wise alignments.
    Adapted from https://github.com/lumaku/ctc-segmentation
    Args:
        config: an instance of CtcSegmentationParameters
        utt_begin_indices: list of time indices of utterance start
        char_probs:  character positioned probabilities obtained from backtracking
        timings: mapping of time indices to seconds
        text: list of utterances
    Return:
        segments, a list of: utterance start and end [s], and its confidence score
    """
    segments = []
    min_prob = np.float64(-10000000000.0)
    for i in tqdm(range(len(text))):
        start = _compute_time(utt_begin_indices[i], "begin", timings)
        end = _compute_time(utt_begin_indices[i + 1], "end", timings)

        start_t = start / config.index_duration_in_seconds
        start_t_floor = math.floor(start_t)

        # look for the left most blank symbol and split in the middle to fix start utterance segmentation
        if char_list[start_t_floor] == config.char_list[config.blank]:
            start_blank = None
            j = start_t_floor - 1
            while char_list[j] == config.char_list[config.blank] and j > start_t_floor - 20:
                start_blank = j
                j -= 1
            if start_blank:
                start_t = int(round(start_blank + (start_t_floor - start_blank) / 2))
            else:
                start_t = start_t_floor
            start = start_t * config.index_duration_in_seconds

        else:
            start_t = int(round(start_t))

        end_t = int(round(end / config.index_duration_in_seconds))

        # Compute confidence score by using the min mean probability after splitting into segments of L frames
        n = config.score_min_mean_over_L
        if end_t <= start_t:
            min_avg = min_prob
        elif end_t - start_t <= n:
            min_avg = char_probs[start_t:end_t].mean()
        else:
            min_avg = np.float64(0.0)
            for t in range(start_t, end_t - n):
                min_avg = min(min_avg, char_probs[t : t + n].mean())
        segments.append((start, end, min_avg))
    return segments

def _compute_time(index, align_type, timings):
    """Compute start and end time of utterance.
    Adapted from https://github.com/lumaku/ctc-segmentation
    Args:
        index:  frame index value
        align_type:  one of ["begin", "end"]
    Return:
        start/end time of utterance in seconds
    """
    middle = (timings[index] + timings[index - 1]) / 2
    if align_type == "begin":
        return max(timings[index + 1] - 0.5, middle)
    elif align_type == "end":
        return min(timings[index - 1] + 0.5, middle)

In [64]:
if bpe_model:
    ground_truth_mat, utt_begin_indices = _prepare_tokenized_text_for_bpe_model(text, tokenizer, vocabulary, 0)

In [65]:
config.blank = 0
timings, char_probs, char_list = cs.ctc_segmentation(config, log_probs, ground_truth_mat)

In [66]:
segments = determine_utterance_segments(config, utt_begin_indices, char_probs, timings, text, char_list)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1945.41it/s]


In [67]:
segments

[(2.1514285714285717, 2.968174603174603, -1.1160802831427645)]

In [68]:
utt_begin_indices

[2, 9]

In [69]:
for i, (word, segment) in enumerate(zip(text, segments)):
    print(word)
    print(segment)
    print('----')

क्वालिटी
(2.1514285714285717, 2.968174603174603, -1.1160802831427645)
----


In [79]:
def clip(wav, start, end):
    frames = AudioSegment.from_wav(wav)
    s = start*1000 + 500
    print(s)
    e = int(end*1000) + 50
    print(e)
    return frames[s:e]

In [80]:
clipped = clip('sample.wav', 2.151, 2.968)
clipped

2651.0
3018
