In [1]:


! pip install Cython numpy==1.19.3
! apt-get install gcc
! apt-get install libsndfile1
! conda install -c anaconda pyaudio
! apt install ffmpeg

"""
You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
# NeMo is installed from the main branch of its GitHub repository only when
# the ASR collection cannot be imported.
try:
    import nemo.collections.asr as nemo_asr
except ModuleNotFoundError:
  ! python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]

[NeMo W 2020-11-21 18:37:47 experimental:28] Module <class 'nemo.collections.asr.losses.ctc.CTCLoss'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-21 18:37:49 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-21 18:37:49 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-21 18:37:49 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-21 18:37:49 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production

In [1]:
import nemo.collections.asr as nemo_asr

[NeMo W 2020-11-22 20:29:50 experimental:28] Module <class 'nemo.collections.asr.losses.ctc.CTCLoss'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 20:29:52 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 20:29:52 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 20:29:52 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 20:29:52 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production

In [2]:
import json
import os
import wget

from IPython.display import Audio
import numpy as np
import scipy.io.wavfile as wav

# ! pip install pandas

# optional
# ! pip install plotly
from plotly import graph_objects as go

# Introduction
End-to-end Automatic Speech Recognition (ASR) systems surpassed traditional systems in performance but require large amounts of labeled data for training. 

This tutorial will show how to use an ASR model pre-trained with Connectionist Temporal Classification (CTC), such as the [QuartzNet model](https://arxiv.org/abs/1910.10261), to split long audio files and the corresponding transcripts into shorter fragments that are suitable for ASR model training. 

We're going to use [ctc-segmentation](https://github.com/lumaku/ctc-segmentation) Python package based on the algorithm described in [CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition](https://arxiv.org/pdf/2007.09127.pdf).

In [3]:
# The ctc-segmentation package is pinned to an exact version.
! pip install ctc_segmentation==1.1.0

Collecting ctc_segmentation==1.1.0
  Downloading ctc_segmentation-1.1.0.tar.gz (57 kB)
[K     |████████████████████████████████| 57 kB 2.5 MB/s eta 0:00:011
Building wheels for collected packages: ctc-segmentation
  Building wheel for ctc-segmentation (setup.py) ... [?25ldone
[?25h  Created wheel for ctc-segmentation: filename=ctc_segmentation-1.1.0-cp37-cp37m-linux_x86_64.whl size=109724 sha256=16d17762b394c61e4f13eff1a7e5ffbe9ae09de279171fba3c57ab75808d1afb
  Stored in directory: /root/.cache/pip/wheels/1e/83/d2/41997ef6d47d8a8b72358c340cfede437cc0c1f62e56407b8a
Successfully built ctc-segmentation
Installing collected packages: ctc-segmentation
Successfully installed ctc-segmentation-1.1.0


In [16]:
# When running locally, point TOOLS_DIR at the ctc_segmentation scripts of your NeMo checkout.
# On Colab, the required scripts are fetched from the NeMo GitHub repository instead.

TOOLS_DIR = 'NeMo/tools/ctc_segmentation/scripts'

if 'google.colab' not in str(get_ipython()):
    # Local run: the scripts must already be on disk.
    if not os.path.exists(TOOLS_DIR):
        raise ValueError(f'update path to NeMo root directory')
else:
    TOOLS_DIR = 'scripts/'
    os.makedirs(TOOLS_DIR, exist_ok=True)

    required_files = [
        'prepare_data.py',
        'normalization_helpers.py',
        'run_ctc_segmentation.py',
        'verify_segments.py',
        'cut_audio.py',
        'process_manifests.py',
        'utils.py',
    ]
    for script_name in required_files:
        if os.path.exists(os.path.join(TOOLS_DIR, script_name)):
            continue  # already downloaded on a previous run of this cell
        script_url = 'https://raw.githubusercontent.com/NVIDIA/NeMo/main/tools/ctc_segmentation/' + TOOLS_DIR + script_name
        print(script_url)
        wget.download(script_url, TOOLS_DIR)

`TOOLS_DIR` should now contain the scripts that we are going to need in the next steps. All necessary scripts can be found [here](https://github.com/NVIDIA/NeMo/tree/main/tools/ctc_segmentation/scripts).

In [17]:
print(TOOLS_DIR)
! ls -l $TOOLS_DIR

NeMo/tools/ctc_segmentation/scripts
total 64
drwxrwxrwx 1 root root  4096 Nov 21 18:52 __pycache__
-rwxrwxrwx 1 root root  4047 Nov 22 20:45 clean_text.py
-rwxrwxrwx 1 root root 12651 Nov 19 18:23 cut_audio.py
-rwxrwxrwx 1 root root  2109 Nov 19 18:23 normalization_helpers.py
-rwxrwxrwx 1 root root 10103 Nov 22 20:49 prepare_data.py
-rwxrwxrwx 1 root root  2741 Nov 19 18:23 process_manifests.py
-rwxrwxrwx 1 root root  7081 Nov 19 18:23 run_ctc_segmentation.py
-rwxrwxrwx 1 root root  6029 Nov 19 18:23 utils.py
-rwxrwxrwx 1 root root  5359 Nov 19 18:23 verify_segments.py


# Data Download
First, let's download an audio file from [https://librivox.org/](https://librivox.org/).

In [18]:
## set up the data directory and fetch the sample recording
WORK_DIR = 'WORK_DIR'
DATA_DIR = WORK_DIR + '/DATA'
os.makedirs(DATA_DIR, exist_ok=True)

audio_file = 'childrensshortworks019_06acarriersdog_am_128kb.mp3'
audio_url = 'http://archive.org/download/childrens_short_works_vol_019_1310_librivox/' + audio_file
audio_already_downloaded = os.path.exists(os.path.join(DATA_DIR, audio_file))
# Skip the download when the file is already present, so the cell can be re-run.
if not audio_already_downloaded:
    print('Downloading audio file')
    wget.download(audio_url, DATA_DIR)

Next, we need to get the corresponding transcript.

Note, the text file and the audio file should have the same base name, for example, an audio file `example.wav` or `example.mp3` should have corresponding text data stored under `example.txt` file.

In [19]:
# text source: http://www.gutenberg.org/cache/epub/24263/pg24263.txt
text =  """
    A carrier on his way to a market town had occasion to stop at some houses
    by the road side, in the way of his business, leaving his cart and horse
    upon the public road, under the protection of a passenger and a trusty
    dog. Upon his return he missed a led horse, belonging to a gentleman in
    the neighbourhood, which he had tied to the end of the cart, and likewise
    one of the female passengers. On inquiry he was informed that during his
    absence the female, who had been anxious to try the mettle of the pony,
    had mounted it, and that the animal had set off at full speed. The carrier
    expressed much anxiety for the safety of the young woman, casting at the
    same time an expressive look at his dog. Oscar observed his master's eye,
    and aware of its meaning, instantly set off in pursuit of the pony, which
    coming up with soon after, he made a sudden spring, seized the bridle, and
    held the animal fast. Several people having observed the circumstance, and
    the perilous situation of the girl, came to relieve her. Oscar, however,
    notwithstanding their repeated endeavours, would not quit his hold, and
    the pony was actually led into the stable with the dog, till such time as
    the carrier should arrive. Upon the carrier entering the stable, Oscar
    wagged his tail in token of satisfaction, and immediately relinquished the
    bridle to his master.
    """

# Store the transcript in DATA_DIR under the audio file's base name with a .txt extension.
# splitext swaps only the extension (str.replace('mp3', 'txt') would also rewrite
# any other 'mp3' inside the file name), and the explicit encoding keeps the
# written bytes independent of the platform's default encoding.
transcript_name = os.path.splitext(audio_file)[0] + '.txt'
with open(os.path.join(DATA_DIR, transcript_name), 'w', encoding='utf-8') as f:
    f.write(text)

The `DATA_DIR` should now contain both audio and text files:

In [20]:
!ls -l $DATA_DIR

total 2076
-rw-r--r-- 1 root root 2121264 Nov 22 20:30 childrensshortworks019_06acarriersdog_am_128kb.mp3
-rw-r--r-- 1 root root    1423 Nov 22 20:50 childrensshortworks019_06acarriersdog_am_128kb.txt


Listen to the audio:

In [21]:
Audio(os.path.join(DATA_DIR, audio_file))

In [23]:
os.path.join(DATA_DIR, audio_file)

'WORK_DIR/DATA/childrensshortworks019_06acarriersdog_am_128kb.mp3'

As you probably noticed, the audio file contains a prologue and an epilogue that are missing in the corresponding text. The segmentation algorithm can handle extra audio fragments at the beginning and the end of the audio, but prolonged untranscribed audio segments in the middle of the file could deteriorate segmentation results. For the same reason, to improve the segmentation quality, it is recommended to normalize the text so that the transcript contains spoken equivalents of abbreviations and numbers.

# Prepare Text and Audio

We're going to use `prepare_data.py` script to prepare both text and audio data for segmentation.

Text preprocessing:
* the text will be split into sentences and stored under '$OUTPUT_DIR/processed/*.txt' where each sentence is going to start with a new line (we're going to find alignments for these sentences in the next steps)
* out-of-vocabulary words will be removed based on pre-trained ASR model vocabulary, (optionally) text will be changed to lowercase 
* sentences for alignment with the original punctuation and capitalization will be stored under  `$OUTPUT_DIR/processed/*_with_punct.txt`
* numbers will be normalized in a naive way to replace, for example, `12` with `one two`. Such normalization is usually enough for proper segmentation, but to build a high-quality training dataset, all out-of-vocabulary symbols should be replaced with their actual spoken representations.

Audio preprocessing:
* `.mp3` files will be converted to `.wav` files
* audio files will be resampled to use the same sampling rate as was used to pre-train the ASR model we're using for alignment
* stereo tracks will be converted to mono
* since librivox.org audio contains relatively long prologues, we're also cutting a few seconds from the beginning of the audio files (optional step, see `--cut_prefix` argument). In some cases, if an audio contains a very long untranscribed prologue, increasing `--cut_prefix` value might help improve segmentation quality.


The `prepare_data.py` will preprocess all `.txt` files found in the `--in_text=$DATA_DIR` and all `.mp3` files located at `--audio_dir=$DATA_DIR`.


In [75]:
# Name of the pre-trained ASR model handed to the scripts via --model.
MODEL = 'QuartzNet15x5Base-En'
# Root folder for the outputs of this single-file walkthrough.
OUTPUT_DIR = WORK_DIR + '/output'

# Preprocess the text and audio found in DATA_DIR; results go to $OUTPUT_DIR/processed/.
# --cut_prefix trims the beginning of the audio to skip part of the untranscribed prologue.
! python $TOOLS_DIR/prepare_data.py \
--in_text=$DATA_DIR \
--output_dir=$OUTPUT_DIR/processed/ \
--language='eng' \
--cut_prefix=4 \
--model=$MODEL \
--audio_dir=$DATA_DIR

[NeMo W 2020-11-22 21:26:35 experimental:28] Module <class 'nemo.collections.asr.losses.ctc.CTCLoss'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 21:26:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 21:26:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 21:26:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 21:26:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production

The following three files should be generated and stored at the `$OUTPUT_DIR/processed` folder:
* childrensshortworks019_06acarriersdog_am_128kb.txt
* childrensshortworks019_06acarriersdog_am_128kb.wav
* childrensshortworks019_06acarriersdog_am_128kb_with_punct.txt

In [46]:
! ls -l $OUTPUT_DIR/processed

total 10980
-rw-r--r-- 1 root root     3875 Nov 22 00:28 childrensshortworks019_06acarriersdog_am_128kb.txt
-rw-r--r-- 1 root root 11233370 Nov 22 00:28 childrensshortworks019_06acarriersdog_am_128kb.wav
-rw-r--r-- 1 root root     3817 Nov 22 00:28 childrensshortworks019_06acarriersdog_am_128kb_with_punct.txt


The `.txt` file without punctuation contains preprocessed text phrases that we're going to align within the audio file. Here, we split the text into sentences. Each line should contain a text snippet for alignment.

In [47]:
# Show the preprocessed sentences, one per line, exactly as they will be aligned.
processed_text_path = os.path.join(OUTPUT_DIR, 'processed', audio_file.replace('.mp3', '.txt'))
with open(processed_text_path, 'r') as processed_text:
    for sentence in processed_text:
        print(sentence)

welcome to the final session of rapporteur group on working methods

i see that so far the temperature of the discussions hasn't melted any of the snow

so our way back home may be as adventurous as our way in this morning 

looking back to our agenda which is tdone three four  revision one  i would like to go over what my plan would be for this afternoon

so we have only this one session remaining of one and a quarter hours

so i was taking stock of what we can accomplish

we have so far been through a lot of material and we i think we would anticipate progressing aone  aone  further through e meetings and i will go over the plan for that

we have two other documents that are aone  related which i think we can deal with fairly quickly given that we are not having any issue to determine aone  at this meeting and i discussed with korea these two documents will go very quickly 

where i would like to spend most of the time with interpretation is going through two of the documents

so we 

# Run CTC-Segmentation

In this step, we're going to use the [`ctc-segmentation`](https://github.com/lumaku/ctc-segmentation) to find the start and end time stamps for the segments we created during the previous step.


As described in the [CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition](https://arxiv.org/pdf/2007.09127.pdf), the algorithm is relying on a CTC-based ASR model to extract utterance segments with exact time-wise alignments. For this tutorial, we're using a pre-trained 'QuartzNet15x5Base-En' model.

In [48]:
# Window size handed to the segmentation script as --window_len; it also
# prefixes the names of the generated segment files.
WINDOW = 8000

# Align the preprocessed sentences in $OUTPUT_DIR/processed with the audio.
! python $TOOLS_DIR/run_ctc_segmentation.py \
--output_dir=$OUTPUT_DIR \
--data=$OUTPUT_DIR/processed \
--model=$MODEL \
--window_len=$WINDOW \
--no_parallel

[NeMo W 2020-11-22 00:28:08 experimental:28] Module <class 'nemo.collections.asr.losses.ctc.CTCLoss'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:09 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:09 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:09 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:09 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production

The `WINDOW` parameter might need to be adjusted depending on the length of the utterance one wants to align; the default value should work in most cases.

Let's take a look at the generated alignments.
The expected output for our audio sample with 'QuartzNet15x5Base-En' model looks like this:

```
<PATH_TO>/processed/childrensshortworks019_06acarriersdog_am_128kb.wav
16.03 32.39 -4.5911999284929115 | a carrier on ... a trusty dog. | ...
33.31 45.01 -0.22886803973405373 | upon his ... passengers. | ...
46.17 58.57 -0.3523662826061572 | on inquiry ... at full speed. | ...
59.75 69.43 -0.04128918756038118 | the carrier ... dog. | ...
69.93 85.31 -0.3595261826390344 | oscar observed ... animal fast. | ...
85.95 93.43 -0.04447770533708611 | several people ... relieve her. | ...
93.61 105.95 -0.07326174931639003 | oscar however ... arrive. | ...
106.65 116.91 -0.14680841514778062 | upon the carrier ... his master. | ...
```

Details of the file content:
- the first line of the file contains the path to the original audio file
- all subsequent lines contain:
  * the first number is the start of the segment (in seconds)
  * the second one is the end of the segment (in seconds)
  * the third value - alignment confidence score (in log space)
  * text fragments corresponding to the timestamps
  * original text without pre-processing

In [31]:
alignment_file = str(WINDOW) + '_' + audio_file.replace('.mp3', '_segments.txt')
! cat $OUTPUT_DIR/segments/$alignment_file

WORK_DIR/output/processed/childrensshortworks019_06acarriersdog_am_128kb.wav
16.03 32.39 -4.573511169297092 | a carrier on his way to a market town had occasion to stop at some houses by the road side in the way of his business leaving his cart and horse upon the public road under the protection of a passenger and a trusty dog | A carrier on his way to a market town had occasion to stop at some houses by the road side, in the way of his business, leaving his cart and horse upon the public road, under the protection of a passenger and a trusty dog.
33.31 45.01 -0.2288700499072945 | upon his return he missed a led horse belonging to a gentleman in the neighbourhood which he had tied to the end of the cart and likewise one of the female passengers | Upon his return he missed a led horse, belonging to a gentleman in the neighbourhood, which he had tied to the end of the cart, and likewise one of the female passengers.
46.17 58.57 -0.35288047197945976 | on inquiry he was informed that du

Finally, we're going to split the original audio file into segments based on the found alignments. We're going to create three subsets and three corresponding manifests:
* high scored clips (segments with the segmentation score above the threshold value, default threshold value = -5)
* low scored clips (segments with the segmentation score below the threshold)
* deleted segments (segments that were excluded during the alignment). For example, in our sample audio file, the prologue and epilogue that don't have a corresponding transcript were excluded. Oftentimes, deleted files also contain such things as clapping, music, or hard breathing.

The alignment score values depend on the pre-trained model quality and the dataset, the `THRESHOLD` parameter might be worth adjusting based on the analysis of the low/high scored clips.

Also note that the `OFFSET` parameter is something one might want to experiment with, since timestamps have a delay (offset) that depends on the model.


In [49]:
# Timestamp offset handed to the script as --offset.
OFFSET = 0
# Alignment score threshold handed to the script as --threshold; it separates
# high scored clips from low scored ones.
THRESHOLD = -5

# Cut the audio into clips according to the alignments stored in $OUTPUT_DIR/segments/.
! python $TOOLS_DIR/cut_audio.py \
--output_dir=$OUTPUT_DIR \
--model=$MODEL \
--alignment=$OUTPUT_DIR/segments/ \
--threshold=$THRESHOLD \
--offset=$OFFSET

[NeMo W 2020-11-22 00:28:34 experimental:28] Module <class 'nemo.collections.asr.losses.ctc.CTCLoss'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 00:28:35 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production

`manifests` folder should be created under `OUTPUT_DIR`, and it should contain
corresponding manifests for the three groups of clips described above:

In [33]:
! ls -l $OUTPUT_DIR/manifests

total 12
-rw-r--r-- 1 root root 1624 Nov 21 19:04 8000_childrensshortworks019_06acarriersdog_am_128kb_del_manifest.json
-rw-r--r-- 1 root root 5853 Nov 21 19:04 8000_childrensshortworks019_06acarriersdog_am_128kb_high_score_manifest.json
-rw-r--r-- 1 root root    0 Nov 21 19:04 8000_childrensshortworks019_06acarriersdog_am_128kb_low_score_manifest.json


In [33]:
def plot_signal(signal, sample_rate):
    """Render the waveform of ``signal`` against time in seconds."""
    # Sample index divided by the sampling rate gives the time of each sample.
    time_axis = np.arange(signal.shape[0]) / sample_rate
    waveform = go.Scatter(
        x=time_axis,
        y=signal,
        line={'color': 'green'},
        name='Waveform',
        hovertemplate='Time: %{x:.2f} s<br>Amplitude: %{y:.2f}<br><extra></extra>',
    )
    figure_layout = {
        'height': 200,
        'xaxis': {'title': 'Time, s'},
        'yaxis': {'title': 'Amplitude'},
        'title': 'Audio Signal',
        'margin': {'l': 0, 'r': 0, 't': 40, 'b': 0, 'pad': 0},
    }
    go.Figure(data=waveform, layout=figure_layout).show()
    
def display_samples(manifest):
    """Show the waveform, an audio player and both texts for every manifest entry.

    Each line of ``manifest`` is parsed as a JSON object from which the
    ``audio_filepath``, ``text_no_preprocessing`` and ``transcript`` keys are read.
    """
    with open(manifest, 'r') as manifest_file:
        for entry in map(json.loads, manifest_file):
            audio_path = entry['audio_filepath']
            sample_rate, signal = wav.read(audio_path)
            plot_signal(signal, sample_rate)
            display(Audio(audio_path))
            display('Reference text:       ' + entry['text_no_preprocessing'])
            display('ASR transcript: ' + entry['transcript'])
            # Visual separator between consecutive samples.
            print('\n' + '-' * 110)

Let's examine the high scored segments we obtained.

The `Reference text` in the next cell represents the original text without pre-processing, while `ASR transcript` is an ASR model prediction with greedy decoding. Also notice that `ASR transcript` in some cases contains errors that could decrease the alignment score, but usually this doesn’t hurt the quality of the aligned segments.

In [None]:
# Manifest names follow "<window size>_<audio base name>_high_score_manifest.json".
manifest_suffix = audio_file.replace('.mp3', '_high_score_manifest.json')
high_score_manifest = f'{WINDOW}_{manifest_suffix}'
display_samples(os.path.join(OUTPUT_DIR, 'manifests', high_score_manifest))

# Multiple files alignment

Up until now, we were processing only one file at a time, but to create a large dataset, processing multiple files simultaneously could help speed things up considerably. 

Let's download another audio file and corresponding text.

In [34]:
# Recording source: https://librivox.org/frost-to-night-by-edith-m-thomas/
audio_file_2 = 'frosttonight_thomas_bk_128kb.mp3'
second_audio_missing = not os.path.exists(os.path.join(DATA_DIR, audio_file_2))
# Download only when the recording is not in DATA_DIR yet.
if second_audio_missing:
    print('Downloading audio file')
    wget.download('http://www.archive.org/download/frost_to-night_1710.poem_librivox/frosttonight_thomas_bk_128kb.mp3', DATA_DIR)


# text source: https://www.bartleby.com/267/151.html
text =  """
    APPLE-GREEN west and an orange bar,	
    And the crystal eye of a lone, one star …	
    And, “Child, take the shears and cut what you will,	
    Frost to-night—so clear and dead-still.”	
    
    Then, I sally forth, half sad, half proud,	        
    And I come to the velvet, imperial crowd,	
    The wine-red, the gold, the crimson, the pied,—	
    The dahlias that reign by the garden-side.	
    
    The dahlias I might not touch till to-night!	
    A gleam of the shears in the fading light,	        
    And I gathered them all,—the splendid throng,	
    And in one great sheaf I bore them along.
    .    .    .    .    .    .
    
    In my garden of Life with its all-late flowers	
    I heed a Voice in the shrinking hours:	
    “Frost to-night—so clear and dead-still” …	        
    Half sad, half proud, my arms I fill.	
    """

# Store the poem in DATA_DIR under the audio file's base name with a .txt extension.
# The text contains non-ASCII punctuation (curly quotes, dashes, ellipses), so the
# encoding is stated explicitly instead of depending on the platform default.
# splitext swaps only the extension, whereas str.replace('mp3', 'txt') would
# rewrite every 'mp3' occurrence in the name.
transcript_name_2 = os.path.splitext(audio_file_2)[0] + '.txt'
with open(os.path.join(DATA_DIR, transcript_name_2), 'w', encoding='utf-8') as f:
    f.write(text)

Downloading audio file


`DATA_DIR` should now contain two .mp3 files and two .txt files:

In [35]:
! ls -l $DATA_DIR

total 3652
-rw-r--r-- 1 root root 2121264 Nov 21 18:46 childrensshortworks019_06acarriersdog_am_128kb.mp3
-rw-r--r-- 1 root root    1423 Nov 21 18:46 childrensshortworks019_06acarriersdog_am_128kb.txt
-rw-r--r-- 1 root root 1606724 Nov 21 19:06 frosttonight_thomas_bk_128kb.mp3
-rw-r--r-- 1 root root     864 Nov 21 19:06 frosttonight_thomas_bk_128kb.txt


In [36]:
Audio(os.path.join(DATA_DIR, audio_file_2))

Finally, we need to download a script to perform all the above steps starting from the text and audio preprocessing to segmentation and manifest creation in a single step.

In [38]:
# On Colab, fetch run_sample.sh into the current directory unless it is already there.
if 'google.colab' in str(get_ipython()) and not os.path.exists('run_sample.sh'):
    wget.download('https://raw.githubusercontent.com/NVIDIA/NeMo/main/tools/ctc_segmentation/run_sample.sh', '.')

In [14]:
! ls -l

total 74408
-rwxrwxrwx 1 root root  5070979 Nov 21  2020 CTC_Segmentation_Tutorial.ipynb
drwxrwxrwx 1 root root     4096 Nov 19 18:23 NeMo
drwxr-xr-x 1 root root     4096 Nov 21 20:42 WORK_DIR
-rw-r--r-- 1 root root     1636 Nov 21 20:30 fine_timestamp.ipynb
-rwxrwxrwx 1 root root 71113101 Nov  5 12:33 nemo100_ft.nemo


Next, we're going to execute `run_sample.sh` script to find alignment for two audio/text samples. By default, if the alignment is not found for an initial WINDOW size, the initial window size will be doubled a few times to re-attempt alignment. 

`run_sample.sh` applies two initial WINDOW sizes, 8000 and 12000, and then adds segments that were similarly aligned with two window sizes to `verified_segments` folder. This could be useful to reduce the amount of manual work while checking the alignment quality.

In [53]:
# NOTE(review): this cell re-declares WORK_DIR, TOOLS_DIR, DATA_DIR and THRESHOLD,
# which earlier cells already set — presumably so it can be run on its own; confirm.
import os 
WORK_DIR = 'WORK_DIR'
# Separate output folder for the multi-file run.
OUTPUT_DIR_2 = os.path.join(WORK_DIR, 'output_multiple_files')
# NOTE(review): MODEL points to a local .nemo file here; the trailing comments keep
# another local file and the pre-trained 'QuartzNet15x5Base-En' name as alternatives.
MODEL ='./nemo100_NR_ft.nemo' # './nemo100_ft.nemo' # 'QuartzNet15x5Base-En'
# NOTE(review): this overrides the 'scripts/' value that the setup cell assigns on Colab.
TOOLS_DIR = 'NeMo/tools/ctc_segmentation/scripts'
DATA_DIR = WORK_DIR + '/DATA'
# Score threshold forwarded to the script as --MIN_SCORE.
THRESHOLD = -5


# run_sample.sh is looked up one directory above TOOLS_DIR.
! bash $TOOLS_DIR/../run_sample.sh \
--MODEL_NAME_OR_PATH=$MODEL \
--DATA_DIR=$DATA_DIR \
--OUTPUT_DIR=$OUTPUT_DIR_2 \
--SCRIPTS_DIR=$TOOLS_DIR \
--CUT_PREFIX=4 \
--MIN_SCORE=$THRESHOLD

MODEL_NAME_OR_PATH = ./nemo100_NR_ft.nemo
DATA_DIR = WORK_DIR/DATA
OUTPUT_DIR = WORK_DIR/output_multiple_files
MIN_SCORE = -5
CUT_PREFIX = 4
SCRIPTS_DIR = NeMo/tools/ctc_segmentation/scripts
OFFSET = 0
LANGUAGE = eng
[NeMo W 2020-11-22 10:08:26 experimental:28] Module <class 'nemo.collections.asr.losses.ctc.CTCLoss'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 10:08:27 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 10:08:27 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-11-22 10:08:27 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production an

Input #0, mp3, from 'WORK_DIR/DATA/TSAG_Rapporteur_Group_Afternoon_28_02_0.mp3':
  Duration: 01:15:07.24, start: 0.069063, bitrate: 47 kb/s
Input #0, mp3, from 'WORK_DIR/DATA/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0.mp3':
  Duration: 00:57:19.51, start: 0.069063, bitrate: 45 kb/s
    Stream #0:0: Audio: mp3, 16000 Hz, mono, fltp, 47 kb/s
    Metadata:
      encoder         : LAME3.100
    Stream #0:0: Audio: mp3, 16000 Hz, mono, fltp, 45 kb/s
    Metadata:
      encoder         : LAME3.100
Stream mapping:
  Stream #0:0 -> #0:0 (mp3 (mp3float) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Stream mapping:
  Stream #0:0 -> #0:0 (mp3 (mp3float) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Output #0, wav, to 'WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0.wav':
  Metadata:
    ISFT            : Lavf58.20.100
Output #0, wav, to 'WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_Afte

IndexError: Backtracking was not successful, the window size might be too small.
Increasing the window size to: 16000
INFO:worker:Process-3 completed segmentation of WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0.wav, segments saved to WORK_DIR/output_multiple_files/segments/8000_TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0_segments.txt
INFO:worker:Process-3 completed segmentation of WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0.wav, segments saved to WORK_DIR/output_multiple_files/segments/8000_TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0_segments.txt
Process-3 completed segmentation of WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0.wav, segments saved to WORK_DIR/output_multiple_files/segments/8000_TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0_segments.txt
INFO:w

INFO:root:CTC segmentation of 32292 chars to 6818.96s audio (170474 indices).
INFO:root:CTC segmentation of 32292 chars to 6818.96s audio (170474 indices).
CTC segmentation of 32292 chars to 6818.96s audio (170474 indices).
INFO:root:CTC segmentation of 44638 chars to 8973.12s audio (224328 indices).
INFO:root:CTC segmentation of 44638 chars to 8973.12s audio (224328 indices).
CTC segmentation of 44638 chars to 8973.12s audio (224328 indices).
IndexError: Backtracking was not successful, the window size might be too small.
Increasing the window size to: 24000
INFO:worker:Process-2 completed segmentation of WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_Afternoon_28_02_0.wav, segments saved to WORK_DIR/output_multiple_files/segments/12000_TSAG_Rapporteur_Group_Afternoon_28_02_0_segments.txt
INFO:worker:Process-2 completed segmentation of WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_Afternoon_28_02_0.wav, segments saved to WORK_DIR/output_multiple_files/

      normalized, onesided, return_complex)
    
      normalized, onesided, return_complex)
    
[NeMo I 2020-11-22 10:45:38 collections:173] Dataset loaded with 0 files totalling 0.00 hours
[NeMo I 2020-11-22 10:45:38 collections:174] 0 files were filtered totalling 0.00 hours
High score files duration: 3999s or ~67min at /home/NeMo_CTC_DatasetBuilder/WORK_DIR/output_multiple_files/manifests
Low score files duration: 0s or ~0min saved at /home/NeMo_CTC_DatasetBuilder/WORK_DIR/output_multiple_files/manifests
Saved DEL files duration: 504s or ~ 8min at /home/NeMo_CTC_DatasetBuilder/WORK_DIR/output_multiple_files/deleted_clips
Cutting WORK_DIR/output_multiple_files/processed/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0.wav based on WORK_DIR/output_multiple_files/verified_segments/TSAG_Rapporteur_Group_on_the_review_of_WTSA_Resolutions_1_03_0_segments.txt
Original duration: 3435s or ~57min
[NeMo I 2020-11-22 10:45:40 collections:173] Dataset loaded with 355 files totall

High scored manifests for the data samples were aggregated to the `all_manifest.json` under `OUTPUT_DIR_2`.

In [None]:
display_samples(os.path.join(OUTPUT_DIR_2, 'all_manifest.json'))

# Next Steps

Check out [NeMo Speech Data Explorer tool](https://github.com/NVIDIA/NeMo/tree/main/tools/speech_data_explorer#speech-data-explorer) to interactively evaluate the aligned segments.

# References
Kürzinger, Ludwig, et al. ["CTC-Segmentation of Large Corpora for German End-to-End Speech Recognition."](https://arxiv.org/abs/2007.09127) International Conference on Speech and Computer. Springer, Cham, 2020.

In [66]:
import os 


def _srt_lines_to_text(lines):
    """Return the subtitle text of an SRT file as one space-joined string.

    A cue is expected to be an index line, a timing line containing '-->',
    and one or more text lines, with cues separated by blank lines. Every
    text line of every cue is kept, so multi-line cues and a missing or
    extra trailing blank line no longer break the extraction.
    """
    fragments = []
    in_cue_text = False  # True after a timing line, until the next blank line
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            in_cue_text = False
        elif '-->' in line:
            in_cue_text = True
        elif in_cue_text:
            fragments.append(line)
    # Joining avoids the trailing space the discarded all_text.strip() was meant to remove.
    return ' '.join(fragments)


# NOTE(review): machine-specific absolute path; adjust it to where the .srt files live.
root_path = 'e:/docker/dataset/MODIFIED_IBM_Training_Data/'
# Match on the full '.srt' extension so the output name below always differs from the input.
srts = [os.path.join(root_path, x) for x in os.listdir(root_path) if x.endswith('.srt')]
for srt in srts:
    with open(srt, 'r') as rdr:
        lines = rdr.readlines()

    all_text = _srt_lines_to_text(lines)

    # Write the plain text next to the subtitle file, swapping only the extension.
    with open(os.path.splitext(srt)[0] + '.txt', 'w') as wrt:
        wrt.write(all_text)

In [4]:
from datetime import datetime, timezone

def to_time(millisecond):
    """Format a time offset as ``HH:MM:SS.hh`` (hundredths of a second).

    Despite the parameter name, the value is handed to
    ``datetime.fromtimestamp`` and is therefore interpreted as seconds.
    The fractional part is now reported as hundredths instead of a fixed
    ``00``; whole-second inputs produce the same string as before.

    Only the time of day is kept, so offsets of 24 hours or more wrap around.
    """
    x = datetime.fromtimestamp(millisecond, tz=timezone.utc).time()
    # microsecond // 10000 truncates the fraction to two decimal digits.
    return '{:02d}:{:02d}:{:02d}.{:02d}'.format(x.hour, x.minute, x.second, x.microsecond // 10000)


In [5]:

val = '754.4300000000001 779.86'
# Treat every space-separated value as seconds and print the matching UTC time of day.
utc_times = (
    datetime.fromtimestamp(float(token), tz=timezone.utc).time()
    for token in val.split(' ')
)
for utc_time in utc_times:
    print (utc_time)



00:12:34.430000
00:12:59.860000
