# Process SLURP dataset for 1SSI: OneStep speech instructor

In [1]:
import os
import json
import re
from pathlib import Path

from pathlib import PurePath
from pydub import AudioSegment

from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo.collections import nlp as nemo_nlp

In [2]:
### Initiate the text normalizer and the punctuator
# `normalizer` is a NeMo text-normalization object configured for lower-cased English input.
normalizer = Normalizer(input_case='lower_cased', lang="en")
# `punctuator` is the pretrained "punctuation_en_distilbert" punctuation/capitalization model;
# the first call downloads the checkpoint (see the log output below this cell).
punctuator = nemo_nlp.models.PunctuationCapitalizationModel.from_pretrained("punctuation_en_distilbert")



[NeMo I 2023-11-15 02:17:05 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/punctuation_en_distilbert/versions/1.0.0rc1/files/punctuation_en_distilbert.nemo to /root/.cache/torch/NeMo/NeMo_1.21.0rc0/punctuation_en_distilbert/6bdea9786c4395fbbe02e4143d2e1cee/punctuation_en_distilbert.nemo
[NeMo I 2023-11-15 02:17:21 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-11-15 02:17:23 tokenizer_utils:130] Getting HuggingFace AutoTokenizer with pretrained_model_name: distilbert-base-uncased, vocab_file: /tmp/tmplhzxjccm/tokenizer.vocab_file, merges_files: None, special_tokens_dict: {}, and use_fast: False


(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
[NeMo W 2023-11-15 02:17:24 modelPT:258] You tried to register an artifact under config key=tokenizer.vocab_file but an artifact for it has already been registered.
[NeMo W 2023-11-15 02:17:24 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_audio: false
    audio_file: null
    sample_rate: 16000
    use_bucketing: true
    batch_size: 32
    preload_audios: true
    use_tarred_dataset: false
    label_info_save_dir: null
    text_file: text_train.txt
    labels_file: labels_train.txt
    tokens_in_batch: null
    max_seq_length: 128
    num_samples: -1
    use_cache: true
    cache_dir: null
    get_label_frequences: false
    verbose: true
    n_jobs: 0
    tar_metadata_file: null
    tar_shuffle_n: 1
    shard_strategy: scatter
    shuffle: true

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

[NeMo W 2023-11-15 02:17:27 punctuation_capitalization_model:719] The artifact `class_labels.punct_labels_file` was not found in checkpoint. Will rely on `punct_label_ids` parameter
[NeMo W 2023-11-15 02:17:27 punctuation_capitalization_model:741] The artifact `class_labels.capit_labels_file` was not found in checkpoint. Will rely on `capit_label_ids` parameter


[NeMo I 2023-11-15 02:17:27 save_restore_connector:249] Model PunctuationCapitalizationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/punctuation_en_distilbert/6bdea9786c4395fbbe02e4143d2e1cee/punctuation_en_distilbert.nemo.


In [3]:
def normalize(text):
    """Return `text` normalized, punctuated and re-capitalized.

    The input is lower-cased, passed through the module-level `normalizer`
    (with verbose output and punctuation post-processing enabled) and then
    through the module-level `punctuator`.

    Args:
        text: A single sentence as a string.

    Returns:
        The processed sentence as a string.
    """
    lowered = text.lower()
    spoken_form = normalizer.normalize(lowered, verbose=True, punct_post_process=True)

    # The punctuation model is called with a list of sentences and returns a
    # list, so wrap the single sentence and unwrap the single result.
    punctuated_batch = punctuator.add_punctuation_capitalization([spoken_form])
    return punctuated_batch[0]

In [4]:
# Folder holding the SLURP annotation splits, one JSON-lines file per split.
slurp_annotations = Path("/n/disk1/audio_datasets/slurp/dataset/slurp/")
train_annotations = slurp_annotations.joinpath("train.jsonl")
dev_annotations = slurp_annotations.joinpath("devel.jsonl")
test_annotations = slurp_annotations.joinpath("test.jsonl")

# The two SLURP audio folders that annotation `recordings` entries are resolved against.
audio_real = Path("/n/disk1/audio_datasets/slurp/audio/slurp_real")
audio_synth = Path("/n/disk1/audio_datasets/slurp/audio/slurp_synth/")


In [5]:
def convert_entity_format(text, tagdict):
    """Rewrite ``[type : value]`` entity annotations as ``<begin> value <end>``.

    Each annotated span ``[entity_type : entity value]`` is replaced by the
    entity value surrounded by the tokens that `tagdict` stores under the keys
    ``B-<ENTITY_TYPE>`` and ``E-<ENTITY_TYPE>`` (entity type upper-cased).
    Keys that are missing are added to `tagdict` with a fresh placeholder of
    the form ``DUMMY-<n>``, where ``n`` is the size of the dictionary after
    the insertion.

    Args:
        text: Annotated sentence, e.g. ``"wake me at [time : five am]"``.
        tagdict: Dict mapping tag keys to the tokens to emit. It is mutated
            in place when unseen entity types are encountered.

    Returns:
        A ``(converted_text, tagdict)`` tuple; `tagdict` is the same object
        that was passed in.
    """
    # Matches any "[entity_type : entity value]" span.
    pattern = r'\[([a-zA-Z_]+) : ([^\]]+)\]'

    def resolve_tag(tag_key):
        # Register unseen tags under the next free placeholder id. The id is
        # computed before the insertion, so numbering starts at 1.
        if tag_key not in tagdict:
            tagdict[tag_key] = "DUMMY-" + str(len(tagdict) + 1)
        return tagdict[tag_key]

    def replace_pattern(match):
        entity_type = match.group(1).strip().upper()
        entity_value = match.group(2).strip()

        # These must be f-strings: with plain string literals every entity
        # type collapsed onto the single key "B-{entity_type}".
        begin_tag = resolve_tag(f"B-{entity_type}")
        end_tag = resolve_tag(f"E-{entity_type}")

        return f"{begin_tag} {entity_value} {end_tag}"

    converted_text = re.sub(pattern, replace_pattern, text)

    return converted_text, tagdict



def add_entity_tags(input1, input2):
    """Copy ``B-<TYPE> ... E-<TYPE>`` entity tags from `input2` onto `input1`.

    Every ``B-<TYPE> value E-<TYPE>`` span found in `input2` is looked up in
    `input1` (case-insensitively, on word boundaries) and its first occurrence
    is replaced by the tagged span. The replacement inserts the entity value
    exactly as written in `input2`; one adjacent non-word character on each
    side (e.g. a space or punctuation mark) is kept as is.

    Args:
        input1: Plain sentence to annotate.
        input2: Sentence carrying ``B-<TYPE> value E-<TYPE>`` spans, where
            ``<TYPE>`` consists of upper-case letters and underscores.

    Returns:
        `input1` with the tags inserted; unchanged if `input2` has no tagged
        spans or none of the entity values occur in `input1`.
    """
    entities = re.findall(r'B-([A-Z_]+) (.*?) E-\1', input2)

    for entity_type, entity_value in entities:
        # Optional single non-word character on either side so punctuation
        # touching the entity survives the substitution.
        pattern = r'(\W?)(\b' + re.escape(entity_value) + r'\b)(\W?)'

        # Bind the current entity through default arguments instead of relying
        # on late-bound loop variables.
        def replace_entity(match, entity_type=entity_type, entity_value=entity_value):
            before, _matched_text, after = match.groups()
            return f"{before}B-{entity_type} {entity_value} E-{entity_type}{after}"

        # `count` and `flags` are passed by keyword: passing them positionally
        # is deprecated in recent Python versions.
        input1 = re.sub(pattern, replace_entity, input1, count=1, flags=re.IGNORECASE)

    return input1



In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from AudioEmotionClassification.models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification

# Speech-emotion classifier loaded from the "Rajaram1996/Hubert_emotion" checkpoint.
emotion_model = HubertForSpeechClassification.from_pretrained("Rajaram1996/Hubert_emotion")
# Feature extractor that turns raw sample arrays into model inputs.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
sampling_rate=16000 # defined by the model; must convert mp3 to this rate.
# Checkpoint configuration; its `id2label` mapping is used to name the predicted classes.
config = AutoConfig.from_pretrained("Rajaram1996/Hubert_emotion")

def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file and return its samples resampled to `sampling_rate`.

    Args:
        path: Location of the audio file, as accepted by `torchaudio.load`.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        The resampled waveform as a NumPy array with size-1 dimensions
        squeezed out.
    """
    waveform, native_rate = torchaudio.load(path)
    to_target_rate = torchaudio.transforms.Resample(native_rate, sampling_rate)
    return to_target_rate(waveform).squeeze().numpy()


def predict(path, sampling_rate):
    """Score an audio file against every emotion class of `emotion_model`.

    Args:
        path: Audio file to classify.
        sampling_rate: Sampling rate in Hz the audio is resampled to before
            feature extraction.

    Returns:
        A list with one dict per class, in class-id order, of the form
        ``{"Emotion": <label from config.id2label>, "Score": "<percent>%"}``
        where the percentage is formatted with one decimal place.
    """
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    # Run on the CPU with `emotion_model`, matching `get_emotion_labels`; the
    # previously referenced `device` and `model` names are not defined in this
    # notebook.
    inputs = {key: inputs[key].to("cpu") for key in inputs}

    with torch.no_grad():
        logits = emotion_model(**inputs).logits

    # Softmax over the class dimension; `[0]` picks the single item of the batch.
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    return [
        {"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"}
        for i, score in enumerate(scores)
    ]

def get_emotion_labels(audio_file, sampling_rate=16000, score=50.0):
    """Return the confident emotion label(s) predicted for an audio file.

    The audio is classified with `emotion_model`; the two highest-scoring
    classes are kept and those whose confidence exceeds `score` are reported.

    Args:
        audio_file: Audio file to classify.
        sampling_rate: Sampling rate in Hz the audio is resampled to before
            feature extraction.
        score: Confidence threshold in percent; a class is reported only when
            its confidence is strictly greater than this value.

    Returns:
        A list of at most two upper-cased emotion names, most confident
        first. Class labels are expected to look like ``female_neutral``; the
        part after the first underscore is used as the emotion name, and a
        label without an underscore is used whole.
    """
    # Keep the threshold under its own name so nothing below can shadow it
    # (it used to be overwritten inside the loop and replaced by a literal).
    threshold = score

    sound_array = speech_file_to_array_fn(audio_file, sampling_rate)

    inputs = feature_extractor(sound_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to("cpu").float() for key in inputs}

    with torch.no_grad():
        logits = emotion_model(**inputs).logits

    # Softmax over the class dimension; `[0]` picks the single item of the batch.
    probabilities = F.softmax(logits, dim=1).detach().cpu().numpy()[0]

    # e.g. [{'emo': 'female_neutral', 'score': 73.9}, {'emo': 'female_happy', 'score': 24.8}]
    outputs = [
        {"emo": config.id2label[i], "score": round(probability * 100, 1)}
        for i, probability in enumerate(probabilities)
    ]
    top_two = sorted(outputs, key=lambda row: row["score"], reverse=True)[:2]

    all_labels = []
    for row in top_two:
        if row["score"] > threshold:
            name_parts = row["emo"].split("_")
            emotion = name_parts[1] if len(name_parts) > 1 else name_parts[0]
            all_labels.append(emotion.upper())

    return all_labels


(…)/Hubert_emotion/resolve/main/config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of the model checkpoint at Rajaram1996/Hubert_emotion were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSpeechClassification were not initialized from the model checkpoint at Rajaram1996/Hubert_emotion and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight

(…)60/resolve/main/preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [7]:
# Module-level dict; nothing in the visible cells of this notebook reads or writes it.
ALL_ENTITIES = {}


def jsonl_process(jsonlfile, audiofolder, tagdictfile=None):
    """Convert one SLURP annotation split into a multi-instruction manifest.

    For every annotation record the sentence is normalized (`normalize`) and
    entity-tagged (`convert_entity_format` + `add_entity_tags`). For every
    recording listed in the record the audio is converted to WAV, emotion
    labels are predicted (`get_emotion_labels`) and five manifest rows are
    written, one per instruction string.

    Each annotation record must provide the keys ``sentence``,
    ``sentence_annotation``, ``intent`` and ``recordings`` (a list of dicts
    with a ``file`` key).

    Args:
        jsonlfile: Path of the JSON-lines annotation file.
        audiofolder: Folder containing the recordings named by the records.
        tagdictfile: Optional path of a JSON file mapping entity tag keys
            (``B-<TYPE>`` / ``E-<TYPE>``) to the tokens to emit. When omitted,
            an empty mapping is used and `convert_entity_format` assigns
            placeholder tokens.
            NOTE(review): `add_entity_tags` only recognizes literal
            ``B-<TYPE> ... E-<TYPE>`` spans, so the mapping presumably has to
            map each tag onto itself for tags to reach the output; confirm.
            The mapping is updated in memory only; it is never written back.

    Side effects:
        * Creates ``<audiofolder>-wav`` and writes one WAV file per recording.
        * Writes ``<jsonlfile stem>.json`` (one JSON object per line) into the
          current working directory.
        * Prints every record, final text and audio path.
    """
    jsonlfile = Path(jsonlfile)
    audiofolder = Path(audiofolder)

    print(jsonlfile)

    # Create the output folder with pathlib rather than a shell command, so
    # paths with spaces or shell metacharacters are handled safely.
    wavfolder = Path(str(audiofolder) + "-wav")
    wavfolder.mkdir(parents=True, exist_ok=True)

    if tagdictfile is None:
        tagdict = {}
    else:
        with open(tagdictfile, 'r') as tagdict_handle:
            tagdict = json.load(tagdict_handle)

    manifest_name = jsonlfile.stem + ".json"

    def join_parts(*parts):
        # Skip empty parts so a missing emotion label does not leave a
        # trailing or doubled space in the target text.
        return " ".join(part for part in parts if part)

    # Context managers close both files even if a record fails midway.
    with open(jsonlfile, 'r') as annotations, open(manifest_name, 'w') as manifest:
        for raw_line in annotations:
            if not raw_line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            line = json.loads(raw_line)
            print(line)

            text_clean = normalize(line['sentence'])
            text_tagged, tagdict = convert_entity_format(line['sentence_annotation'], tagdict)
            text_clean_tagged = add_entity_tags(text_clean, text_tagged)

            intent = line['intent'].upper()

            print("Final text:", text_clean_tagged)

            for recording in line['recordings']:
                audiofile = PurePath(recording['file'])
                audiofilepath = audiofolder / recording['file']
                wavfilepath = str(wavfolder / (audiofile.stem + ".wav"))

                print(audiofilepath)

                # The source format is taken from the file extension (without the dot).
                source_audio = AudioSegment.from_file(audiofilepath, audiofilepath.suffix[1:])
                source_audio.export(wavfilepath, format="wav")

                emotion_labels = ' '.join(
                    get_emotion_labels(audio_file=wavfilepath, sampling_rate=16000)
                )

                # (instruction, target text) pairs, in the order they are written.
                # NOTE(review): "entitites" is misspelled in three instruction
                # strings; they are kept as is because they are part of the
                # emitted data.
                rows = (
                    ("transcribe speech", text_clean),
                    ("transcribe and mark entities", text_clean_tagged),
                    ("transcribe, mark entitites and track speaker emotion",
                     join_parts(text_clean_tagged, emotion_labels)),
                    ("transcribe, mark entitites, get speaker intent",
                     join_parts(text_clean_tagged, intent)),
                    ("transcribe, mark entitites, get intent and emotion labels",
                     join_parts(text_clean_tagged, emotion_labels, intent)),
                )

                for instruction, tagged_text in rows:
                    sample_dict = {
                        'audiofilepath': wavfilepath,
                        'text': text_clean,
                        'tagged_text': tagged_text,
                        'instruction': instruction,
                    }
                    json.dump(sample_dict, manifest)
                    manifest.write("\n")



In [None]:
# Build the manifest for the training split from the recordings under `audio_real`.
jsonl_process(jsonlfile=train_annotations, audiofolder=audio_real)
# NOTE(review): the two disabled calls below repeat the training split verbatim; they were
# presumably meant for `dev_annotations` / `test_annotations` — confirm before enabling.
#jsonl_process(jsonlfile=train_annotations, audiofolder=audio_real)
#jsonl_process(jsonlfile=train_annotations, audiofolder=audio_real)
