In [1]:
import csv
import os
import pickle
import subprocess
import warnings

import ffmpy
import librosa
import librosa.display
import matplotlib
import matplotlib.pyplot
import numpy as np
import opensmile
import pandas as pd
import soundfile as sf
from IPython.display import Audio
from tqdm.auto import tqdm

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')


# Folder scanned for utterance .mp4 files by the first ffmpeg conversion cell.
AUDIOS_FOLDER = "../data/audios/final_utterance_videos"
# Folder scanned for context .mp4 files by the second ffmpeg conversion cell.
CONTEXT_AUDIOS_FOLDER = "../data/audios/final_context_videos"
# Destination of the converted utterance .wav files; read back by the librosa cell.
AUDIOS_FOLDER_WAV = "../data/audios/final_utterance_videos_wav"
# Destination of the foreground-only utterance .wav files; read by the openSMILE cell.
AUDIOS_FOLDER__BGR_WAV = "../data/audios/final_utterance_videos_bgr_wav"
# Destination of the converted context .wav files; read back by the context librosa cell.
CONTEXT_AUDIOS_FOLDER_WAV = "../data/audios/final_context_videos_wav"
# Destination of the foreground-only context .wav files.
CONTEXT_AUDIOS_FOLDER__BGR_WAV = "../data/audios/final_context_videos_bgr_wav"

## Convert mp4 to wav file and remove background noise

In [2]:
# Convert every utterance clip (.mp4) into a 16-bit PCM .wav file with ffmpeg.
# ffmpeg is called with an argument list instead of a formatted shell string, so file
# names containing spaces or shell metacharacters reach ffmpeg unchanged.
os.makedirs(AUDIOS_FOLDER_WAV, exist_ok=True)  # ffmpeg does not create the output folder

failed_conversions = []
for filename in os.listdir(AUDIOS_FOLDER):
    if not filename.endswith(".mp4"):
        continue
    actual_filename = filename.rsplit(".", maxsplit=1)[0]
    result = subprocess.run(
        [
            "ffmpeg",
            "-y",  # overwrite existing output so re-running the cell never waits on a prompt
            "-i", os.path.join(AUDIOS_FOLDER, filename),
            "-acodec", "pcm_s16le",
            os.path.join(AUDIOS_FOLDER_WAV, actual_filename) + ".wav",
        ],
        capture_output=True,  # keep ffmpeg's verbose log out of the notebook output
    )
    if result.returncode != 0:
        # Remember the last line ffmpeg printed; it usually carries the actual error.
        stderr_lines = result.stderr.decode(errors="replace").strip().splitlines()
        failed_conversions.append((filename, stderr_lines[-1] if stderr_lines else ""))

# Report failures instead of silently dropping them (the exit status used to be ignored).
if failed_conversions:
    print(f"ffmpeg failed for {len(failed_conversions)} file(s):")
    for failed_name, reason in failed_conversions:
        print(f"  {failed_name}: {reason}")

In [3]:
# Convert every context clip (.mp4) into a 16-bit PCM .wav file with ffmpeg.
# ffmpeg is called with an argument list instead of a formatted shell string, so file
# names containing spaces or shell metacharacters reach ffmpeg unchanged.
os.makedirs(CONTEXT_AUDIOS_FOLDER_WAV, exist_ok=True)  # ffmpeg does not create the output folder

failed_conversions = []
for filename in os.listdir(CONTEXT_AUDIOS_FOLDER):
    if not filename.endswith(".mp4"):
        continue
    actual_filename = filename.rsplit(".", maxsplit=1)[0]
    result = subprocess.run(
        [
            "ffmpeg",
            "-y",  # overwrite existing output so re-running the cell never waits on a prompt
            "-i", os.path.join(CONTEXT_AUDIOS_FOLDER, filename),
            "-acodec", "pcm_s16le",
            os.path.join(CONTEXT_AUDIOS_FOLDER_WAV, actual_filename) + ".wav",
        ],
        capture_output=True,  # keep ffmpeg's verbose log out of the notebook output
    )
    if result.returncode != 0:
        # Remember the last line ffmpeg printed; it usually carries the actual error.
        stderr_lines = result.stderr.decode(errors="replace").strip().splitlines()
        failed_conversions.append((filename, stderr_lines[-1] if stderr_lines else ""))

# Report failures instead of silently dropping them (the exit status used to be ignored).
if failed_conversions:
    print(f"ffmpeg failed for {len(failed_conversions)} file(s):")
    for failed_name, reason in failed_conversions:
        print(f"  {failed_name}: {reason}")

## Librosa

Background noise is usually removed by separating the frequency ranges of the foreground and the background. That does not work in our case: as the waveshow below shows, the dialogue and the background laughter have similar frequencies. Instead, we can recover the foreground audio to an extent with librosa's nearest-neighbour filtering and soft-masking.

In [8]:
# For every utterance .wav: separate the foreground from the repeating background,
# save the foreground audio, and collect frame-level librosa descriptors per file.
feat_dict = {}
src_dir = AUDIOS_FOLDER_WAV
# NOTE: only stft, istft and rms receive hop_length explicitly; the remaining feature
# calls rely on librosa's default hop of 512, so this value must stay at 512 unless it
# is passed to them as well (otherwise the frame counts no longer line up for vstack).
hop_length = 512

# soundfile does not create missing folders, so make sure the output folder exists.
os.makedirs(AUDIOS_FOLDER__BGR_WAV, exist_ok=True)

for f in tqdm(os.listdir(src_dir), desc="librosa features (utterances)"):
    if not f.endswith(".wav"):
        continue

    y, sr = librosa.load(os.path.join(src_dir, f))

    # --- foreground / background separation -------------------------------------
    D = librosa.stft(y, hop_length=hop_length)
    S_full, phase = librosa.magphase(D)
    # Median of the most similar frames approximates the repeating background.
    S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric="cosine")
    # The background estimate may never exceed the observed magnitude.
    S_filter = np.minimum(S_full, S_filter)
    margin_v = 4
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    # Re-attach the original phase and invert with the same hop used for the STFT.
    y_foreground = librosa.istft(S_foreground * phase, hop_length=hop_length)
    sf.write(os.path.join(AUDIOS_FOLDER__BGR_WAV, f), y_foreground, sr)

    # --- frame-level descriptors of the foreground signal -----------------------
    mfcc = librosa.feature.mfcc(y=y_foreground, sr=sr, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    rms = librosa.feature.rms(y=y_foreground, hop_length=hop_length)
    rms_delta = librosa.feature.delta(rms)
    zcr = librosa.feature.zero_crossing_rate(y=y_foreground)
    zcr_delta = librosa.feature.delta(zcr)
    tone = librosa.feature.tonnetz(y=y_foreground, sr=sr)
    tempogram = librosa.feature.tempogram(y=y_foreground, sr=sr)
    tempogram_ratio = librosa.feature.tempogram_ratio(y=y_foreground, sr=sr)
    S = librosa.feature.melspectrogram(y=y_foreground, sr=sr, n_mels=128, fmax=8000)
    S_delta = librosa.feature.delta(S)
    # When S is given librosa ignores y, so the previous call (S=S_full) measured the
    # unfiltered mix. Use the foreground magnitudes like every other feature here.
    spectral_centroid = librosa.feature.spectral_centroid(S=S_foreground, sr=sr)

    # Rows are features, columns are frames.
    audio_feature = np.vstack((mfcc, mfcc_delta, rms, rms_delta, zcr, zcr_delta, tone,
                               tempogram, tempogram_ratio, S, S_delta, spectral_centroid))

    # Average the frames in segments of roughly a tenth of the clip.
    n_frames = audio_feature.shape[1]
    # Clips with fewer than 10 frames used to give a step of 0, which makes range() raise.
    jump = max(1, n_frames // 10)
    feat_data = librosa.util.sync(audio_feature, range(1, n_frames, jump))
    feat_dict[f] = feat_data


with open('feat_dict_librosa_lld.pickle', 'wb') as handle:
    pickle.dump(feat_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# For every context .wav: separate the foreground from the repeating background,
# save the foreground audio, and collect frame-level librosa descriptors per file.
feat_dict = {}
src_dir = CONTEXT_AUDIOS_FOLDER_WAV
# NOTE: only stft, istft and rms receive hop_length explicitly; the remaining feature
# calls rely on librosa's default hop of 512, so this value must stay at 512 unless it
# is passed to them as well (otherwise the frame counts no longer line up for vstack).
hop_length = 512

# soundfile does not create missing folders, so make sure the output folder exists.
os.makedirs(CONTEXT_AUDIOS_FOLDER__BGR_WAV, exist_ok=True)

for f in tqdm(os.listdir(src_dir), desc="librosa features (context)"):
    if not f.endswith(".wav"):
        continue

    y, sr = librosa.load(os.path.join(src_dir, f))

    # --- foreground / background separation -------------------------------------
    D = librosa.stft(y, hop_length=hop_length)
    S_full, phase = librosa.magphase(D)
    # Median of the most similar frames approximates the repeating background.
    S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric="cosine")
    # The background estimate may never exceed the observed magnitude.
    S_filter = np.minimum(S_full, S_filter)
    margin_v = 4
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    # Re-attach the original phase and invert with the same hop used for the STFT.
    y_foreground = librosa.istft(S_foreground * phase, hop_length=hop_length)
    sf.write(os.path.join(CONTEXT_AUDIOS_FOLDER__BGR_WAV, f), y_foreground, sr)

    # --- frame-level descriptors of the foreground signal -----------------------
    mfcc = librosa.feature.mfcc(y=y_foreground, sr=sr, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    rms = librosa.feature.rms(y=y_foreground, hop_length=hop_length)
    rms_delta = librosa.feature.delta(rms)
    zcr = librosa.feature.zero_crossing_rate(y=y_foreground)
    zcr_delta = librosa.feature.delta(zcr)
    tone = librosa.feature.tonnetz(y=y_foreground, sr=sr)
    tempogram = librosa.feature.tempogram(y=y_foreground, sr=sr)
    tempogram_ratio = librosa.feature.tempogram_ratio(y=y_foreground, sr=sr)
    S = librosa.feature.melspectrogram(y=y_foreground, sr=sr, n_mels=128, fmax=8000)
    S_delta = librosa.feature.delta(S)
    # When S is given librosa ignores y, so the previous call (S=S_full) measured the
    # unfiltered mix. Use the foreground magnitudes like every other feature here.
    spectral_centroid = librosa.feature.spectral_centroid(S=S_foreground, sr=sr)

    # Rows are features, columns are frames.
    audio_feature = np.vstack((mfcc, mfcc_delta, rms, rms_delta, zcr, zcr_delta, tone,
                               tempogram, tempogram_ratio, S, S_delta, spectral_centroid))

    # Average the frames in segments of roughly a tenth of the clip.
    n_frames = audio_feature.shape[1]
    # Clips with fewer than 10 frames used to give a step of 0, which makes range() raise.
    jump = max(1, n_frames // 10)
    feat_data = librosa.util.sync(audio_feature, range(1, n_frames, jump))
    feat_dict[f] = feat_data


with open('feat_dict_context_librosa_lld.pickle', 'wb') as handle:
    pickle.dump(feat_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## OpenSmile 
Prosody features extraction 

The openSMILE shell script processed only 493 of the wave files; the rest failed with "Maybe this is not a WAVE file?". <br>
The openSMILE Python package works fine.

In [4]:
# Extract frame-level eGeMAPSv02 low-level descriptors with openSMILE from the
# foreground-only utterance audio.
#   feat_dict : file name -> transposed array of that file's descriptor frame
#   df        : the descriptor frames of all files stacked into one DataFrame
feat_dict = {}

smile_lld = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
# A FeatureLevel.Functionals extractor (one row per file) was tried alongside the LLDs
# and is intentionally not part of this cell.

src_dir = AUDIOS_FOLDER__BGR_WAV
lld_frames = []
for f in os.listdir(src_dir):
    if f.endswith(".wav"):
        spd = smile_lld.process_file(os.path.join(src_dir, f))
        feat_dict[f] = np.transpose(spd.to_numpy())
        lld_frames.append(spd)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; concatenate once instead. With no input files, fall back to the empty
# frame (feature columns only) that the loop used to start from.
if lld_frames:
    df = pd.concat(lld_frames)
else:
    df = pd.DataFrame(columns=smile_lld.feature_names)

In [5]:
# Persist the openSMILE results: the stacked frame-wise table as CSV and the
# per-file feature dictionary as a pickle.
df.to_csv(path_or_buf="framewise_feat_opensmile_lld.csv")
with open('feat_dict_opensmile_lld.pickle', mode='wb') as pickle_file:
    pickle.dump(feat_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Frame config changes to align librosa and opensmile features
Check if we can merge opensmile and librosa features.  <br>
Try frame parameters for opensmile - https://github.com/audeering/opensmile-python/issues/14 <br>

Tried - <br>
https://github.com/audeering/opensmile-python/issues/76 <br>
https://github.com/audeering/opensmile-python/issues/57 <br>
Configs updated to have frame size of 93ms and frame step of 25ms based on Librosa paper https://conference.scipy.org/proceedings/scipy2015/pdfs/brian_mcfee.pdf <br>
Package path local for config changes - /Users/shreyaprabhu/opt/miniconda3/lib/python3.9/site-packages/opensmile <br>

### References 
https://github.com/soujanyaporia/MUStARD <br>
https://audeering.github.io/opensmile-python/usage.html <br>
https://github.com/ekayen/prosody_detection <br>

Other methods/libraries tried:
1. SoX for converting mp4 to wav. SoX currently has no handler for mp4 files. https://github.com/jacksonh/sox
2. DisVoice for prosody feature extraction. It did not work because of package issues.
3. The myprosody library for prosody feature extraction. The code is at an early stage and has hardcoded file paths, so it did not work. https://github.com/Shahabks/myprosody

In [6]:
# Show the folder the installed opensmile package lives in (where its config files are edited).
os.path.split(opensmile.__file__)[0]

'/Users/shreyaprabhu/opt/miniconda3/lib/python3.9/site-packages/opensmile'