# Creating .wav files
Extract the audio track from each .mp4 video and save it as a .wav file, so that audio features can be computed from it later.

In [10]:
import os
from moviepy.editor import VideoFileClip
from tqdm import tqdm

folder_path = 'mustard++/final_utterance_videos/final_utterance_videos'  # location of video files
audio_folder_path = 'mustard++/final_utterance_audios'  # location to save audio files

# Make sure the output folder exists before anything is written into it.
os.makedirs(audio_folder_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
video_files = sorted(os.listdir(folder_path))

for video_file in tqdm(video_files):
    if not video_file.endswith('.mp4'):
        continue

    video_path = os.path.join(folder_path, video_file)
    # Swap only the extension; str.replace would also rewrite a '.mp4' that
    # happens to appear inside the file stem.
    audio_name = os.path.splitext(video_file)[0] + '.wav'
    audio_path = os.path.join(audio_folder_path, audio_name)

    video_clip = VideoFileClip(video_path)  # Load video
    try:
        audio_clip = video_clip.audio  # Get audio from video
        if audio_clip is None:
            # A clip without an audio track has nothing to export.
            tqdm.write(f'Skipping {video_file}: no audio track')
            continue
        audio_clip.write_audiofile(audio_path, verbose=False, logger=None)  # save audio
    finally:
        # Release the reader resources held by the clip, even if writing failed.
        video_clip.close()


100%|██████████████████████████████████████████████████████████████████████████████| 1203/1203 [05:57<00:00,  3.36it/s]


# Processing

In [1]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning; consider narrowing the filter.
warnings.filterwarnings("ignore")
import pandas as pd

file_path = "mustard++/dataframe.csv"

df = pd.read_csv(file_path)
df = df.iloc[:, :-5]  # drop the last five columns, which are not relevant here

# Keep only the rows whose 'KEY' ends with the character 'u' (e.g. '1_10004_u').
mask = df['KEY'].str[-1] == 'u'

# Take an explicit copy: later cells assign new values and columns to df_new,
# and a bare df[mask] slice would make those writes ambiguous (view vs. copy).
df_new = df[mask].copy()

In [2]:
import re

def clean_text(text):
    """Strip digits and punctuation from ``text``.

    Every run of digits is deleted first; afterwards every run of non-word
    characters is collapsed into a single space. Spaces that end up at the
    start or end of the string are left in place.
    """
    without_digits = re.sub(r'\d+', '', text)
    return re.sub(r'\W+', ' ', without_digits)

# Run clean_text over every value of the 'SENTENCE' column and store the result back
df_new.loc[:, 'SENTENCE'] = df_new['SENTENCE'].map(clean_text)

In [3]:
# Bring an END_TIME string into the 'minutes:seconds' layout used below
def standardize_end_time(time_str):
    """Return ``time_str`` with a one-character seconds field extended by '0'.

    ``time_str`` must contain exactly one ':'; otherwise the unpacking raises
    ValueError. A seconds field of length one gets a '0' appended on the right
    (so '0:1' becomes '0:10'); any other length is returned unchanged.

    NOTE(review): padding on the right presumably restores a trailing zero
    that was lost in the source data -- confirm that '0:1' never means '0:01'.
    """
    minutes, seconds = time_str.split(':')
    padded_seconds = seconds + '0' if len(seconds) == 1 else seconds
    return f'{minutes}:{padded_seconds}'

# Overwrite 'END_TIME' with its standardized form
df_new.loc[:, 'END_TIME'] = df_new['END_TIME'].map(standardize_end_time)

# Speaking rate (words per minute) of a single utterance row
def words_per_minute(row):
    """Return the number of words in ``row['SENTENCE']`` per minute of ``row['END_TIME']``.

    Parameters
    ----------
    row : mapping
        Must provide 'SENTENCE' (a string, split on whitespace to count words)
        and 'END_TIME' (a ':'-separated time string such as '0:07' or '1:05').

    Returns
    -------
    float
        Words divided by the duration in minutes, or NaN when the duration is
        zero or negative (a rate is undefined there).

    Raises
    ------
    ValueError
        If a field of 'END_TIME' cannot be converted to a number.
    """
    num_words = len(row['SENTENCE'].split())  # number of words spoken

    # Fold every ':'-separated field into seconds so the minutes part is
    # counted too ('1:05' is 65 s, not 5 s). A plain seconds value still works.
    total_seconds = 0.0
    for field in row['END_TIME'].split(':'):
        total_seconds = total_seconds * 60.0 + float(field)

    if total_seconds <= 0:
        # Avoid ZeroDivisionError aborting a whole DataFrame.apply run.
        return float('nan')

    duration_minutes = total_seconds / 60.0  # duration expressed in minutes
    return num_words / duration_minutes


# create a new column 'words_per_minute' by calling words_per_minute on every row (axis=1)
df_new.loc[:, 'words_per_minute'] = df_new.apply(words_per_minute, axis=1)

# Show the first 25 rows to inspect the cleaned text, END_TIME and the new column
df_new.head(25)

Unnamed: 0,SCENE,KEY,SENTENCE,END_TIME,SPEAKER,SHOW,Sarcasm,words_per_minute
5,1_10004,1_10004_u,And of those few months how long have you been...,0:07,SHELDON,BBT,0.0,120.0
14,1_10009,1_10009_u,Let the dead man talk So why do you think that,0:05,PENNY,BBT,0.0,132.0
18,1_1001,1_1001_u,What else Sell it on eBay as slightly used,0:04,RAJ,BBT,0.0,135.0
24,1_1003,1_1003_u,Good idea sit with her Hold her comfort her An...,0:08,HOWARD,BBT,1.0,165.0
31,1_10190,1_10190_u,Well now that I ve given up string theory I m ...,0:11,SHELDON,BBT,0.0,190.909091
36,1_10462,1_10462_u,You know if you re really serious about that I...,0:09,HOWARD,BBT,1.0,160.0
42,1_10495,1_10495_u,And if it turns out you don t you and I could ...,0:07,AMY,BBT,1.0,154.285714
49,1_10496,1_10496_u,You and Oh sure And while we re at it why don...,0:14,SHELDON,BBT,1.0,162.857143
56,1_105,1_105_u,I m just inferring this is a couch because the...,0:06,SHELDON,BBT,1.0,210.0
63,1_10748,1_10748_u,There they go fighting again You d never hear ...,0:07,AMY,BBT,1.0,137.142857


# Feature Extraction

### Function that extracts features from audio files

In [4]:
import numpy as np
import librosa

def _axis_stats(matrix):
    """Return (mean, median, std) of ``matrix`` reduced along axis 1."""
    return np.mean(matrix, axis=1), np.median(matrix, axis=1), np.std(matrix, axis=1)


def extract_features(file_name):
    """Load an audio file and summarise several librosa features.

    The file is loaded at a sampling rate of 22050. MFCCs (20 coefficients),
    a mel spectrogram (20 bands), the spectral centroid, the RMS energy
    ("intensity") and the zero-crossing rate are computed, together with the
    deltas of the first three. Each of them is reduced to its mean, median
    and standard deviation.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed to ``librosa.load``.

    Returns
    -------
    dict
        Maps feature names to their statistics. The three ``intensity_*``
        entries are scalars; all other entries are 1-D arrays. The insertion
        order of the keys is significant: the collection loop turns the values
        into a positional list, so the order must not change.
    """
    # Load the audio file
    y, sr = librosa.load(file_name, sr=22050)

    # Base features, each computed exactly once
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=20)
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=y)
    rms = librosa.feature.rms(y=y)[0]  # first row only, so the statistics below are scalars

    # Deltas are computed once and reused for all three statistics
    delta_mfccs = librosa.feature.delta(mfccs)
    delta_melspectrogram = librosa.feature.delta(melspectrogram)
    delta_spectral_centroids = librosa.feature.delta(spectral_centroids)

    # mean, median and standard deviation of every feature to preserve information
    mfccs_mean, mfccs_median, mfccs_std = _axis_stats(mfccs)
    melspectrogram_mean, melspectrogram_median, melspectrogram_std = _axis_stats(melspectrogram)
    spectral_centroids_mean, spectral_centroids_median, spectral_centroids_std = _axis_stats(spectral_centroids)

    delta_mfccs_mean, delta_mfccs_median, delta_mfccs_std = _axis_stats(delta_mfccs)
    delta_melspectrogram_mean, delta_melspectrogram_median, delta_melspectrogram_std = _axis_stats(delta_melspectrogram)
    (delta_spectral_centroids_mean,
     delta_spectral_centroids_median,
     delta_spectral_centroids_std) = _axis_stats(delta_spectral_centroids)

    intensity_mean = np.mean(rms)
    intensity_median = np.median(rms)
    intensity_std = np.std(rms)

    zero_crossing_rate_mean, zero_crossing_rate_median, zero_crossing_rate_std = _axis_stats(zero_crossing_rate)

    # Return the features in a dictionary for easier use.
    # Key order is deliberately unchanged (note: std/median order differs between groups).
    return {
        'mfccs_mean': mfccs_mean,
        'mfccs_std': mfccs_std,
        'mfccs_median': mfccs_median,
        'melspectrogram_mean': melspectrogram_mean,
        'melspectrogram_std': melspectrogram_std,
        'melspectrogram_median': melspectrogram_median,
        'spectralcentroids_mean': spectral_centroids_mean,
        'spectralcentroids_std': spectral_centroids_std,
        'spectralcentroids_median': spectral_centroids_median,
        'deltamfccs_mean': delta_mfccs_mean,
        'deltamfccs_median': delta_mfccs_median,
        'deltamfccs_std': delta_mfccs_std,
        'deltamelspectrogram_mean': delta_melspectrogram_mean,
        'deltamelspectrogram_median': delta_melspectrogram_median,
        'deltamelspectrogram_std': delta_melspectrogram_std,
        'deltaspectral_centroids_mean': delta_spectral_centroids_mean,
        'deltaspectral_centroids_median': delta_spectral_centroids_median,
        'deltaspectral_centroids_std': delta_spectral_centroids_std,
        'intensity_mean': intensity_mean,
        'intensity_median': intensity_median,
        'intensity_std': intensity_std,
        'zerocrossingrate_mean': zero_crossing_rate_mean,
        'zerocrossingrate_median': zero_crossing_rate_median,
        'zerocrossingrate_std': zero_crossing_rate_std
    }


### Get features from audio files

In [5]:
import os
import pandas as pd
from tqdm import tqdm

audio_folder_path = 'mustard++/final_utterance_audios'

# Get a list of all .wav files in the directory.
# Sorted, because os.listdir order is arbitrary and would otherwise make the
# row order of the resulting table differ between runs/machines.
audio_files = sorted(f for f in os.listdir(audio_folder_path) if f.endswith('.wav'))
features = []

# Iterate over each audio file
for audio_file in tqdm(audio_files):
    audio_path = os.path.join(audio_folder_path, audio_file)

    # Extract the audio features
    audio_features = extract_features(audio_path)

    # Append the audio file name and features to the list.
    # The values are taken positionally, so they must line up with `columns` below.
    features.append([audio_file] + list(audio_features.values()))

# Column names for the DataFrame, in the same order as the dictionary returned
# by extract_features (the '*_mean' entries are stored without the suffix).
columns = [
    'audio_file',
    'mfccs',
    'mfccs_std',
    'mfccs_median',
    'melspectrogram',
    'melspectrogram_std',
    'melspectrogram_median',
    'spectralcentroids',
    'spectralcentroids_std',
    'spectralcentroids_median',
    'deltamfccs',
    'deltamfccs_median',
    'deltamfccs_std',
    'deltamelspectrogram',
    'deltamelspectrogram_median',
    'deltamelspectrogram_std',
    'deltaspectral_centroids',
    'deltaspectral_centroids_median',
    'deltaspectral_centroids_std',
    'intensity',
    'intensity_median',
    'intensity_std',
    'zerocrossingrate',
    'zerocrossingrate_median',
    'zerocrossingrate_std'
]

# Create a DataFrame from the features
df_audio_features = pd.DataFrame(features, columns=columns)

# Turn each KEY into the matching file name. Only keys that do not already end
# in '.wav' are extended, so re-running this cell does not produce '.wav.wav'
# (which would make the merge below silently return NaN for every row).
already_suffixed = df_new['KEY'].str.endswith('.wav', na=False)
df_new['KEY'] = df_new['KEY'].where(already_suffixed, df_new['KEY'] + '.wav')

# Merge the dataframes based on the matching columns; how='left' keeps every
# audio file even when no row of df_new matches it.
merged_df = pd.merge(
    df_audio_features,
    df_new[['KEY', 'words_per_minute']],
    left_on='audio_file',
    right_on='KEY',
    how='left',
)

# Drop 'KEY' column (it duplicates 'audio_file' after the merge)
merged_df = merged_df.drop(columns='KEY')
df_audio_features = merged_df

# Save the DataFrame to a CSV file.
# NOTE: most columns still hold NumPy arrays at this point; the same path is
# written again at the end of the notebook.
df_audio_features.to_csv('audio_features.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 1203/1203 [00:57<00:00, 21.10it/s]


In [6]:
# Preview the first rows of the feature table
df_audio_features.head()

Unnamed: 0,audio_file,mfccs,mfccs_std,mfccs_median,melspectrogram,melspectrogram_std,melspectrogram_median,spectralcentroids,spectralcentroids_std,spectralcentroids_median,...,deltaspectral_centroids,deltaspectral_centroids_median,deltaspectral_centroids_std,intensity,intensity_median,intensity_std,zerocrossingrate,zerocrossingrate_median,zerocrossingrate_std,words_per_minute
0,1_10004_u.wav,"[-243.93507, 70.85715, -37.16033, 19.805502, -...","[150.40671, 64.558105, 39.007534, 20.610617, 2...","[-211.50174, 100.61767, -36.10334, 19.469513, ...","[1.842888, 2.8543434, 1.851097, 3.000899, 2.45...","[3.1057982, 4.914852, 4.0059843, 6.3437533, 4....","[0.64533687, 1.0727369, 0.5431957, 0.19532783,...",[2733.071442417542],[1324.4128766386518],[2016.4799114199873],...,[-13.020707259205258],[-0.7766345679290669],[197.25843669086098],0.059431,0.052855,0.04393,[0.18516011757425743],[0.134765625],[0.13142955577890436],120.0
1,1_10009_u.wav,"[-199.60484, 106.24419, -67.8631, 5.480549, -3...","[64.31671, 31.946226, 22.312222, 13.83655, 12....","[-191.82799, 117.13129, -69.155556, 5.1786084,...","[0.233226, 0.46897775, 0.673891, 2.4721177, 2....","[0.28630817, 0.5365221, 0.84131473, 4.338407, ...","[0.14141616, 0.24753219, 0.34082642, 1.0788522...",[2089.5369769145177],[560.156091062625],[1883.0498477120238],...,[-4.16050281107283],[-3.9674393923882327],[91.74549822497335],0.048186,0.04419,0.024762,[0.12766642065092165],[0.1201171875],[0.03587249883454199],132.0
2,1_1001_u.wav,"[-77.95845, 85.00533, -20.893145, 31.380796, -...","[109.05715, 46.23271, 36.034065, 24.593624, 17...","[-64.481804, 97.277374, -25.793314, 31.96978, ...","[35.884064, 166.97185, 140.98672, 83.13356, 17...","[58.040565, 275.39273, 192.87573, 169.2475, 33...","[15.039975, 27.542643, 35.716175, 5.561665, 1....",[2451.219397748192],[1221.8683898863928],[1956.8135268328379],...,[-11.572871575733654],[-6.619796584243026],[234.7665723652445],0.288145,0.320339,0.182242,[0.15002907323473283],[0.1142578125],[0.12172414220832796],135.0
3,1_1003_u.wav,"[-98.87288, 110.46051, -17.9312, 35.882313, -1...","[81.712105, 49.463013, 21.829575, 23.866018, 1...","[-90.738014, 116.52541, -15.497877, 33.759544,...","[59.094326, 127.67833, 140.49104, 47.30064, 13...","[91.818214, 218.10068, 208.34738, 91.98717, 47...","[16.755945, 39.679222, 41.219772, 6.88536, 1.2...",[1885.900930002495],[913.4243553375466],[1667.0119941915236],...,[1.3523010495418615],[14.914905191532721],[174.17572267909014],0.256163,0.257658,0.158541,[0.1034351856033237],[0.07470703125],[0.08065494617692873],165.0
4,1_10190_u.wav,"[-298.76166, 64.262024, -43.251045, 22.255568,...","[110.42542, 53.923443, 37.69831, 27.16364, 25....","[-273.43484, 73.92064, -46.028866, 20.311419, ...","[1.4806372, 2.0299723, 1.7340355, 0.9274265, 0...","[4.3184457, 4.0385804, 4.1758575, 2.845616, 1....","[0.3052403, 0.50268286, 0.2635303, 0.045790523...",[2596.007799339712],[1141.9737137632037],[2210.024776580029],...,[-4.523149304417859],[5.0296790031533005],[233.93190170227365],0.039448,0.037176,0.029028,[0.16875719572368422],[0.134765625],[0.10812024988810727],190.909091


# Formatting dataframe

In [7]:
import numpy as np

# determine the columns that contain 1D array, judged by the first row.
# .iloc[0] is positional, so this works for any index; an empty frame simply
# yields no array columns instead of raising.
array_cols = [
    col for col in df_audio_features.columns
    if len(df_audio_features) > 0
    and isinstance(df_audio_features[col].iloc[0], np.ndarray)
]

# expand each array column into one scalar column per element, named '<col>_<i>'
expanded_parts = []
for col in array_cols:
    width = len(df_audio_features[col].iloc[0])
    expanded_parts.append(
        pd.DataFrame(
            df_audio_features[col].to_list(),
            columns=[f'{col}_{i}' for i in range(width)],
            index=df_audio_features.index,  # keep rows aligned with the source frame
        )
    )

# new dataframe holding the expanded features; built with a single concat
# instead of growing a frame inside the loop
if expanded_parts:
    df_expanded = pd.concat(expanded_parts, axis=1)
else:
    df_expanded = pd.DataFrame(index=df_audio_features.index)

# drop the original array columns and append the expanded ones along the columns
df_audio_features = pd.concat(
    [df_audio_features.drop(columns=array_cols), df_expanded],
    axis=1,
)

In [8]:
# Preview the first rows of the feature table
df_audio_features.head()

Unnamed: 0,audio_file,intensity,intensity_median,intensity_std,words_per_minute,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,...,deltamelspectrogram_std_16,deltamelspectrogram_std_17,deltamelspectrogram_std_18,deltamelspectrogram_std_19,deltaspectral_centroids_0,deltaspectral_centroids_median_0,deltaspectral_centroids_std_0,zerocrossingrate_0,zerocrossingrate_median_0,zerocrossingrate_std_0
0,1_10004_u.wav,0.059431,0.052855,0.04393,120.0,-243.935074,70.857147,-37.160332,19.805502,-22.752916,...,0.019585,0.003203,0.000882,0.000643,-13.020707,-0.776635,197.258437,0.18516,0.134766,0.13143
1,1_10009_u.wav,0.048186,0.04419,0.024762,132.0,-199.604843,106.244186,-67.863098,5.480549,-34.111588,...,0.003284,0.000476,0.000306,8.8e-05,-4.160503,-3.967439,91.745498,0.127666,0.120117,0.035872
2,1_1001_u.wav,0.288145,0.320339,0.182242,135.0,-77.95845,85.005333,-20.893145,31.380796,-13.386493,...,0.196793,0.072305,0.033911,0.014702,-11.572872,-6.619797,234.766572,0.150029,0.114258,0.121724
3,1_1003_u.wav,0.256163,0.257658,0.158541,165.0,-98.872879,110.46051,-17.9312,35.882313,-12.107592,...,0.166771,0.023415,0.008049,0.004693,1.352301,14.914905,174.175723,0.103435,0.074707,0.080655
4,1_10190_u.wav,0.039448,0.037176,0.029028,190.909091,-298.761658,64.262024,-43.251045,22.255568,-17.129074,...,0.00429,0.001511,0.000361,0.000111,-4.523149,5.029679,233.931902,0.168757,0.134766,0.10812


# Save dataframe

In [9]:
# Write the feature table to 'audio_features.csv' without the index column
df_audio_features.to_csv('audio_features.csv', index=False)