In [2]:
import tensorflow as tf
import librosa
import numpy as np
from pydub import AudioSegment, effects
import noisereduce as nr

  from .autonotebook import tqdm as notebook_tqdm


## Load in model

In [26]:
# Paths of the first model, kept for reference.
# saved_model_path = './ser_model.json'
# saved_weights_path = './ser_model_weights.h5'

# Paths of the newer, better-performing model used by the cells below.
saved_model_path = './binary_class.json'
saved_weights_path = './binary_class_weights.h5'

# The architecture is stored as JSON; the weights live in a separate HDF5 file.
with open(saved_model_path, 'r') as json_file:
    json_savedModel = json_file.read()

# Rebuild the network from its JSON description, then restore the trained weights.
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compile with the loss/optimizer/metric configuration used by this notebook.
model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSProp',
    metrics=['categorical_accuracy'],
)

## Preprocessing

In [6]:
import tensorflow as tf
import librosa
import numpy as np
from pydub import AudioSegment, effects
import noisereduce as nr

total_length = 173056 # desired frame length for all of the audio samples.
def preprocess(file_path, frame_length = 2048, hop_length = 512):
    '''
    Preprocess an audio .wav file before executing a prediction.

    Pipeline: peak-normalize -> trim leading/trailing silence -> zero-pad to
    `total_length` samples -> noise reduction -> per-frame feature extraction.

      Arguments:
      - file_path - The system path to the audio file.
      - frame_length - Length of the frame over which to compute the speech features. default: 2048
      - hop_length - Number of samples to advance for each frame. default: 512

      Return:
        'X_3D' variable, containing a shape of: (batch, timesteps, feature) for a single file (batch = 1).
        The feature axis holds 15 values per frame, in this order: ZCR (1), RMS energy (1), MFCC (13).
        Returns None (after printing the offending path) when the trimmed audio is
        longer than `total_length` samples and therefore cannot be padded.
    '''
    # Native sample rate of the file; read without decoding the audio a second time.
    sr = librosa.get_samplerate(file_path)
    # Load audio file
    rawsound = AudioSegment.from_file(file_path, duration = None)
    # Peak-normalize, leaving 5 dB of headroom.
    normalizedsound = effects.normalize(rawsound, headroom = 5.0)
    # Transform the normalized audio to np.array of samples.
    # NOTE(review): presumably the inputs are mono; a multi-channel file would
    # yield interleaved samples here - confirm against the recordings used.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype = 'float32')
    # Trim silence from the beginning and the end.
    xt, _ = librosa.effects.trim(normal_x, top_db=30)

    # np.pad rejects a negative pad width, so a clip that is still longer than
    # total_length after trimming cannot be brought to the fixed size.
    if len(xt) > total_length:
        print("error")
        print("file:", file_path)
        return None
    padded_x = np.pad(xt, (0, total_length - len(xt)), 'constant')

    # Noise reduction
    final_x = nr.reduce_noise(padded_x, sr=sr)

    f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length, center=True, pad_mode='reflect').T # Energy - Root Mean Square
    f2 = librosa.feature.zero_crossing_rate(y=final_x, frame_length=frame_length, hop_length=hop_length,center=True).T # ZCR
    f3 = librosa.feature.mfcc(y=final_x, sr=sr, S=None, n_mfcc=13, hop_length = hop_length).T # MFCC
    # Column order (ZCR, RMS, MFCC) is kept exactly as in the original pipeline.
    X = np.concatenate((f2, f1, f3), axis = 1)

    # Add the leading batch axis expected by model.predict.
    X_3D = np.expand_dims(X, axis=0)

    return X_3D

# Alternative inputs used during development; uncomment one to switch files.
# file_path = './test_files/copy_OAF_fail_disgust.wav'
# file_path = './own_recordings/rec12.wav'
file_path = '../final_test/steven_happy_2.wav'

# Feature tensor for the selected file (None if preprocess() rejected the file).
preprocessed_file = preprocess(file_path)

In [27]:
# Class labels by output index.
# NOTE(review): assumes index 0 = positive and index 1 = negative, matching
# the label order the model was trained with - confirm.
emotions = ['positive', 'negative']

# Run the model on the single preprocessed file and show the raw scores.
predictions = model.predict(preprocessed_file)
np.set_printoptions(suppress=True)
print("---------")
print("File:", file_path)
print(predictions)

# The highest-scoring output index selects the predicted label.
predicted_index = np.argmax(predictions)
print(predicted_index)
predicted_emotion = emotions[predicted_index]
print(f"Predicted emotion: {predicted_emotion}")

---------
File: ../final_test/steven_sad_1.wav
[[0.89688563 0.10311435]]
0
Predicted emotion: positive


### Workflow for processing multiple files at once

In [28]:
import os

directory = '../final_test'

# Loop-invariant setup: label order and numpy print format do not change per file.
# NOTE(review): assumes index 0 = positive and index 1 = negative - confirm
# against the label order used in training.
emotions = ['positive', 'negative']
np.set_printoptions(suppress=True)

# Stage 1: preprocess every file in the directory, keeping names and feature
# tensors in step with each other. Files that preprocess() rejects (returns
# None) are skipped.
preprocessed_files = []
preprocessed_files_names = []
for file in os.listdir(directory):
    # Skip macOS Finder metadata.
    if '.DS_Store' in file:
        continue
    file_path = os.path.join(directory, file)
    processed_file = preprocess(file_path)
    if processed_file is not None:
        preprocessed_files.append(processed_file)
        preprocessed_files_names.append(file)

# Stage 2: predict and report one file at a time.
for name, features in zip(preprocessed_files_names, preprocessed_files):
    predictions = model.predict(features)
    rounded_predictions = np.round(predictions[0], 3)
    predicted_index = np.argmax(predictions)

    print("---------")
    print("File:", name)
    # Two aligned rows: label headers, then the rounded score for each label.
    print(' '.join(f'{col:<8}' for col in emotions))
    print(' '.join(f'{str(val):<8}' for val in rounded_predictions))

    print(predicted_index)
    predicted_emotion = emotions[predicted_index]
    print(f"Predicted emotion: {predicted_emotion}")


---------
File: luke_sad_2.wav
positive negative
0.953    0.047   
0
Predicted emotion: positive
---------
File: luke_happy_1.wav
positive negative
0.41     0.59    
1
Predicted emotion: negative
---------
File: luke_sad_1.wav
positive negative
0.002    0.998   
1
Predicted emotion: negative
---------
File: luke_happy_2.wav
positive negative
0.793    0.207   
0
Predicted emotion: positive
---------
File: steven_neutral_1.wav
positive negative
0.003    0.997   
1
Predicted emotion: negative
---------
File: luke_angry_1.wav
positive negative
0.792    0.208   
0
Predicted emotion: positive
---------
File: steven_neutral_2.wav
positive negative
0.941    0.059   
0
Predicted emotion: positive
---------
File: luke_angry_2.wav
positive negative
0.53     0.47    
0
Predicted emotion: positive
---------
File: steven_angry_1.wav
positive negative
0.158    0.842   
1
Predicted emotion: negative
---------
File: steven_angry_2.wav
positive negative
0.53     0.47    
0
Predicted emotion: positive
--