In [75]:
import tensorflow as tf
import librosa
import numpy as np
import pickle

In [83]:
def pipeline_model_1(audio_path):
    model = tf.keras.models.load_model('Models//cnn_model_3.h5')

    with open('Models//scaler_3.pickle', 'rb') as f:
        sc = pickle.load(f)

    # Load audio file using librosa
    signal,sr = librosa.load(audio_path, offset=0.5, res_type='kaiser_fast', sr=22050)

    # Calculate the duration in samples for 3 seconds
    segment_length_samples = sr * 3

    predictions = []

    # Iterate over the audio signal with a step of 3 seconds
    for i in range(0, len(signal), segment_length_samples):
        # Slice the audio
        segment = signal[i:i + segment_length_samples]

        # Extract features from the audio
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)
        features = np.mean(mfccs.T, axis=0)
        features = sc.transform(features.reshape(1,-1))
        features = np.expand_dims(features, axis=2)

        # Predict the emotion
        predictions.append(model.predict(features)[0][0])

    predictions_2 = [1 if prediction > 0.451380 else 0 for prediction in predictions]
    if predictions_2.count(1) == predictions_2.count(0):
        prediction = np.mean(predictions)
        return ['Aggressive' if prediction > 0.451380 else 'Non-Aggressive'][0],len(predictions)
    else:
        if predictions.count(1) > predictions.count(0):
            prediction = "Aggressive"
        else:
            prediction = "Non-Aggressive"
        return prediction,len(predictions)

In [79]:
def pipeline_model_2(audio_path):
    model = tf.keras.models.load_model('Models//cnn_model_4(pro).h5')

    with open('Models//scaler_4(pro).pickle', 'rb') as f:
        sc = pickle.load(f)

    # Load audio file using librosa
    signal,sr = librosa.load(audio_path, offset=0.5, res_type='kaiser_fast', sr=22050)

    # Calculate the duration in samples for 3 seconds
    segment_length_samples = sr * 3

    predictions = []

    # Iterate over the audio signal with a step of 3 seconds
    for i in range(0, len(signal), segment_length_samples):
        # Slice the audio
        segment = signal[i:i + segment_length_samples]

        features = []
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)
        features.extend(np.mean(mfccs.T, axis=0))
        features.append(np.sum(np.abs(signal)**2))
        features.append(librosa.feature.rms(y=signal)[0].mean())
        features.append(np.max(np.abs(signal)))
        features.append(librosa.beat.beat_track(y=signal, sr=22050)[0])

        features = np.array(features)
        # Scale data
        features = sc.transform(features.reshape(1,-1))
        features = np.expand_dims(features, axis=2)

        # Predict the emotion
        predictions.append(model.predict(features)[0][0])

    predictions_2 = [1 if prediction > 0.737163 else 0 for prediction in predictions]
    if predictions_2.count(1) == predictions_2.count(0):
        prediction = np.mean(predictions)
        return ['Aggressive' if prediction > 0.737163 else 'Non-Aggressive'][0],len(predictions)
    else:
        if predictions.count(1) > predictions.count(0):
            prediction = "Aggressive"
        else:
            prediction = "Non-Aggressive"
        return prediction,len(predictions)

In [86]:
# Create a dataframe to store the results with the audio file name and the prediction with pipeline 1, pipeline 3 and real label
import pandas as pd
import os

df = pd.DataFrame(columns=['Audio File', 'Model 1','Model 1 Segments', 'Model 2', 'Model 2 Segments', 'Real Label'])

# Iterate over the audio files and store the results in the dataframe
audio_folder = 'Test Audio'

dfs = []

for audio in os.listdir(audio_folder):
    if audio.endswith('.wav'):
        real_label = audio.split('-')[0]
        pipeline_1, model_1_segments = pipeline_model_1(f'{audio_folder}//{audio}')
        pipeline_3, model_2_segments = pipeline_model_2(f'{audio_folder}//{audio}')
        
        dict = {'Audio File': audio, 'Model 1': pipeline_1,'Model 1 Segments':model_1_segments, 'Model 2': pipeline_3,'Model 2 Segments':model_2_segments, 'Real Label': real_label}

        df_row =pd.DataFrame([dict])
        dfs.append(df_row)

df = pd.concat(dfs, ignore_index=True)

df



Unnamed: 0,Audio File,Model 1,Model 1 Segments,Model 2,Model 2 Segments,Real Label
0,agg-female-1.wav,Aggressive,2,Aggressive,2,agg
1,agg-female-2.wav,Non-Aggressive,3,Aggressive,3,agg
2,agg-female-3.wav,Non-Aggressive,4,Aggressive,4,agg
3,agg-female-4.wav,Non-Aggressive,2,Aggressive,2,agg
4,agg-male-1.wav,Non-Aggressive,3,Non-Aggressive,3,agg
5,non-female-1.wav,Non-Aggressive,3,Non-Aggressive,3,non
6,non-female-2.wav,Aggressive,2,Aggressive,2,non
7,non-female-3.wav,Non-Aggressive,1,Non-Aggressive,1,non
8,non-female-4.wav,Non-Aggressive,2,Non-Aggressive,2,non
9,non-male-1.wav,Non-Aggressive,2,Non-Aggressive,2,non
