In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import numpy as np
from torch.utils import data
from collections import OrderedDict
from torch.nn.parameter import Parameter
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.image import resize
from rawnet import RawNet

In [8]:
def recog(audio_file):
    model = load_model("audio_classification_model_v2.h5")
    target_shape = (128, 128)
    classes = ["AI", "Human"]

    audio_data, sample_rate = librosa.load(audio_file, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
    mel_spectrogram = tf.reshape(mel_spectrogram, (1,) + target_shape + (1,))

    predictions = model.predict(mel_spectrogram, verbose=0)
    class_probabilities = predictions[0]
    predicted_class_index = np.argmax(class_probabilities)
    predicted_class = classes[predicted_class_index]

    res_dict = {
        "voiceType": str(predicted_class),
        "confidenceScore": {
            "aiProbability": class_probabilities[0],
            "humanProbability": class_probabilities[1],
        },
    }
    return res_dict

In [9]:
def recog_rawnet(audio_file):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    d_args = {'nb_samp': 64600,'first_conv': 1024 ,'in_channels': 1,'filts': [20, [20, 20], [20, 128], [128, 128]], 'blocks': [2, 4],'nb_fc_node': 1024,'gru_node': 1024,'nb_gru_layer': 3}
    model = RawNet(d_args,device)
    model.load_state_dict(torch.load("best_model_6600.pth"))

    sample_rate = 16000
    duration = 4
    data, sr = librosa.load(audio_file, sr=sample_rate)
    if len(data) > sample_rate * duration:
        data = data[: sample_rate * duration]
    else:
        data = np.pad(data, (0, max(0, sample_rate * duration - len(data))), "constant")
    input_tensor = torch.from_numpy(data)
    input_tensor = input_tensor.to(device)
    input_tensor = input_tensor.unsqueeze(0)
    with torch.no_grad():
        output, _ = model(input_tensor)
    _, predicted = torch.max(output.data, 1)
    probabilities = torch.nn.functional.softmax(output, dim=1)

    predicted_class_indices = torch.argmax(probabilities, dim=1)
    class_names = ["AI", "Human"]
    predicted_class = [class_names[i] for i in predicted_class_indices]
    res_dict = {
        "voiceType": str(predicted_class[0]),
        "confidenceScore": {
            "aiProbability": probabilities[0][0].item(),
            "humanProbability": probabilities[0][1].item(),
        },
    }
    return res_dict

In [60]:
def combination(audio_file):
    res_v1 = recog(audio_file)
    res_v2 = recog_rawnet(audio_file)
    ai_prob = (0.5 * res_v1["confidenceScore"]["aiProbability"]+ 0.5 * res_v2["confidenceScore"]["aiProbability"])
    hum_prob = (0.5 * res_v1["confidenceScore"]["humanProbability"] + 0.5 * res_v2["confidenceScore"]["humanProbability"])
    v_type = "Human" if (hum_prob>ai_prob) else "AI" 
    res_dict = {
        "voiceType": v_type,
        "confidenceScore": {
            "aiProbability": ai_prob,
            "humanProbability": hum_prob,
        },
    }
    return res_dict

In [11]:
import os
def get_audio_files(folder_path):
    audio_files = []
    # Loop through all files in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        # Check if the file is a regular file and has an audio extension
        if os.path.isfile(file_path) and file_name.lower().endswith(
            (".wav", ".mp3")
        ):
            audio_files.append(file_path)
    return audio_files

In [13]:
folder_path = "D:\Programming\VoiceRecog\Small\Human"
audio_files = get_audio_files(folder_path)
hum = 0
print(len(audio_files))
for audio_file in audio_files:
    # print(audio_file)
    # print("v1 results:")
    # print(recog(audio_file))
    # print("v2 results:")
    res_dict = recog_rawnet(audio_file)
    if (res_dict['voiceType']=='Human'):
        hum +=1 
    print(recog_rawnet(audio_file))
    
    # print("combo results:")
    # print(combination(audio_file))
    # print("\n\n\n\n")

1250
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.8053903579711914, 'humanProbability': 0.1946096122264862}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.9935314059257507, 'humanProbability': 0.006468645762652159}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.7495461106300354, 'humanProbability': 0.2504539489746094}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.9980510473251343, 'humanProbability': 0.0019489373080432415}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.8341726064682007, 'humanProbability': 0.1658274531364441}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.993787407875061, 'humanProbability': 0.0062126461416482925}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.9986191987991333, 'humanProbability': 0.0013807760551571846}}
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.7736571431159973, 'humanProbability': 0.22634291648864746}}
{'voiceType': 'AI', 'confidenceScore': {

In [56]:
audio_file = "identity-personification/synthetic-audio-train/1.2_Audio_HM.wav"
print("v1 results:")
print(recog(audio_file))
print("v2 results:")
print(recog_rawnet(audio_file))
print("combo results:")
print(combination(audio_file))

v1 results:
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 1.0, 'humanProbability': 9.466741e-13}}
v2 results:
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.9993306398391724, 'humanProbability': 0.0006693418836221099}}
combo results:
{'voiceType': 'AI', 'confidenceScore': {'aiProbability': 0.9996653199195862, 'humanProbability': 0.00033467094228439197}}
