Install libraries

In [None]:
!pip install librosa
!pip install tensorflow



Connect to the dataset (mount Google Drive)

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the
# trained model is saved to, paths under this mount point in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Preprocess and Train the Model

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
import librosa
import os
import glob


def extract_mfcc(file_path, max_pad_len=100):
    """Load an audio file and return a fixed-width MFCC matrix.

    The file is loaded at its native sample rate (``sr=None``) and 13 MFCC
    coefficients are computed. The time axis (axis 1) is then zero-padded on
    the right, or truncated, so that it is exactly ``max_pad_len`` columns wide.

    Args:
        file_path: Path of the audio file to load.
        max_pad_len: Number of columns the returned matrix must have.

    Returns:
        The padded/truncated MFCC array, or ``None`` if anything goes wrong
        while loading or processing the file (the error is printed).
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        features = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)

        missing_frames = max_pad_len - features.shape[1]
        if missing_frames <= 0:
            # Long enough already: keep only the first max_pad_len frames.
            return features[:, :max_pad_len]

        # Too short: append zero-valued frames on the right of the time axis.
        return np.pad(features, ((0, 0), (0, missing_frames)), mode='constant')
    except Exception as err:
        # Best-effort: callers filter out None results instead of aborting.
        print(f"Error processing {file_path}: {err}")
        return None

# Load data
SEED = 42
DATASET_DIR = '/content/drive/My Drive/voice_dataset'

# Seed Python, NumPy and TensorFlow so weight initialisation and batch order
# repeat from run to run.
tf.keras.utils.set_random_seed(SEED)

# glob returns files in arbitrary filesystem order; sort so the seeded
# shuffle/split below always sees the samples in the same order.
human_files = sorted(glob.glob(os.path.join(DATASET_DIR, 'human', '*.wav')))
ai_files = sorted(glob.glob(os.path.join(DATASET_DIR, 'ai', '*.wav')))

human_mfccs = [extract_mfcc(file) for file in human_files]
ai_mfccs = [extract_mfcc(file) for file in ai_files]

# Filter out None values due to errors
human_mfccs = [mfcc for mfcc in human_mfccs if mfcc is not None]
ai_mfccs = [mfcc for mfcc in ai_mfccs if mfcc is not None]

# Fail early with a clear message instead of a cryptic shape/split error when
# the dataset path is wrong or a class has too few usable recordings for a
# stratified split.
if min(len(human_mfccs), len(ai_mfccs)) < 2:
    raise ValueError(
        f"Need at least 2 usable recordings per class under {DATASET_DIR}; "
        f"found {len(human_mfccs)} human and {len(ai_mfccs)} AI."
    )

# Create labels: 0 = human, 1 = AI
human_labels = np.zeros(len(human_mfccs))
ai_labels = np.ones(len(ai_mfccs))

# Combine data
X = np.array(human_mfccs + ai_mfccs)
y = np.concatenate((human_labels, ai_labels))

# Shuffle data with a seeded generator so the order is reproducible
indices = np.random.default_rng(SEED).permutation(X.shape[0])
X = X[indices]
y = y[indices]

# Expand dimensions to fit the model input (adds a trailing channel axis)
X = np.expand_dims(X, -1)

# Split into training and testing sets.
# stratify=y keeps the human/AI ratio the same in both splits, which matters
# because the classes are imbalanced and the test set is small.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Model creation
# An explicit Input layer replaces the `input_shape=` argument on the first
# Conv2D, which Keras warns about for Sequential models. The shape is taken
# from the data so it always matches what extract_mfcc produced.
model = Sequential([
    tf.keras.Input(shape=X_train.shape[1:]),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model training
# NOTE: the test split doubles as validation data here, so the val_* metrics
# and the final test metrics are computed on the same samples.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 401ms/step - accuracy: 0.3162 - loss: 18.0352 - val_accuracy: 0.8462 - val_loss: 13.9130
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.8799 - loss: 11.4817 - val_accuracy: 0.8462 - val_loss: 12.5500
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8799 - loss: 10.1591 - val_accuracy: 0.8462 - val_loss: 8.1467
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.8799 - loss: 6.4477 - val_accuracy: 0.8462 - val_loss: 3.8946
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8799 - loss: 2.8291 - val_accuracy: 0.6923 - val_loss: 1.2221
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.4882 - loss: 1.2801 - val_accuracy: 0.7692 - val_loss: 0.9231
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━

Evaluate and Save the Model

In [None]:
from sklearn.metrics import classification_report

# The model outputs one sigmoid probability per sample; rounding turns it
# into a hard 0 (human) / 1 (AI) decision.
y_pred = model.predict(X_test)
y_pred_classes = np.round(y_pred).astype(int)

# labels=[0, 1] pins the row order to target_names and prevents a ValueError
# when the small test split happens to contain only one class.
# zero_division=0 reports 0.0 (without a warning) for a class that is never
# predicted.
print(classification_report(
    y_test.astype(int),
    y_pred_classes,
    labels=[0, 1],
    target_names=['Human', 'AI'],
    zero_division=0,
))

# Save the model
model.save('/content/drive/My Drive/ai_voice_recognition_model.h5')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step




              precision    recall  f1-score   support

       Human       0.00      0.00      0.00         2
          AI       0.83      0.91      0.87        11

    accuracy                           0.77        13
   macro avg       0.42      0.45      0.43        13
weighted avg       0.71      0.77      0.74        13



Load and Use the Model

In [None]:
def predict_voice_type(model, file_path, threshold=0.5):
    """Classify a single audio file as 'AI' or 'Human'.

    Args:
        model: Object with a Keras-style ``predict`` method that returns a
            single probability for a batch of one sample.
        file_path: Path of the audio file; features come from ``extract_mfcc``.
        threshold: Probability above which the file is labelled 'AI'.
            The default of 0.5 matches the previous rounding behaviour.

    Returns:
        'AI' if the predicted probability is greater than ``threshold``,
        'Human' otherwise, or ``None`` if feature extraction failed.
    """
    mfcc = extract_mfcc(file_path)
    if mfcc is None:
        return None

    # Add a leading batch axis and a trailing channel axis.
    batch = np.expand_dims(np.expand_dims(mfcc, axis=0), axis=-1)

    # Pull out the single probability as a plain float instead of relying on
    # the truth value of a (1, 1) array comparison; .item() raises if the
    # model unexpectedly returns more than one value.
    probability = float(np.asarray(model.predict(batch)).item())
    return 'AI' if probability > threshold else 'Human'

# Load the saved model
loaded_model = tf.keras.models.load_model('/content/drive/My Drive/ai_voice_recognition_model.h5')

# Predict on a single file.
# NOTE(review): this path is inside voice_dataset/human, which looks like the
# same folder the training data was read from, so treat the result as a
# sanity check rather than a test on unseen audio.
file_path = '/content/drive/MyDrive/voice_dataset/human/musk-original.wav'
voice_type = predict_voice_type(loaded_model, file_path)

# predict_voice_type returns None when feature extraction fails; say so
# explicitly instead of printing "The voice is: None".
if voice_type is None:
    print(f'Could not extract features from {file_path}; no prediction made.')
else:
    print(f'The voice is: {voice_type}')





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
The voice is: Human
