In [1]:
import numpy as np
def manipulate(data, noise_factor):
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    # Cast back to same data type
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

In [3]:
# Sample audio data
sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5])

# Sample noise factor
sample_noise_factor = 0.05

# Apply augmentation
augmented_data = manipulate(sample_data, sample_noise_factor)

# Print the original and augmented data
print("Original Data:", sample_data)
print("Augmented Data:", augmented_data)


Original Data: [0.1 0.2 0.3 0.4 0.5]
Augmented Data: [0.10100212 0.27209387 0.24770854 0.38437647 0.42163954]


In [1]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize
from tensorflow.keras.models import load_model

# Step 1: Load the Data
# Define your folder structure
data_dir = '/Users/sudachk/Downloads/cats_dogs/data/'
classes = ['cat', 'dog']

# Load and preprocess audio data
def load_and_preprocess_data(data_dir, classes, target_shape=(128, 128)):
    data = []
    labels = []
    
    for i, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        for filename in os.listdir(class_dir):
            if filename.endswith('.wav'):
                file_path = os.path.join(class_dir, filename)
                audio_data, sample_rate = librosa.load(file_path, sr=None)
                
                # Perform preprocessing (e.g., convert to Mel spectrogram and resize)
                mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
                mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
                print(mel_spectrogram)
                data.append(mel_spectrogram)
                labels.append(i)
    
    return np.array(data), np.array(labels)

# Split data into training and testing sets
data, labels = load_and_preprocess_data(data_dir, classes) 
labels = to_categorical(labels, num_classes=len(classes))  # Convert labels to one-hot encoding
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


tf.Tensor(
[[[9.2251692e+00]
  [8.7654057e+00]
  [5.3550358e+00]
  ...
  [4.9005566e+00]
  [1.1241081e+00]
  [1.8701798e-01]]

 [[3.3728547e+00]
  [2.7357864e+00]
  [3.0335884e+00]
  ...
  [4.3269920e+00]
  [4.2865321e-01]
  [1.9584280e-01]]

 [[4.3421730e-01]
  [6.6295660e-01]
  [8.1721568e-01]
  ...
  [6.2421232e-01]
  [3.7745619e-01]
  [1.7195758e-01]]

 ...

 [[2.8427827e-04]
  [7.9414138e-04]
  [7.3328952e-04]
  ...
  [1.3638537e-04]
  [5.3319200e-05]
  [2.2383840e-05]]

 [[2.2763317e-04]
  [9.9361874e-04]
  [6.1165879e-04]
  ...
  [1.4689827e-04]
  [6.4195330e-05]
  [2.8969263e-05]]

 [[2.7415724e-04]
  [3.8014079e-04]
  [5.0225365e-04]
  ...
  [1.5754586e-04]
  [4.3283115e-05]
  [3.0331554e-05]]], shape=(128, 128, 1), dtype=float32)
tf.Tensor(
[[[2.24795267e-01]
  [1.54492044e+00]
  [1.44163632e+00]
  ...
  [1.16919827e+00]
  [1.94097376e+00]
  [8.73569310e-01]]

 [[5.29488735e-02]
  [1.37460423e+00]
  [1.57330871e+00]
  ...
  [1.17651653e+00]
  [1.21822834e+00]
  [1.12519240e+0

In [2]:
# Create a neural network model
input_shape = X_train[0].shape
input_layer = Input(shape=input_shape)
x = Conv2D(32, (3, 3), activation='relu')(input_layer)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
output_layer = Dense(len(classes), activation='softmax')(x)
model = Model(input_layer, output_layer)

In [8]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 128, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 126, 126, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 63, 63, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 30, 30, 64)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 57600)             0     

In [4]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x16e5e5b10>

In [5]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.9464285969734192


In [6]:
# Save the model
model.save('audio_classification_model.h5')

  saving_api.save_model(


In [7]:
# Load the saved model
model = load_model('audio_classification_model.h5')

# Define the target shape for input spectrograms
target_shape = (128, 128)

# Define your class labels
classes = ['cat', 'dog']

# Function to preprocess and classify an audio file
def test_audio(file_path, model):
    # Load and preprocess the audio file
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
    mel_spectrogram = tf.reshape(mel_spectrogram, (1,) + target_shape + (1,))
    
    # Make predictions
    predictions = model.predict(mel_spectrogram)
    
    # Get the class probabilities
    class_probabilities = predictions[0]
    
    # Get the predicted class index
    predicted_class_index = np.argmax(class_probabilities)
    
    return class_probabilities, predicted_class_index

# Test an audio file
test_audio_file = '/Users/sudachk/Downloads/cat-meow-14536.mp3'
class_probabilities, predicted_class_index = test_audio(test_audio_file, model)

# Display results for all classes
for i, class_label in enumerate(classes):
    probability = class_probabilities[i]
    print(f'Class: {class_label}, Probability: {probability:.4f}')

# Calculate and display the predicted class and accuracy
predicted_class = classes[predicted_class_index]
accuracy = class_probabilities[predicted_class_index]
print(f'The audio is classified as: {predicted_class}')
print(f'Accuracy: {accuracy:.4f}')



Class: cat, Probability: 0.5721
Class: dog, Probability: 0.4279
The audio is classified as: cat
Accuracy: 0.5721


In [None]:
augmented data:

In [6]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize
from tensorflow.keras.models import load_model

# Step 1: Load the Data
# Define your folder structure
data_dir = '/Users/sudachk/Downloads/cats_dogs/data/'
classes = ['cat', 'dog']

# Load and preprocess audio data
def load_and_preprocess_data(data_dir, classes, target_shape=(128, 128)):
    data = []
    labels = []
    noise_factor = 0.05
    for i, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        for filename in os.listdir(class_dir):
            if filename.endswith('.wav'):
                file_path = os.path.join(class_dir, filename)
                audio_data, sample_rate = librosa.load(file_path, sr=None)
                
                # Apply noise manipulation
                noise = np.random.randn(len(audio_data))
                augmented_data = audio_data + noise_factor * noise
                augmented_data = augmented_data.astype(type(audio_data[0]))

                # Perform preprocessing (e.g., convert to Mel spectrogram and resize)
                mel_spectrogram = librosa.feature.melspectrogram(y=augmented_data, sr=sample_rate)
                mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
                print(mel_spectrogram)
                
                data.append(mel_spectrogram)
                labels.append(i)
    return np.array(data), np.array(labels)

# Split data into training and testing sets
data, labels = load_and_preprocess_data(data_dir, classes)
labels = to_categorical(labels, num_classes=len(classes))  # Convert labels to one-hot encoding
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


tf.Tensor(
[[[10.234426  ]
  [ 8.661715  ]
  [ 5.0362244 ]
  ...
  [ 5.9615183 ]
  [ 1.1399472 ]
  [ 0.20922863]]

 [[ 3.8573766 ]
  [ 2.103331  ]
  [ 3.4637294 ]
  ...
  [ 5.965196  ]
  [ 0.4591289 ]
  [ 0.6501214 ]]

 [[ 0.6581111 ]
  [ 0.665575  ]
  [ 0.70397055]
  ...
  [ 0.86702555]
  [ 0.60585946]
  [ 0.43330064]]

 ...

 [[ 0.14381087]
  [ 0.2268291 ]
  [ 0.25566202]
  ...
  [ 0.19384345]
  [ 0.1763987 ]
  [ 0.1850747 ]]

 [[ 0.11831849]
  [ 0.22455887]
  [ 0.19624564]
  ...
  [ 0.19850992]
  [ 0.24924815]
  [ 0.197321  ]]

 [[ 0.13376595]
  [ 0.28992268]
  [ 0.27195916]
  ...
  [ 0.23992342]
  [ 0.259494  ]
  [ 0.1506659 ]]], shape=(128, 128, 1), dtype=float32)
tf.Tensor(
[[[0.69717693]
  [1.7216558 ]
  [1.1221879 ]
  ...
  [1.221904  ]
  [2.205539  ]
  [0.9052601 ]]

 [[0.4157585 ]
  [1.326791  ]
  [1.0239575 ]
  ...
  [0.87474847]
  [1.1598845 ]
  [0.8450414 ]]

 [[0.29315484]
  [1.6552947 ]
  [1.3592356 ]
  ...
  [1.2477746 ]
  [2.0224402 ]
  [1.6642436 ]]

 ...

 [[0.287589

In [7]:
# Create a neural network model
input_shape = X_train[0].shape
input_layer = Input(shape=input_shape)
x = Conv2D(32, (3, 3), activation='relu')(input_layer)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
output_layer = Dense(len(classes), activation='softmax')(x)
model = Model(input_layer, output_layer)

In [8]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 128, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 126, 126, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 63, 63, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 30, 30, 64)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 57600)             0     

In [9]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x284f419d0>

In [10]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.9642857313156128


In [11]:
# Load the saved model
model = load_model('audio_classification_model.h5')

# Define the target shape for input spectrograms
target_shape = (128, 128)

# Define your class labels
classes = ['cat', 'dog']

# Function to preprocess and classify an audio file
def test_audio(file_path, model):
    # Load and preprocess the audio file
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
    mel_spectrogram = tf.reshape(mel_spectrogram, (1,) + target_shape + (1,))
    
    # Make predictions
    predictions = model.predict(mel_spectrogram)
    
    # Get the class probabilities
    class_probabilities = predictions[0]
    
    # Get the predicted class index
    predicted_class_index = np.argmax(class_probabilities)
    
    return class_probabilities, predicted_class_index

# Test an audio file
test_audio_file = '/Users/sudachk/Downloads/cat-meow-14536.mp3'
class_probabilities, predicted_class_index = test_audio(test_audio_file, model)

# Display results for all classes
for i, class_label in enumerate(classes):
    probability = class_probabilities[i]
    print(f'Class: {class_label}, Probability: {probability:.4f}')

# Calculate and display the predicted class and accuracy
predicted_class = classes[predicted_class_index]
accuracy = class_probabilities[predicted_class_index]
print(f'The audio is classified as: {predicted_class}')
print(f'Accuracy: {accuracy:.4f}')







Class: cat, Probability: 0.5721
Class: dog, Probability: 0.4279
The audio is classified as: cat
Accuracy: 0.5721
