<a href="https://colab.research.google.com/github/Vishal8500/Parkinson-Disease-Prediction/blob/main/Speech_parkinson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import os
import numpy as np
import random
import librosa
import librosa.display
import cv2
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense



import random
from tensorflow.keras.layers import Dense, Flatten, Input
import matplotlib.pyplot as plt
# Define dataset paths
pd_path = "/content/drive/MyDrive/parkinson_voice/PD_AH"  # Parkinson's audio folder
hc_path = "/content/drive/MyDrive/parkinson_voice/HC_AH"  # Healthy audio folder


print("✅ Step 1: Libraries imported and dataset paths defined.")


✅ Step 1: Libraries imported and dataset paths defined.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import librosa
import cv2

def time_shift(audio):
    shift_max = 0.2 * len(audio)
    shift = int(np.random.uniform(-shift_max, shift_max))
    return np.roll(audio, shift)

def add_noise(audio, noise_level=0.02):
    return audio + noise_level * np.random.randn(len(audio))

def time_stretch(audio, rate=1.1):
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=0)  # Convert to mono if stereo
    return librosa.effects.time_stretch(y=audio.astype(np.float32), rate=rate)

def pitch_shift(audio, sr, n_steps=2):
    return librosa.effects.pitch_shift(y=audio.astype(np.float32), sr=sr, n_steps=n_steps)

def load_audio_to_mel_spectrogram(filepath, target_size=(224, 224)):
    y, sr = librosa.load(filepath, sr=22050)

    augmented_audios = [
        y,
        time_shift(y),
        add_noise(y),
        time_stretch(y),
        pitch_shift(y, sr)
    ]

    augmented_spectrograms = []
    for aug_audio in augmented_audios:
        mel_spec = librosa.feature.melspectrogram(y=aug_audio, sr=sr, n_mels=128, fmax=8000)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_resized = cv2.resize(mel_spec_db, target_size, interpolation=cv2.INTER_CUBIC)
        mel_spec_resized = np.stack([mel_spec_resized] * 3, axis=-1)  # Convert grayscale to RGB
        augmented_spectrograms.append(mel_spec_resized)

    return augmented_spectrograms

def process_audio_files(folder_path, class_label, target_samples=500):
    print(f"Processing folder: {folder_path}...")

    spectrograms = []
    files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]

    for file in files:
        filepath = os.path.join(folder_path, file)

        if not os.path.isfile(filepath):
            print(f"❌ Skipping invalid file: {filepath}")
            continue

        try:
            spects = load_audio_to_mel_spectrogram(filepath)
            spectrograms.extend(spects)
        except Exception as e:
            print(f"❌ Error processing {filepath}: {e}")

    # Ensure we have at least `target_samples` by augmenting if necessary
    while len(spectrograms) < target_samples:
        for file in files:
            if len(spectrograms) >= target_samples:
                break
            filepath = os.path.join(folder_path, file)
            try:
                spects = load_audio_to_mel_spectrogram(filepath)
                spectrograms.append(spects[np.random.randint(len(spects))])  # Randomly select one augmentation
            except Exception as e:
                print(f"❌ Error processing {filepath}: {e}")

    print(f"✅ {folder_path}: {len(spectrograms)} samples generated.")
    return spectrograms[:target_samples], [class_label] * target_samples


In [3]:
X_hc, y_hc = process_audio_files(hc_path, 0)
X_pd, y_pd = process_audio_files(pd_path, 1)

X = np.array(X_hc + X_pd)
y = np.array(y_hc + y_pd)

print("✅ Step 4: Data Loaded Successfully!")
print(f"Final dataset shape: {X.shape}, Labels: {y.shape}")


Processing folder: /content/drive/MyDrive/parkinson_voice/HC_AH...
✅ /content/drive/MyDrive/parkinson_voice/HC_AH: 500 samples generated.
Processing folder: /content/drive/MyDrive/parkinson_voice/PD_AH...
✅ /content/drive/MyDrive/parkinson_voice/PD_AH: 500 samples generated.
✅ Step 4: Data Loaded Successfully!
Final dataset shape: (1000, 224, 224, 3), Labels: (1000,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"✅ Step 5: Data split completed!")
print(f"Training Samples: {len(X_train)}, Testing Samples: {len(X_test)}")


✅ Step 5: Data split completed!
Training Samples: 800, Testing Samples: 200


In [7]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout
from tensorflow.keras.regularizers import l2
import tensorflow as tf

def build_resnet_model():
    print("Initializing ResNet50 Model...")
    base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=Input(shape=(224, 224, 3)))

    x = Flatten()(base_model.output)
    x = Dense(512, activation='relu', kernel_regularizer=l2(0.002))(x)
    x = Dropout(0.5)(x)  # Dropout to prevent overfitting
    output = Dense(1, activation='sigmoid')(x)  # Binary classification

    model = Model(inputs=base_model.input, outputs=output)

    # Unfreeze last few layers for fine-tuning
    for layer in base_model.layers[-40:]:
        layer.trainable = True

    print("✅ Step 6: ResNet50 Model Built Successfully with Fine-Tuning!")
    return model

# Build and compile the model
model = build_resnet_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])


Initializing ResNet50 Model...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
✅ Step 6: ResNet50 Model Built Successfully with Fine-Tuning!


In [8]:
print("🚀 Training Model...")
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))

print("✅ Step 7: Model Training Completed!")


🚀 Training Model...
Epoch 1/10


Expected: ['keras_tensor_12']
Received: inputs=Tensor(shape=(16, 224, 224, 3))


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step - accuracy: 0.6607 - loss: 2.9201

Expected: ['keras_tensor_12']
Received: inputs=Tensor(shape=(None, 224, 224, 3))


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 355ms/step - accuracy: 0.6626 - loss: 2.9146 - val_accuracy: 0.4850 - val_loss: 4.2484
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 195ms/step - accuracy: 0.9612 - loss: 2.1057 - val_accuracy: 0.5000 - val_loss: 3.4311
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 196ms/step - accuracy: 0.9794 - loss: 2.0373 - val_accuracy: 0.5300 - val_loss: 3.6280
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 197ms/step - accuracy: 0.9936 - loss: 2.0050 - val_accuracy: 0.5950 - val_loss: 3.2833
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 194ms/step - accuracy: 0.9980 - loss: 1.9817 - val_accuracy: 0.6250 - val_loss: 3.0278
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 193ms/step - accuracy: 0.9959 - loss: 1.9679 

WITH K-FOLD

In [13]:
import os
import numpy as np
import librosa
import librosa.display
import cv2
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import KFold

# Data Augmentation Functions (controlled augmentation)
def time_shift(audio):
    shift_max = 0.1 * len(audio)
    shift = int(np.random.uniform(-shift_max, shift_max))
    return np.roll(audio, shift)

def add_noise(audio, noise_level=0.015):
    return audio + noise_level * np.random.randn(len(audio))

def time_stretch(audio, rate=1.05):
    return librosa.effects.time_stretch(y=audio.astype(np.float32), rate=rate)

def pitch_shift(audio, sr, n_steps=1.5):
    return librosa.effects.pitch_shift(y=audio.astype(np.float32), sr=sr, n_steps=n_steps)

# Convert WAV to Mel Spectrogram
def load_audio_to_mel_spectrogram(filepath, target_size=(224, 224)):
    y, sr = librosa.load(filepath, sr=22050)
    augmented_audios = [
        y,  # Original
        time_shift(y),
        add_noise(y),
        time_stretch(y),
        pitch_shift(y, sr)
    ]

    augmented_spectrograms = []
    for aug_audio in augmented_audios:
        mel_spec = librosa.feature.melspectrogram(y=aug_audio, sr=sr, n_mels=128, fmax=8000)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_resized = cv2.resize(mel_spec_db, target_size, interpolation=cv2.INTER_CUBIC)
        mel_spec_resized = np.stack([mel_spec_resized] * 3, axis=-1)  # Convert grayscale to RGB
        augmented_spectrograms.append(mel_spec_resized)
    return augmented_spectrograms

# Process audio files
def process_audio_files(folder_path, class_label, max_samples=100):
    print(f"Processing folder: {folder_path}...")
    spectrograms = []
    files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]

    for file in files:
        filepath = os.path.join(folder_path, file)
        if not os.path.isfile(filepath):
            print(f"❌ Skipping invalid file: {filepath}")
            continue
        try:
            augmented_specs = load_audio_to_mel_spectrogram(filepath)
            spectrograms.extend(augmented_specs)
        except Exception as e:
            print(f"❌ Error processing {filepath}: {e}")

    # Limit number of samples per class
    if len(spectrograms) > max_samples:
        spectrograms = spectrograms[:max_samples]

    print(f"✅ {folder_path}: {len(spectrograms)} samples generated.")
    return spectrograms, [class_label] * len(spectrograms)



# Process data
X_hc, y_hc = process_audio_files(hc_path, 0, max_samples=100)
X_pd, y_pd = process_audio_files(pd_path, 1, max_samples=100)

X = np.array(X_hc + X_pd)
y = np.array(y_hc + y_pd)

# Define CNN model with L2 regularization and dropout
def build_model():
    model = Sequential([
        Conv2D(32, (3,3), activation='relu', kernel_regularizer=l2(0.05), input_shape=(224, 224, 3)),
        MaxPooling2D((2,2)),
        Dropout(0.5),

        Conv2D(64, (3,3), activation='relu', kernel_regularizer=l2(0.05)),
        MaxPooling2D((2,2)),
        Dropout(0.5),

        Flatten(),
        Dense(128, activation='relu', kernel_regularizer=l2(0.05)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Callbacks
lr_decay = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
for train_idx, val_idx in kf.split(X):
    print(f"Training Fold {fold_no}...")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = build_model()
    model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=16,
        validation_data=(X_val, y_val),
        callbacks=[lr_decay, early_stopping]
    )

    fold_no += 1


Processing folder: /content/drive/MyDrive/parkinson_voice/HC_AH...
✅ /content/drive/MyDrive/parkinson_voice/HC_AH: 100 samples generated.
Processing folder: /content/drive/MyDrive/parkinson_voice/PD_AH...
✅ /content/drive/MyDrive/parkinson_voice/PD_AH: 100 samples generated.
Training Fold 1...
Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 230ms/step - accuracy: 0.5491 - loss: 502.5205 - val_accuracy: 0.4750 - val_loss: 23.6459 - learning_rate: 0.0010
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 55ms/step - accuracy: 0.4810 - loss: 82.7507 - val_accuracy: 0.5250 - val_loss: 28.1467 - learning_rate: 0.0010
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.5636 - loss: 29.9011 - val_accuracy: 0.5250 - val_loss: 29.5819 - learning_rate: 0.0010
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.6129 - loss: 29.4988 - val_accuracy: 0.5250

CNN

In [5]:
def build_cnn_model():
    model = keras.Sequential([
        keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(224, 224, 3)),
        keras.layers.MaxPooling2D((2,2)),
        keras.layers.Conv2D(64, (3,3), activation='relu'),
        keras.layers.MaxPooling2D((2,2)),
        keras.layers.Conv2D(128, (3,3), activation='relu'),
        keras.layers.MaxPooling2D((2,2)),
        keras.layers.Conv2D(224, (3,3), activation='relu'),
        keras.layers.MaxPooling2D((2,2)),
        keras.layers.Flatten(),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    return model

cnn_model = build_cnn_model()
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history_cnn = cnn_model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 89ms/step - accuracy: 0.5411 - loss: 4.5446 - val_accuracy: 0.5850 - val_loss: 0.6714
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.6462 - loss: 0.6407 - val_accuracy: 0.7900 - val_loss: 0.4874
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.7764 - loss: 0.5023 - val_accuracy: 0.8450 - val_loss: 0.3313
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.8532 - loss: 0.3213 - val_accuracy: 0.9150 - val_loss: 0.2169
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.9226 - loss: 0.1966 - val_accuracy: 0.9000 - val_loss: 0.2288
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.9706 - loss: 0.1077 - val_accuracy: 0.8850 - val_loss: 0.2570
Epoch 7/20
[1m50/50[0m [32m━━━

In [14]:
loss_cnn, acc_cnn = cnn_model.evaluate(X_test, y_test)
print(f"CNN Test Accuracy: {acc_cnn*100:.2f}%")

loss_resnet, acc_resnet = model.evaluate(X_test, y_test)
print(f"ResNet50 Test Accuracy: {acc_resnet*100:.2f}%")


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.9863 - loss: 0.0985
CNN Test Accuracy: 97.50%
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.4602 - loss: 16.1478
ResNet50 Test Accuracy: 48.00%


# Using Resnet50


In [16]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load ResNet50 with pretrained weights (excluding top layers)
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze all layers except the last 10 for fine-tuning
for layer in base_model.layers[:-10]:
    layer.trainable = False

# Add custom classifier layers
x = base_model.output
x = GlobalAveragePooling2D()(x)  # Converts feature maps to feature vectors
x = Dense(128, activation='relu')(x)  # Fully connected layer
x = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

# Create model
model = Model(inputs=base_model.input, outputs=x)

# Compile model
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Print summary
model.summary()


Final Try using a custon CNN


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import librosa
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader, random_split
import torchaudio.transforms as T

# Define AudioCNN Model
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()

        # Determine fully connected input size dynamically
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, 40, 100)
            dummy_output = self._get_conv_output(dummy_input)
            fc_input_size = dummy_output.shape[1]

        self.fc1 = nn.Linear(fc_input_size, 128)  # Feature layer
        self.fc2 = nn.Linear(128, 2)  # Classification layer

    def _get_conv_output(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten
        return x

    def forward(self, x, return_features=False):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten
        features = self.relu(self.fc1(x))

        if return_features:
            return features  # Extract feature vector

        x = self.fc2(features)  # Final classification
        return x


In [18]:
class ParkinsonsDataset(Dataset):
    def __init__(self, folder_path, label, max_length=100):
        self.files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.wav')]
        self.label = label
        self.max_length = max_length

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        mfccs = self.preprocess_audio(file_path)
        return torch.tensor(mfccs, dtype=torch.float32), torch.tensor(self.label, dtype=torch.long)

    def preprocess_audio(self, file_path, target_sr=22050):
        y, sr = librosa.load(file_path, sr=target_sr)
        y = librosa.effects.trim(y)[0]  # Trim silence
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

        # Pad or truncate to fixed length
        if mfccs.shape[1] < self.max_length:
            pad_width = self.max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :self.max_length]

        return mfccs


In [20]:
# Define paths (update with your actual paths in Google Drive)
HC_PATH = "/content/drive/MyDrive/parkinson_voice/HC_AH"
PD_PATH = "/content/drive/MyDrive/parkinson_voice/PD_AH"

# Create dataset instances
hc_dataset = ParkinsonsDataset(HC_PATH, label=0)
pd_dataset = ParkinsonsDataset(PD_PATH, label=1)

# Combine datasets
dataset = hc_dataset + pd_dataset

# Split dataset into Train (80%) & Test (20%)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [21]:
# Train function
def train_model(model, train_loader, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.unsqueeze(1).to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_acc = 100 * correct / total
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.2f}%")

# Test function
def test_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.unsqueeze(1).to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    test_acc = 100 * correct / total
    print(f"Test Accuracy: {test_acc:.2f}%")
    return test_acc

# Train and evaluate
train_model(model, train_loader)
test_acc = test_model(model, test_loader)


Epoch 1, Loss: 31.1877, Train Accuracy: 48.44%
Epoch 2, Loss: 5.4708, Train Accuracy: 50.00%
Epoch 3, Loss: 0.7111, Train Accuracy: 62.50%
Epoch 4, Loss: 0.6774, Train Accuracy: 54.69%
Epoch 5, Loss: 0.6367, Train Accuracy: 67.19%
Epoch 6, Loss: 0.6118, Train Accuracy: 67.19%
Epoch 7, Loss: 0.5370, Train Accuracy: 73.44%
Epoch 8, Loss: 0.4710, Train Accuracy: 78.12%
Epoch 9, Loss: 0.3877, Train Accuracy: 79.69%
Epoch 10, Loss: 0.2744, Train Accuracy: 90.62%
Test Accuracy: 52.94%


In [22]:
def extract_features(model, dataloader):
    model.eval()
    feature_list = []
    label_list = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.unsqueeze(1).to(device)
            features = model(inputs, return_features=True)  # Extract features
            feature_list.append(features.cpu().numpy())
            label_list.append(labels.cpu().numpy())

    feature_array = np.vstack(feature_list)
    label_array = np.hstack(label_list)

    return feature_array, label_array

# Extract features
feature_model = model  # The trained model is already modified for feature extraction
features, labels = extract_features(feature_model, DataLoader(dataset, batch_size=16))

# Save features & labels as .npy files
feature_save_path = "/content/audio_features.npy"
label_save_path = "/content/audio_labels.npy"

np.save(feature_save_path, features)
np.save(label_save_path, labels)

print(f"Feature vectors saved at: {feature_save_path}")
print(f"Labels saved at: {label_save_path}")


Feature vectors saved at: /content/audio_features.npy
Labels saved at: /content/audio_labels.npy
